first commit

commit cc76bab27e

@@ -0,0 +1,240 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/

# Tokenizer cache for tests
.tokenizer_cache/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# MacOS
.DS_Store

# Vim
*.swp

# Documentation
docs/_build

# SGL
benchmark/mmlu/data
benchmark/mmlu/data.tar
benchmark/llava_bench/images
benchmark/llava_bench/mme_pack
*.jsonl
tmp*.txt

# Plots
*.png
*.pdf

# personnal
work_dirs/
*.csv

!logo.png

# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

compile_commands.json

*.iml

# VSCode
.vscode

1

# Autoenv
.env.leave

# Rust lib
Cargo.lock

lmms-eval
@@ -0,0 +1,8 @@
compile_commands.json
.idea
.DS_Store
*.pyc
build/
.cache/
.vscode/
*/cmake-build-*/
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 DeepSeek

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,344 @@
# DeepEP

DeepEP is a communication library tailored for Mixture-of-Experts (MoE) and expert parallelism (EP). It provides high-throughput and low-latency all-to-all GPU kernels, which are also known as MoE dispatch and combine. The library also supports low-precision operations, including FP8.

To align with the group-limited gating algorithm proposed in the [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3) paper, DeepEP offers a set of kernels optimized for asymmetric-domain bandwidth forwarding, such as forwarding data from the NVLink domain to the RDMA domain. These kernels deliver high throughput, making them suitable for both training and inference prefilling tasks. Additionally, they support SM (Streaming Multiprocessors) number control.

For latency-sensitive inference decoding, DeepEP includes a set of low-latency kernels with pure RDMA to minimize delays. The library also introduces a hook-based communication-computation overlapping method that does not occupy any SM resource.

Notice: the implementation in this library may have some slight differences from the [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3) paper.

## Performance

### Normal kernels with NVLink and RDMA forwarding

We test normal kernels on H800 (~160 GB/s NVLink maximum bandwidth), with each GPU connected to a CX7 InfiniBand 400 Gb/s RDMA network card (~50 GB/s maximum bandwidth). We follow the DeepSeek-V3/R1 pretraining setting (4096 tokens per batch, 7168 hidden, top-4 groups, top-8 experts, FP8 dispatching and BF16 combining).

| Type      | Dispatch #EP | Bottleneck bandwidth | Combine #EP | Bottleneck bandwidth |
|:---------:|:------------:|:--------------------:|:-----------:|:--------------------:|
| Intranode | 8            | 153 GB/s (NVLink)    | 8           | 158 GB/s (NVLink)    |
| Internode | 16           | 43 GB/s (RDMA)       | 16          | 43 GB/s (RDMA)       |
| Internode | 32           | 58 GB/s (RDMA)       | 32          | 57 GB/s (RDMA)       |
| Internode | 64           | 51 GB/s (RDMA)       | 64          | 50 GB/s (RDMA)       |

**News (2025.04.22)**: with optimizations from Tencent Network Platform Department, performance was enhanced by up to 30%; see [#130](https://github.com/deepseek-ai/DeepEP/pull/130) for more details. Thanks for the contribution!

### Low-latency kernels with pure RDMA

We test low-latency kernels on H800, with each GPU connected to a CX7 InfiniBand 400 Gb/s RDMA network card (~50 GB/s maximum bandwidth). We follow a typical DeepSeek-V3/R1 production setting (128 tokens per batch, 7168 hidden, top-8 experts, FP8 dispatching and BF16 combining).

| Dispatch #EP | Latency | RDMA bandwidth | Combine #EP | Latency | RDMA bandwidth |
|:------------:|:-------:|:--------------:|:-----------:|:-------:|:--------------:|
| 8            | 77 us   | 98 GB/s        | 8           | 114 us  | 127 GB/s       |
| 16           | 118 us  | 63 GB/s        | 16          | 195 us  | 74 GB/s        |
| 32           | 155 us  | 48 GB/s        | 32          | 273 us  | 53 GB/s        |
| 64           | 173 us  | 43 GB/s        | 64          | 314 us  | 46 GB/s        |
| 128          | 192 us  | 39 GB/s        | 128         | 369 us  | 39 GB/s        |
| 256          | 194 us  | 39 GB/s        | 256         | 360 us  | 40 GB/s        |

**News (2025.06.05)**: low-latency kernels now leverage NVLink as much as possible; see [#173](https://github.com/deepseek-ai/DeepEP/pull/173) for more details. Thanks for the contribution!

## Quick start

### Requirements

- Ampere (SM80), Hopper (SM90) GPUs, or other architectures with SM90 PTX ISA support
- Python 3.8 and above
- CUDA version
  - CUDA 11.0 and above for SM80 GPUs
  - CUDA 12.3 and above for SM90 GPUs
- PyTorch 2.1 and above
- NVLink for intranode communication
- RDMA network for internode communication

### Download and install NVSHMEM dependency

DeepEP also depends on our modified NVSHMEM. Please refer to our [NVSHMEM Installation Guide](third-party/README.md) for instructions.

### Development

```bash
# Build and make symbolic links for SO files
NVSHMEM_DIR=/path/to/installed/nvshmem python setup.py build
# You may modify the specific SO names according to your own platform
ln -s build/lib.linux-x86_64-cpython-38/deep_ep_cpp.cpython-38-x86_64-linux-gnu.so

# Run test cases
# NOTES: you may modify the `init_dist` function in `tests/utils.py`
# according to your own cluster settings, and launch into multiple nodes
python tests/test_intranode.py
python tests/test_internode.py
python tests/test_low_latency.py
```

### Installation

```bash
NVSHMEM_DIR=/path/to/installed/nvshmem python setup.py install
```

#### Installation environment variables

- `NVSHMEM_DIR`: the path to the NVSHMEM directory; all internode and low-latency features are disabled if it is not specified
- `DISABLE_SM90_FEATURES`: 0 or 1, whether to disable SM90 features; required for non-SM90 devices or CUDA 11
- `TORCH_CUDA_ARCH_LIST`: the list of target architectures, e.g. `TORCH_CUDA_ARCH_LIST="9.0"`
- `DISABLE_AGGRESSIVE_PTX_INSTRS`: 0 or 1, whether to disable aggressive load/store instructions; see [Undefined-behavior PTX usage](#undefined-behavior-ptx-usage) for more details

Then, import `deep_ep` in your Python project, and enjoy!
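
As a quick sanity check after installation, the minimal sketch below only verifies that the extension loads; `Buffer` and `Buffer.set_num_sms` are used exactly as in the full examples later in this document.

```python
# Minimal import check (a sketch; see the complete examples below for real usage)
import deep_ep

print(deep_ep.Buffer)           # the main communication-buffer class
deep_ep.Buffer.set_num_sms(24)  # optionally cap the SMs used by the normal kernels
```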

## Network configurations

DeepEP is fully tested with InfiniBand networks. However, it is theoretically compatible with RDMA over Converged Ethernet (RoCE) as well.

### Traffic isolation

Traffic isolation is supported by InfiniBand through Virtual Lanes (VL).

To prevent interference between different types of traffic, we recommend segregating workloads across different virtual lanes as follows:

- workloads using normal kernels
- workloads using low-latency kernels
- other workloads

For DeepEP, you can control the virtual lane assignment by setting the `NVSHMEM_IB_SL` environment variable.
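
For example, a minimal sketch (assuming NVSHMEM picks the variable up from the environment when the DeepEP buffer is initialized; the service-level value `1` below is only illustrative and must match a service level configured on your fabric):

```python
import os

# Assign DeepEP's RDMA traffic to a dedicated InfiniBand virtual lane.
# NOTES: set this before the DeepEP buffer (and thus NVSHMEM) is initialized.
os.environ["NVSHMEM_IB_SL"] = "1"
```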

### Adaptive routing

Adaptive routing is an advanced routing feature provided by InfiniBand switches that can evenly distribute traffic across multiple paths. Enabling adaptive routing can completely eliminate network congestion caused by routing conflicts, but it also introduces additional latency. We recommend the following configuration for optimal performance:

- enable adaptive routing in environments with heavy network loads
- use static routing in environments with light network loads

### Congestion control

Congestion control is disabled, as we have not observed significant congestion in our production environment.

## Interfaces and examples

### Example use in model training or inference prefilling

The normal kernels can be used in model training or the inference prefilling phase (without the backward part), as the example code below shows.

```python
import torch
import torch.distributed as dist
from typing import List, Tuple, Optional, Union

from deep_ep import Buffer, EventOverlap

# Communication buffer (will allocate at runtime)
_buffer: Optional[Buffer] = None

# Set the number of SMs to use
# NOTES: this is a static variable
Buffer.set_num_sms(24)


# You may call this function at the framework initialization
def get_buffer(group: dist.ProcessGroup, hidden_bytes: int) -> Buffer:
    global _buffer

    # NOTES: you may also replace `get_*_config` with your auto-tuned results via all the tests
    num_nvl_bytes, num_rdma_bytes = 0, 0
    for config in (Buffer.get_dispatch_config(group.size()), Buffer.get_combine_config(group.size())):
        num_nvl_bytes = max(config.get_nvl_buffer_size_hint(hidden_bytes, group.size()), num_nvl_bytes)
        num_rdma_bytes = max(config.get_rdma_buffer_size_hint(hidden_bytes, group.size()), num_rdma_bytes)

    # Allocate a buffer if none exists yet or the existing one is too small
    if _buffer is None or _buffer.group != group or _buffer.num_nvl_bytes < num_nvl_bytes or _buffer.num_rdma_bytes < num_rdma_bytes:
        _buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes)
    return _buffer


def get_hidden_bytes(x: torch.Tensor) -> int:
    t = x[0] if isinstance(x, tuple) else x
    return t.size(1) * max(t.element_size(), 2)


def dispatch_forward(x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
                     topk_idx: torch.Tensor, topk_weights: torch.Tensor,
                     num_experts: int, previous_event: Optional[EventOverlap] = None) -> \
        Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], torch.Tensor, torch.Tensor, List, Tuple, EventOverlap]:
    # NOTES: an optional `previous_event` is a captured CUDA event that you want to make a dependency
    # of the dispatch kernel; it may be useful for communication-computation overlap. For more information, please
    # refer to the docs of `Buffer.dispatch`
    global _buffer

    # Calculate layout before actual dispatch
    num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, previous_event = \
        _buffer.get_dispatch_layout(topk_idx, num_experts,
                                    previous_event=previous_event, async_finish=True,
                                    allocate_on_comm_stream=previous_event is not None)
    # Do MoE dispatch
    # NOTES: the CPU will wait for the GPU's signal to arrive, so this is not compatible with CUDA graph,
    # unless you specify `num_worst_tokens`, but this flag is for intranode only
    # For more advanced usages, please refer to the docs of the `dispatch` function
    recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = \
        _buffer.dispatch(x, topk_idx=topk_idx, topk_weights=topk_weights,
                         num_tokens_per_rank=num_tokens_per_rank, num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
                         is_token_in_rank=is_token_in_rank, num_tokens_per_expert=num_tokens_per_expert,
                         previous_event=previous_event, async_finish=True,
                         allocate_on_comm_stream=True)
    # For event management, please refer to the docs of the `EventOverlap` class
    return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event


def dispatch_backward(grad_recv_x: torch.Tensor, grad_recv_topk_weights: torch.Tensor, handle: Tuple) -> \
        Tuple[torch.Tensor, torch.Tensor, EventOverlap]:
    global _buffer

    # The backward process of MoE dispatch is actually a combine
    # For more advanced usages, please refer to the docs of the `combine` function
    combined_grad_x, combined_grad_recv_topk_weights, event = \
        _buffer.combine(grad_recv_x, handle, topk_weights=grad_recv_topk_weights, async_finish=True)

    # For event management, please refer to the docs of the `EventOverlap` class
    return combined_grad_x, combined_grad_recv_topk_weights, event


def combine_forward(x: torch.Tensor, handle: Tuple, previous_event: Optional[EventOverlap] = None) -> \
        Tuple[torch.Tensor, EventOverlap]:
    global _buffer

    # Do MoE combine
    # For more advanced usages, please refer to the docs of the `combine` function
    combined_x, _, event = _buffer.combine(x, handle, async_finish=True, previous_event=previous_event,
                                           allocate_on_comm_stream=previous_event is not None)

    # For event management, please refer to the docs of the `EventOverlap` class
    return combined_x, event


def combine_backward(grad_combined_x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
                     handle: Tuple, previous_event: Optional[EventOverlap] = None) -> \
        Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], EventOverlap]:
    global _buffer

    # The backward process of MoE combine is actually a dispatch
    # For more advanced usages, please refer to the docs of the `dispatch` function
    grad_x, _, _, _, _, event = _buffer.dispatch(grad_combined_x, handle=handle, async_finish=True,
                                                 previous_event=previous_event,
                                                 allocate_on_comm_stream=previous_event is not None)

    # For event management, please refer to the docs of the `EventOverlap` class
    return grad_x, event
```

Moreover, inside the dispatch function, we may not know how many tokens the current rank will receive, so an implicit CPU wait for the GPU's received-count signal is involved, as the following figure shows.

![normal](figures/normal.png)

### Example use in inference decoding

The low-latency kernels can be used in the inference decoding phase, as the example code below shows.

```python
import torch
import torch.distributed as dist
from typing import Tuple, Optional

from deep_ep import Buffer

# Communication buffer (will allocate at runtime)
# NOTES: there is no SM control API for the low-latency kernels
_buffer: Optional[Buffer] = None


# You may call this function at the framework initialization
def get_buffer(group: dist.ProcessGroup, num_max_dispatch_tokens_per_rank: int, hidden: int, num_experts: int) -> Buffer:
    # NOTES: the low-latency mode will consume much more space than the normal mode
    # So we recommend that `num_max_dispatch_tokens_per_rank` (the actual batch size in the decoding engine) should be less than 256
    global _buffer
    num_rdma_bytes = Buffer.get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank, hidden, group.size(), num_experts)

    # Allocate a buffer if none exists yet or the existing one is too small
    if _buffer is None or _buffer.group != group or not _buffer.low_latency_mode or _buffer.num_rdma_bytes < num_rdma_bytes:
        # NOTES: for the best performance, the QP number **must** be equal to the number of the local experts
        assert num_experts % group.size() == 0
        _buffer = Buffer(group, 0, num_rdma_bytes, low_latency_mode=True, num_qps_per_rank=num_experts // group.size())
    return _buffer


def low_latency_dispatch(hidden_states: torch.Tensor, topk_idx: torch.Tensor, num_max_dispatch_tokens_per_rank: int, num_experts: int):
    global _buffer

    # Do MoE dispatch, compatible with CUDA graph (but you may restore some buffer status once you replay)
    recv_hidden_states, recv_expert_count, handle, event, hook = \
        _buffer.low_latency_dispatch(hidden_states, topk_idx, num_max_dispatch_tokens_per_rank, num_experts,
                                     async_finish=False, return_recv_hook=True)

    # NOTES: the actual tensors will not be received until you call `hook()`;
    # this is useful for double-batch overlapping, but **without any SM occupation**
    # If you don't want to overlap, please set `return_recv_hook=False`
    # Later, you can use our GEMM library to do the computation with this specific format
    return recv_hidden_states, recv_expert_count, handle, event, hook


def low_latency_combine(hidden_states: torch.Tensor,
                        topk_idx: torch.Tensor, topk_weights: torch.Tensor, handle: Tuple):
    global _buffer

    # Do MoE combine, compatible with CUDA graph (but you may restore some buffer status once you replay)
    combined_hidden_states, event_overlap, hook = \
        _buffer.low_latency_combine(hidden_states, topk_idx, topk_weights, handle,
                                    async_finish=False, return_recv_hook=True)

    # NOTES: the same behavior as described in the dispatch kernel
    return combined_hidden_states, event_overlap, hook
```

For two-micro-batch overlapping, you can refer to the following figure. With our receiving hook interface, the RDMA network traffic happens in the background, without costing any GPU SMs from the computation part. Note that the overlapped parts can be adjusted, i.e., the four stages of attention/dispatch/MoE/combine may not have exactly the same execution time; you may adjust the stage settings according to your workload.

![low-latency](figures/low-latency.png)

## Roadmap

- [x] AR support
- [x] Refactor low-latency mode AR code
- [x] A100 support (intranode only)
- [x] Support BF16 for the low-latency dispatch kernel
- [x] Support NVLink protocol for intranode low-latency kernels
- [ ] TMA copy instead of LD/ST
  - [x] Intranode kernels
  - [ ] Internode kernels
  - [ ] Low-latency kernels
- [ ] SM-free kernels and refactors
- [ ] Fully remove undefined-behavior PTX instructions

## Notices

#### Easier potential overall design

The current DeepEP implementation uses queues for communication buffers, which save memory but introduce complexity and potential deadlocks. If you're implementing your own version based on DeepEP, consider using fixed-size buffers allocated to maximum capacity for simplicity and better performance. For a detailed discussion of this alternative approach, see https://github.com/deepseek-ai/DeepEP/issues/39.

#### Undefined-behavior PTX usage

- For extreme performance, we discovered and use an undefined-behavior PTX usage: the read-only PTX instruction `ld.global.nc.L1::no_allocate.L2::256B` to **read volatile data**. The PTX modifier `.nc` indicates that a non-coherent cache is used. Our testing shows that correctness is preserved with `.L1::no_allocate` on Hopper architectures, and performance is much better. Our guess at the reason: the non-coherent cache is unified with L1, and the L1 modifier is not just a hint but a strong option, so correctness is guaranteed because no dirty data stays in L1.
- Initially, because NVCC could not automatically unroll volatile read PTX, we tried using `__ldg` (i.e., `ld.nc`). Even compared to manually unrolled volatile reads, it was significantly faster (likely due to additional compiler optimizations). However, the results could be incorrect or dirty. After consulting the PTX documentation, we discovered that L1 and the non-coherent cache are unified on Hopper architectures. We speculated that `.L1::no_allocate` might resolve the issue, leading to this discovery.
- If you find the kernels not working on some other platforms, you may add `DISABLE_AGGRESSIVE_PTX_INSTRS=1` to `setup.py` to disable this, or file an issue.

#### Auto-tuning on your cluster

For better performance on your cluster, we recommend running all the tests and using the best auto-tuned configuration. The default configurations are optimized for DeepSeek's internal cluster.
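
As a sketch of how a tuned configuration could be plugged back in (assumptions on my part: that the C++ `Config` shown later in this commit is exposed to Python and that `dispatch` accepts a `config` argument, as the `get_*_config` note in the training example hints; the numbers below are placeholders, not tuned values):

```python
from deep_ep import Buffer, Config  # assumed export of the C++ `Config` binding

# Placeholder values: (num_sms, NVL chunked send/recv tokens, RDMA chunked send/recv tokens)
tuned_dispatch_config = Config(24, 8, 256, 8, 128)

recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = \
    _buffer.dispatch(x, topk_idx=topk_idx, topk_weights=topk_weights,
                     num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
                     num_tokens_per_expert=num_tokens_per_expert,
                     config=tuned_dispatch_config)  # assumed keyword; fall back to the default if unsupported
```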

## License

This code repository is released under [the MIT License](LICENSE), except for code that references NVSHMEM (including `csrc/kernels/ibgda_device.cuh` and `third-party/nvshmem.patch`), which is subject to the [NVSHMEM SLA](https://docs.nvidia.com/nvshmem/api/sla.html).

## Community Forks

- [Infrawaves/DeepEP_ibrc_dual-ports_multiQP](https://github.com/Infrawaves/DeepEP_ibrc_dual-ports_multiQP) - Adds a multi-QP solution and dual-port NIC support in IBRC transport

## Citation

If you use this codebase or otherwise find our work valuable, please cite:

```bibtex
@misc{deepep2025,
      title={DeepEP: an efficient expert-parallel communication library},
      author={Chenggang Zhao and Shangyan Zhou and Liyue Zhang and Chengqi Deng and Zhean Xu and Yuxuan Liu and Kuai Yu and Jiashi Li and Liang Zhao},
      year={2025},
      publisher = {GitHub},
      howpublished = {\url{https://github.com/deepseek-ai/DeepEP}},
}
```
@@ -0,0 +1,36 @@
# NOTES: this CMake is only for debugging; for setup, please use Torch extension
cmake_minimum_required(VERSION 3.10)
project(deep_ep LANGUAGES CUDA CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC")
set(CUDA_SEPARABLE_COMPILATION ON)
list(APPEND CUDA_NVCC_FLAGS "-DENABLE_FAST_DEBUG")
list(APPEND CUDA_NVCC_FLAGS "-O3")
list(APPEND CUDA_NVCC_FLAGS "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage")

set(USE_SYSTEM_NVTX on)
set(CUDA_ARCH_LIST "9.0" CACHE STRING "List of CUDA architectures to compile")
set(TORCH_CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")

find_package(CUDAToolkit REQUIRED)
find_package(pybind11 REQUIRED)
find_package(Torch REQUIRED)
find_package(NVSHMEM REQUIRED HINTS ${NVSHMEM_ROOT_DIR}/lib/cmake/nvshmem)

add_library(nvshmem ALIAS nvshmem::nvshmem)
add_library(nvshmem_host ALIAS nvshmem::nvshmem_host)
add_library(nvshmem_device ALIAS nvshmem::nvshmem_device)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)

include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS} ${NVSHMEM_INCLUDE_DIR})
link_directories(${TORCH_INSTALL_PREFIX}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib ${NVSHMEM_LIB_DIR})

add_subdirectory(kernels)

# Link CPP and CUDA together
pybind11_add_module(deep_ep_cpp deep_ep.cpp)
target_link_libraries(deep_ep_cpp PRIVATE ${EP_CUDA_LIBRARIES} ${TORCH_LIBRARIES} torch_python)
@@ -0,0 +1,188 @@
#pragma once

#include "kernels/api.cuh"
#include "kernels/exception.cuh"

namespace deep_ep {

template <typename dtype_t>
dtype_t ceil_div(dtype_t a, dtype_t b) {
    return (a + b - 1) / b;
}

template <typename dtype_t>
dtype_t align(dtype_t a, dtype_t b) {
    return ceil_div<dtype_t>(a, b) * b;
}

struct Config {
    int num_sms;
    int num_max_nvl_chunked_send_tokens;
    int num_max_nvl_chunked_recv_tokens;
    int num_max_rdma_chunked_send_tokens;
    int num_max_rdma_chunked_recv_tokens;

    Config(int num_sms,
           int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
           int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens) :
            num_sms(num_sms),
            num_max_nvl_chunked_send_tokens(num_max_nvl_chunked_send_tokens),
            num_max_nvl_chunked_recv_tokens(num_max_nvl_chunked_recv_tokens),
            num_max_rdma_chunked_send_tokens(num_max_rdma_chunked_send_tokens),
            num_max_rdma_chunked_recv_tokens(num_max_rdma_chunked_recv_tokens) {
        EP_HOST_ASSERT(num_sms >= 0);
        EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens > 0 and num_max_nvl_chunked_recv_tokens > 0);
        EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens < num_max_nvl_chunked_recv_tokens);
        EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens > 0 and num_max_rdma_chunked_recv_tokens > 0);

        // Ceil up RDMA buffer size
        this->num_max_rdma_chunked_recv_tokens = align<int>(num_max_rdma_chunked_recv_tokens, num_max_rdma_chunked_send_tokens);
        EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens < num_max_rdma_chunked_recv_tokens);
        // NOTES: this assertion is related to RDMA lazy head update, we must ensure senders always have space to push
        EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens <= num_max_rdma_chunked_recv_tokens / 2);
    }

    size_t get_nvl_buffer_size_hint(size_t hidden_bytes, int num_ranks) const {
        // Below are some assumptions
        // TODO: add assertions
        constexpr int kNumMaxTopK = 128;
        constexpr int kNumMaxScales = 128;
        EP_HOST_ASSERT(num_ranks < NUM_MAX_NVL_PEERS or num_ranks % NUM_MAX_NVL_PEERS == 0);
        EP_HOST_ASSERT(num_ranks <= NUM_MAX_NVL_PEERS or num_sms % 2 == 0);
        const auto num_rdma_ranks = std::max(num_ranks / NUM_MAX_NVL_PEERS, 1);
        const auto num_nvl_ranks = std::min(num_ranks, NUM_MAX_NVL_PEERS);
        const int num_channels = num_sms / 2;

        size_t num_bytes = 0;
        num_bytes += num_channels * num_nvl_ranks * (2 * num_rdma_ranks + 3) * sizeof(int);
        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * hidden_bytes;
#ifndef DISABLE_NVSHMEM
        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * internode::get_source_meta_bytes();
#endif
        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK * sizeof(int64_t);
        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK * sizeof(float);
        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxScales * sizeof(float);
        num_bytes = ((num_bytes + 127) / 128) * 128;
        return num_bytes;
    }

    size_t get_rdma_buffer_size_hint(int64_t hidden_bytes, int num_ranks) const {
#ifndef DISABLE_NVSHMEM
        // Legacy mode
        if (num_ranks <= NUM_MAX_NVL_PEERS)
            return 0;

        // Below are some assumptions
        // TODO: add assertions
        constexpr int kNumMaxTopK = 128;
        constexpr int kNumMaxScales = 128;
        EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
        EP_HOST_ASSERT(num_sms % 2 == 0);
        const int num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS;
        const int num_channels = num_sms / 2;

        size_t num_bytes = 0;
        num_bytes += num_channels * num_rdma_ranks * (NUM_MAX_NVL_PEERS * 2 + 2) * 2 * sizeof(int);
        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * hidden_bytes * 2;
        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * internode::get_source_meta_bytes() * 2;
        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * kNumMaxTopK * sizeof(int64_t) * 2;
        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * kNumMaxTopK * sizeof(float) * 2;
        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * kNumMaxScales * sizeof(float) * 2;
        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * sizeof(int4) * 2;
        num_bytes = ((num_bytes + 127) / 128) * 128;
        return num_bytes;
#else
        EP_HOST_ASSERT(false and "NVSHMEM is disable during compilation");
#endif
    }
};

struct LowLatencyBuffer {
    int num_clean_int = 0;

    void* dispatch_rdma_send_buffer = nullptr;
    void* dispatch_rdma_recv_data_buffer = nullptr;
    int* dispatch_rdma_recv_count_buffer = nullptr;

    void* combine_rdma_send_buffer = nullptr;
    void* combine_rdma_recv_data_buffer = nullptr;
    int* combine_rdma_recv_flag_buffer = nullptr;

    void* combine_rdma_send_buffer_data_start = nullptr;
    size_t num_bytes_per_combine_msg = 0;

    std::pair<int*, int> clean_meta() {
        EP_HOST_ASSERT(dispatch_rdma_recv_count_buffer == combine_rdma_recv_flag_buffer);
        return {dispatch_rdma_recv_count_buffer, num_clean_int};
    }
};

struct LowLatencyLayout {
    size_t total_bytes = 0;
    LowLatencyBuffer buffers[2];

    template <typename out_ptr_t = void*, typename count_ptr_t = uint8_t*, typename in_ptr_t = void*>
    out_ptr_t advance(const in_ptr_t& ptr, size_t count) {
        return reinterpret_cast<out_ptr_t>(reinterpret_cast<count_ptr_t>(ptr) + count);
    }

    LowLatencyLayout(void* rdma_buffer, int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
        const int num_scales = hidden / 128;

        // Dispatch and combine layout:
        // - 2 symmetric odd/even send buffers
        // - 2 symmetric odd/even receive buffers
        // - 2 symmetric odd/even signaling buffers

        // Message sizes
        // NOTES: you should add a control `int4` for combine messages if you want to do data transformation
        EP_HOST_ASSERT(num_scales * sizeof(float) <= hidden);
        size_t num_bytes_per_dispatch_msg = sizeof(int4) + std::max(hidden * sizeof(nv_bfloat16), hidden + num_scales * sizeof(float));
        size_t num_bytes_per_combine_msg = hidden * sizeof(nv_bfloat16);

        // Send buffer
        size_t dispatch_send_buffer_bytes = num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
        size_t combine_send_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
        size_t send_buffer_bytes = std::max(dispatch_send_buffer_bytes, combine_send_buffer_bytes);
        EP_HOST_ASSERT(send_buffer_bytes % sizeof(int4) == 0);
        total_bytes += send_buffer_bytes * 2;

        // Symmetric receive buffers
        // TODO: optimize memory usages
        size_t dispatch_recv_data_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
        size_t combine_recv_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
        size_t recv_buffer_bytes = std::max(dispatch_recv_data_buffer_bytes, combine_recv_buffer_bytes);
        EP_HOST_ASSERT(recv_buffer_bytes % sizeof(int4) == 0);
        total_bytes += recv_buffer_bytes * 2;

        // Symmetric signaling buffers
        size_t dispatch_recv_count_buffer_bytes = num_experts * sizeof(int);
        size_t combine_recv_flag_buffer_bytes = dispatch_recv_count_buffer_bytes;
        size_t signaling_buffer_bytes = std::max(dispatch_recv_count_buffer_bytes, combine_recv_flag_buffer_bytes);
        total_bytes += signaling_buffer_bytes * 2;

        // Assign pointers
        // NOTES: we still leave some space for distinguishing dispatch/combine buffer,
        // so you may see some parameters are duplicated
        for (int i = 0; i < 2; ++ i) {
            buffers[i] = {
                static_cast<int>(signaling_buffer_bytes / sizeof(int)),
                advance(rdma_buffer, send_buffer_bytes * i),
                advance(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * i),
                advance<int*>(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * 2 + signaling_buffer_bytes * i),
                advance(rdma_buffer, send_buffer_bytes * i),
                advance(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * i),
                advance<int*>(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * 2 + signaling_buffer_bytes * i),
                advance(rdma_buffer, send_buffer_bytes * i),
                num_bytes_per_combine_msg
            };
        }
    }
};

size_t get_low_latency_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
    auto num_bytes = LowLatencyLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts).total_bytes;
    return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) * NUM_BUFFER_ALIGNMENT_BYTES;
}

} // namespace deep_ep
File diff suppressed because it is too large

@@ -0,0 +1,157 @@
#pragma once

// Forcibly disable NDEBUG
#ifdef NDEBUG
#undef NDEBUG
#endif

#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <torch/types.h>
#include <tuple>
#include <vector>

#include "config.hpp"
#include "event.hpp"
#include "kernels/configs.cuh"
#include "kernels/exception.cuh"

#ifndef TORCH_EXTENSION_NAME
#define TORCH_EXTENSION_NAME deep_ep_cpp
#endif

namespace deep_ep {

struct Buffer {
    EP_STATIC_ASSERT(NUM_MAX_NVL_PEERS == 8, "The number of maximum NVLink peers must be 8");

private:
    // Low-latency mode buffer
    int low_latency_buffer_idx = 0;
    bool low_latency_mode = false;

    // NVLink Buffer
    int64_t num_nvl_bytes;
    void* buffer_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
    void** buffer_ptrs_gpu = nullptr;

    // NVSHMEM Buffer
    int64_t num_rdma_bytes;
    void* rdma_buffer_ptr = nullptr;

    // Device info and communication
    int device_id;
    int num_device_sms;
    int rank, rdma_rank, nvl_rank;
    int num_ranks, num_rdma_ranks, num_nvl_ranks;
    cudaIpcMemHandle_t ipc_handles[NUM_MAX_NVL_PEERS];

    // Stream for communication
    at::cuda::CUDAStream comm_stream;

    // After IPC/NVSHMEM synchronization, this flag will be true
    bool available = false;

    // Barrier signals
    int* barrier_signal_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
    int** barrier_signal_ptrs_gpu = nullptr;

    // Workspace
    void* workspace = nullptr;

    // Host-side MoE info
    volatile int* moe_recv_counter = nullptr;
    int* moe_recv_counter_mapped = nullptr;

    // Host-side expert-level MoE info
    volatile int* moe_recv_expert_counter = nullptr;
    int* moe_recv_expert_counter_mapped = nullptr;

    // Host-side RDMA-level MoE info
    volatile int* moe_recv_rdma_counter = nullptr;
    int* moe_recv_rdma_counter_mapped = nullptr;

public:
    Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode);

    ~Buffer() noexcept(false);

    bool is_available() const;

    bool is_internode_available() const;

    int get_num_rdma_ranks() const;

    int get_rdma_rank() const;

    int get_root_rdma_rank(bool global) const;

    int get_local_device_id() const;

    pybind11::bytearray get_local_ipc_handle() const;

    pybind11::bytearray get_local_nvshmem_unique_id() const;

    torch::Tensor get_local_buffer_tensor(const pybind11::object& dtype, int64_t offset, bool use_rdma_buffer) const;

    torch::Stream get_comm_stream() const;

    void sync(const std::vector<int>& device_ids, const std::vector<std::optional<pybind11::bytearray>>& all_gathered_handles, const std::optional<pybind11::bytearray>& root_unique_id_opt);

    std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
    get_dispatch_layout(const torch::Tensor& topk_idx, int num_experts, std::optional<EventHandle>& previous_event,
                        bool async, bool allocate_on_comm_stream);

    std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
    intranode_dispatch(const torch::Tensor& x, const std::optional<torch::Tensor>& x_scales,
                       const std::optional<torch::Tensor>& topk_idx, const std::optional<torch::Tensor>& topk_weights,
                       const std::optional<torch::Tensor>& num_tokens_per_rank, const torch::Tensor& is_token_in_rank, const std::optional<torch::Tensor>& num_tokens_per_expert,
                       int cached_num_recv_tokens, const std::optional<torch::Tensor>& cached_rank_prefix_matrix, const std::optional<torch::Tensor>& cached_channel_prefix_matrix,
                       int expert_alignment, int num_worst_tokens, const Config& config,
                       std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);

    std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
    intranode_combine(const torch::Tensor& x, const std::optional<torch::Tensor>& topk_weights,
                      const std::optional<torch::Tensor>& bias_0, const std::optional<torch::Tensor>& bias_1,
                      const torch::Tensor& src_idx, const torch::Tensor& rank_prefix_matrix, const torch::Tensor& channel_prefix_matrix,
                      const torch::Tensor& send_head, const Config& config, std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);

    std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<EventHandle>>
    internode_dispatch(const torch::Tensor& x, const std::optional<torch::Tensor>& x_scales,
                       const std::optional<torch::Tensor>& topk_idx, const std::optional<torch::Tensor>& topk_weights,
                       const std::optional<torch::Tensor>& num_tokens_per_rank, const std::optional<torch::Tensor>& num_tokens_per_rdma_rank,
                       const torch::Tensor& is_token_in_rank, const std::optional<torch::Tensor>& num_tokens_per_expert,
                       int cached_num_recv_tokens, int cached_num_rdma_recv_tokens,
                       const std::optional<torch::Tensor>& cached_rdma_channel_prefix_matrix, const std::optional<torch::Tensor>& cached_recv_rdma_rank_prefix_sum,
                       const std::optional<torch::Tensor>& cached_gbl_channel_prefix_matrix, const std::optional<torch::Tensor>& cached_recv_gbl_rank_prefix_sum,
                       int expert_alignment, const Config& config, std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);

    std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
    internode_combine(const torch::Tensor& x, const std::optional<torch::Tensor>& topk_weights,
                      const std::optional<torch::Tensor>& bias_0, const std::optional<torch::Tensor>& bias_1,
                      const torch::Tensor& src_meta, const torch::Tensor& is_combined_token_in_rank,
                      const torch::Tensor& rdma_channel_prefix_matrix, const torch::Tensor& rdma_rank_prefix_sum, const torch::Tensor& gbl_channel_prefix_matrix,
                      const torch::Tensor& combined_rdma_head, const torch::Tensor& combined_nvl_head,
                      const Config& config, std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);

    void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts);

    std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
    low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx,
                         const std::optional<torch::Tensor>& cumulative_local_expert_recv_stats,
                         int num_max_dispatch_tokens_per_rank, int num_experts,
                         bool use_fp8, bool round_scale, bool use_ue8m0,
                         bool async, bool return_recv_hook);

    std::tuple<torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
    low_latency_combine(const torch::Tensor& x, const torch::Tensor& topk_idx, const torch::Tensor& topk_weights,
                        const torch::Tensor& src_info, const torch::Tensor& layout_range,
                        int num_max_dispatch_tokens_per_rank, int num_experts,
                        bool zero_copy, bool async, bool return_recv_hook,
                        const std::optional<torch::Tensor>& out = std::nullopt);

    torch::Tensor
    get_next_low_latency_combine_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts) const;
};

} // namespace deep_ep
@@ -0,0 +1,43 @@
#include <ATen/cuda/CUDAContext.h>
#include <memory>

#include "kernels/exception.cuh"

namespace deep_ep {

struct EventHandle {
    std::shared_ptr<torch::Event> event;

    EventHandle() {
        event = std::make_shared<torch::Event>(torch::kCUDA);
        event->record(at::cuda::getCurrentCUDAStream());
    }

    explicit EventHandle(const at::cuda::CUDAStream& stream) {
        event = std::make_shared<torch::Event>(torch::kCUDA);
        event->record(stream);
    }

    EventHandle(const EventHandle& other) = default;

    void current_stream_wait() const {
        at::cuda::getCurrentCUDAStream().unwrap().wait(*event);
    }
};

torch::Event create_event(const at::cuda::CUDAStream &s) {
    auto event = torch::Event(torch::kCUDA);
    event.record(s);
    return event;
}

void stream_wait(const at::cuda::CUDAStream& s_0, const at::cuda::CUDAStream& s_1) {
    EP_HOST_ASSERT(s_0.id() != s_1.id());
    s_0.unwrap().wait(create_event(s_1));
}

void stream_wait(const at::cuda::CUDAStream& s, const EventHandle& event) {
    s.unwrap().wait(*event.event);
}

} // namespace deep_ep
@@ -0,0 +1,21 @@
function(add_deep_ep_library target_name source_file)
    add_library(${target_name} STATIC ${source_file})
    set_target_properties(${target_name} PROPERTIES
        POSITION_INDEPENDENT_CODE ON
        CXX_STANDARD_REQUIRED ON
        CUDA_STANDARD_REQUIRED ON
        CXX_STANDARD 17
        CUDA_STANDARD 17
        CUDA_SEPARABLE_COMPILATION ON
    )
    target_link_libraries(${target_name} PUBLIC nvshmem cudart cudadevrt mlx5)
endfunction()

add_deep_ep_library(runtime_cuda runtime.cu)
add_deep_ep_library(layout_cuda layout.cu)
add_deep_ep_library(intranode_cuda intranode.cu)
add_deep_ep_library(internode_cuda internode.cu)
add_deep_ep_library(internode_ll_cuda internode_ll.cu)

# Later, we should link all libraries in `EP_CUDA_LIBRARIES`
set(EP_CUDA_LIBRARIES runtime_cuda layout_cuda intranode_cuda internode_cuda internode_ll_cuda PARENT_SCOPE)
@@ -0,0 +1,167 @@
#pragma once

#include <vector>

namespace deep_ep {

// Intranode runtime
namespace intranode {

void barrier(int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream);

} // namespace intranode

// Internode runtime
namespace internode {

std::vector<uint8_t> get_unique_id();

int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks, bool low_latency_mode);

void *alloc(size_t size, size_t alignment);

void free(void *ptr);

void barrier();

void finalize();

} // namespace internode

// Layout kernels
namespace layout {

void get_dispatch_layout(const int64_t* topk_idx,
                         int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
                         int* num_tokens_per_expert, bool* is_token_in_rank,
                         int num_tokens, int num_topk, int num_ranks, int num_experts,
                         cudaStream_t stream);

} // namespace layout

// Intranode kernels
namespace intranode {

void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
                     const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
                     int num_tokens, const bool* is_token_in_rank, int* channel_prefix_matrix,
                     int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
                     void** buffer_ptrs, int** barrier_signal_ptrs, int rank,
                     cudaStream_t stream, int num_sms);

void cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
                            void** buffer_ptrs, int** barrier_signal_ptrs, int rank, int num_ranks,
                            cudaStream_t stream);

void dispatch(void* recv_x, float* recv_x_scales, int* recv_src_idx, int64_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
              int* send_head, const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
              const bool* is_token_in_rank, const int* channel_prefix_matrix,
              int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
              int scale_token_stride, int scale_hidden_stride,
              void** buffer_ptrs, int rank, int num_ranks,
              cudaStream_t stream, int num_sms,
              int num_max_send_tokens, int num_recv_buffer_tokens);

void cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels, int num_recv_tokens, int num_memset_int,
                           int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream);

void combine(cudaDataType_t type,
             void* recv_x, float* recv_topk_weights,
             const void* x, const float* topk_weights,
             const void* bias_0, const void* bias_1,
             const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
             int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
             void** buffer_ptrs, int rank, int num_ranks,
             cudaStream_t stream, int num_sms,
             int num_max_send_tokens, int num_recv_buffer_tokens);

} // namespace intranode

// Internode kernels
namespace internode {

int get_source_meta_bytes();

void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
                     const int* num_tokens_per_rdma_rank, int* moe_recv_rdma_counter_mapped,
                     const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
                     const bool* is_token_in_rank, int num_tokens, int num_channels,
                     int hidden_int4, int num_scales, int num_topk, int expert_alignment,
                     int* rdma_channel_prefix_matrix, int* recv_rdma_rank_prefix_sum,
                     int* gbl_channel_prefix_matrix, int* recv_gbl_rank_prefix_sum,
                     void* rdma_buffer_ptr, int num_max_rdma_chunked_recv_tokens,
                     void** buffer_ptrs, int num_max_nvl_chunked_recv_tokens,
                     int** barrier_signal_ptrs, int rank,
                     cudaStream_t stream, int64_t num_rdma_bytes, int64_t num_nvl_bytes,
                     bool low_latency_mode);

void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv_topk_weights, void* recv_src_meta,
              const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
              int* send_rdma_head, int* send_nvl_head,
              int* recv_rdma_channel_prefix_matrix, int* recv_gbl_channel_prefix_matrix,
              const int* rdma_channel_prefix_matrix, const int* recv_rdma_rank_prefix_sum,
              const int* gbl_channel_prefix_matrix, const int* recv_gbl_rank_prefix_sum,
              const bool* is_token_in_rank,
              int num_tokens, int hidden_int4, int num_scales, int num_topk, int num_experts,
              int scale_token_stride, int scale_hidden_stride,
              void* rdma_buffer_ptr, int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens,
              void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
              int rank, int num_ranks, bool is_cached_dispatch,
              cudaStream_t stream, int num_channels, bool low_latency_mode);

void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_topk_weights,
                   int num_ranks, int num_channels, int num_combined_tokens, int* combined_rdma_head,
                   const int* rdma_channel_prefix_matrix, const int* rdma_rank_prefix_sum, int* combined_nvl_head,
                   void* rdma_buffer_ptr, int num_max_rdma_chunked_recv_tokens,
                   void** buffer_ptrs, int num_max_nvl_chunked_recv_tokens,
                   int** barrier_signal_ptrs, int rank, cudaStream_t stream,
                   int64_t num_rdma_bytes, int64_t num_nvl_bytes,
                   bool is_cached_dispatch, bool low_latency_mode);

void combine(cudaDataType_t type,
             void* combined_x, float* combined_topk_weights,
             const bool* is_combined_token_in_rank,
             const void* x, const float* topk_weights,
             const void* bias_0, const void* bias_1,
             const int* combined_rdma_head, const int* combined_nvl_head,
             const void* src_meta, const int* rdma_channel_prefix_matrix, const int* rdma_rank_prefix_sum, const int* gbl_channel_prefix_matrix,
             int num_tokens, int num_combined_tokens, int hidden, int num_topk,
             void* rdma_buffer_ptr, int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens,
             void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
|
||||||
|
int rank, int num_ranks, cudaStream_t stream, int num_channels, bool low_latency_mode);
|
||||||
|
|
||||||
|
} // namespace internode
|
||||||
|
|
||||||
|
// Internode low-latency kernels
|
||||||
|
namespace internode_ll {
|
||||||
|
|
||||||
|
void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
|
||||||
|
int* clean_1, int num_clean_int_1,
|
||||||
|
cudaStream_t stream);
|
||||||
|
|
||||||
|
void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
|
||||||
|
int* packed_recv_src_info, int64_t* packed_recv_layout_range,
|
||||||
|
int* packed_recv_count,
|
||||||
|
int* cumulative_local_expert_recv_stats,
|
||||||
|
void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
|
||||||
|
const void* x, const int64_t* topk_idx,
|
||||||
|
int* next_clean, int num_next_clean_int,
|
||||||
|
int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
|
||||||
|
int num_topk, int num_experts, int rank, int num_ranks,
|
||||||
|
bool use_fp8, bool round_scale, bool use_ue8m0,
|
||||||
|
void* workspace, int num_device_sms,
|
||||||
|
cudaStream_t stream, int phases);
|
||||||
|
|
||||||
|
void combine(void* combined_x,
|
||||||
|
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
|
||||||
|
const void* x, const int64_t* topk_idx, const float* topk_weights,
|
||||||
|
const int* src_info, const int64_t* layout_range,
|
||||||
|
int* next_clean, int num_next_clean_int,
|
||||||
|
int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
|
||||||
|
int num_topk, int num_experts, int rank, int num_ranks,
|
||||||
|
void* workspace, int num_device_sms,
|
||||||
|
cudaStream_t stream, int phases, bool zero_copy);
|
||||||
|
|
||||||
|
} // namespace internode_ll
|
||||||
|
|
||||||
|
} // namespace deep_ep
|
||||||
|
|
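// Editor's note (not part of the original header): a minimal host-side usage sketch of the
// internode runtime declared above, assuming the root unique ID has been broadcast to all
// ranks out of band; `rank`, `num_ranks` and `num_bytes` are placeholder names.
//
//     auto root_unique_id = deep_ep::internode::get_unique_id();
//     deep_ep::internode::init(root_unique_id, rank, num_ranks, /*low_latency_mode=*/true);
//     void* rdma_buffer = deep_ep::internode::alloc(num_bytes, NUM_BUFFER_ALIGNMENT_BYTES);
//     deep_ep::internode::barrier();
//     deep_ep::internode::free(rdma_buffer);
//     deep_ep::internode::finalize();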
@ -0,0 +1,138 @@
#pragma once

#include "configs.cuh"
#include "exception.cuh"

namespace deep_ep {

template <typename dtype_t>
struct Buffer {
private:
    uint8_t* ptr;

public:
    int total_bytes;

    __device__ __forceinline__ Buffer() : ptr(nullptr), total_bytes(0) {}

    __device__ __forceinline__ Buffer(void* &gbl_ptr, int num_elems, int offset = 0) {
        total_bytes = num_elems * sizeof(dtype_t);
        ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + offset * sizeof(dtype_t);
        gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
    }

    __device__ __forceinline__ Buffer advance_also(void* &gbl_ptr) {
        gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
        return *this;
    }

    __device__ __forceinline__ dtype_t* buffer() {
        return reinterpret_cast<dtype_t*>(ptr);
    }

    __device__ __forceinline__ dtype_t& operator[](int idx) {
        return buffer()[idx];
    }
};

template <typename dtype_t, int kNumRanks = 1>
struct AsymBuffer {
private:
    uint8_t* ptrs[kNumRanks];
    int num_bytes;

public:
    int total_bytes;

    __device__ __forceinline__ AsymBuffer(void* &gbl_ptr, int num_elems, int num_ranks,
                                          int sm_id = 0, int num_sms = 1, int offset = 0) {
        EP_STATIC_ASSERT(kNumRanks == 1, "");
        num_bytes = num_elems * sizeof(dtype_t);

        int per_channel_bytes = num_bytes * num_ranks;
        total_bytes = per_channel_bytes * num_sms;
        ptrs[0] = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id + num_bytes * offset;
        gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
    }

    __device__ __forceinline__ AsymBuffer(void** gbl_ptrs, int num_elems, int num_ranks,
                                          int sm_id = 0, int num_sms = 1, int offset = 0) {
        EP_STATIC_ASSERT(kNumRanks > 1, "");
        num_bytes = num_elems * sizeof(dtype_t);

        int per_channel_bytes = num_bytes * num_ranks;
        total_bytes = per_channel_bytes * num_sms;
        for (int i = 0; i < kNumRanks; ++ i) {
            ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + per_channel_bytes * sm_id + num_bytes * offset;
            gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
        }
    }

    __device__ __forceinline__ void advance(int shift) {
        #pragma unroll
        for (int i = 0; i < kNumRanks; ++ i)
            ptrs[i] = ptrs[i] + shift * sizeof(dtype_t);
    }

    __device__ __forceinline__ AsymBuffer advance_also(void* &gbl_ptr) {
        gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
        return *this;
    }

    template<int kNumAlsoRanks>
    __device__ __forceinline__ AsymBuffer advance_also(void** gbl_ptrs) {
        for (int i = 0; i < kNumAlsoRanks; ++ i)
            gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
        return *this;
    }

    __device__ __forceinline__ dtype_t* buffer(int idx = 0) {
        EP_STATIC_ASSERT(kNumRanks == 1, "`buffer` is only available for the single-rank case");
        return reinterpret_cast<dtype_t*>(ptrs[0] + num_bytes * idx);
    }

    __device__ __forceinline__ dtype_t* buffer_by(int rank_idx, int idx = 0) {
        EP_STATIC_ASSERT(kNumRanks > 1, "`buffer_by` is only available for the multi-rank case");
        return reinterpret_cast<dtype_t*>(ptrs[rank_idx] + num_bytes * idx);
    }
};

template <typename dtype_t, bool kDecoupled = true>
struct SymBuffer {
private:
    // NOTES: for the non-decoupled case, `recv_ptr` is not used
    uint8_t* send_ptr;
    uint8_t* recv_ptr;
    int num_bytes;

public:
    int total_bytes;

    __device__ __forceinline__ SymBuffer(void* &gbl_ptr, int num_elems, int num_ranks,
                                         int sm_id = 0, int num_sms = 1) {
        num_bytes = num_elems * sizeof(dtype_t);

        int per_channel_bytes = num_bytes * num_ranks;
        total_bytes = per_channel_bytes * num_sms * (static_cast<int>(kDecoupled) + 1);
        send_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id;
        recv_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * (sm_id + num_sms);
        gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
    }

    __device__ __forceinline__ dtype_t* send_buffer(int idx = 0) {
        EP_STATIC_ASSERT(kDecoupled, "`send_buffer` is only available for the decoupled case");
        return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
    }

    __device__ __forceinline__ dtype_t* recv_buffer(int idx = 0) {
        EP_STATIC_ASSERT(kDecoupled, "`recv_buffer` is only available for the decoupled case");
        return reinterpret_cast<dtype_t*>(recv_ptr + num_bytes * idx);
    }

    __device__ __forceinline__ dtype_t* buffer(int idx = 0) {
        EP_STATIC_ASSERT(not kDecoupled, "`buffer` is only available for the non-decoupled case");
        return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
    }
};

} // namespace deep_ep
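// Editor's note (not part of the original header): a minimal device-side sketch of how these
// helpers carve typed regions out of one raw buffer; `raw_ptr`, `num_experts` and `num_ints`
// are hypothetical names, and each constructor advances `raw_ptr` past the region it claims.
//
//     void* raw_ptr = buffer_ptrs[rank];
//     auto counters = Buffer<int>(raw_ptr, num_experts);                       // indexable: counters[i]
//     auto channel  = AsymBuffer<int>(raw_ptr, num_ints, num_ranks, sm_id, num_sms);
//     auto queue    = SymBuffer<int4>(raw_ptr, num_ints, num_ranks, sm_id, num_sms);
//     // queue.send_buffer(peer) / queue.recv_buffer(peer) address per-rank slots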
@ -0,0 +1,67 @@
#pragma once

#define NUM_MAX_NVL_PEERS 8
#define NUM_MAX_RDMA_PEERS 20
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)
#define NUM_MAX_LOCAL_EXPERTS 1024
#define NUM_BUFFER_ALIGNMENT_BYTES 128

#define FINISHED_SUM_TAG 1024
#define NUM_WAIT_NANOSECONDS 500

#ifndef ENABLE_FAST_DEBUG
#define NUM_CPU_TIMEOUT_SECS 100
#define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s
#else
#define NUM_CPU_TIMEOUT_SECS 10
#define NUM_TIMEOUT_CYCLES 20000000000ull // 20G cycles ~= 10s
#endif

#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2

// Make CLion CUDA indexing work
#ifdef __CLION_IDE__
#define __CUDA_ARCH__ 900 // NOLINT(*-reserved-identifier)
#define __CUDACC_RDC__ // NOLINT(*-reserved-identifier)
#endif

// Remove Torch restrictions
#ifdef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_CONVERSIONS__
#endif
#ifdef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF_OPERATORS__
#endif
#ifdef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#endif
#ifdef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#endif
#ifdef __CUDA_NO_BFLOAT162_OPERATORS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__
#endif

#include <cstdint>
#include <cuda_bf16.h>
#include <cuda_runtime.h>

#ifndef DISABLE_SM90_FEATURES
#include <cuda_fp8.h>
#else
// Ampere does not support FP8 features
#define __NV_E4M3 0
#define __NV_E5M2 1
typedef int __nv_fp8_interpretation_t;
typedef int __nv_fp8x4_e4m3;
typedef uint8_t __nv_fp8_storage_t;
#endif

#ifndef DISABLE_NVSHMEM
#include <nvshmem.h>
#include <nvshmemx.h>
#include <infiniband/mlx5dv.h>
#include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
#include <device_host_transport/nvshmem_common_ibgda.h>
#endif
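// Editor's note (not part of the original header): the two phase constants above are bit masks
// that callers OR together into the `phases` argument of the low-latency kernels; a fused
// send-and-receive launch passes both bits, a decoupled launch passes only one:
//
//     int phases = LOW_LATENCY_SEND_PHASE | LOW_LATENCY_RECV_PHASE;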
@ -0,0 +1,51 @@
#pragma once

#include <string>
#include <exception>

#include "configs.cuh"

#ifndef EP_STATIC_ASSERT
#define EP_STATIC_ASSERT(cond, reason) static_assert(cond, reason)
#endif

class EPException: public std::exception {
private:
    std::string message = {};

public:
    explicit EPException(const char *name, const char* file, const int line, const std::string& error) {
        message = std::string("Failed: ") + name + " error " + file + ":" + std::to_string(line) + " '" + error + "'";
    }

    const char *what() const noexcept override { return message.c_str(); }
};

#ifndef CUDA_CHECK
#define CUDA_CHECK(cmd) \
do { \
    cudaError_t e = (cmd); \
    if (e != cudaSuccess) { \
        throw EPException("CUDA", __FILE__, __LINE__, cudaGetErrorString(e)); \
    } \
} while (0)
#endif

#ifndef EP_HOST_ASSERT
#define EP_HOST_ASSERT(cond) \
do { \
    if (not (cond)) { \
        throw EPException("Assertion", __FILE__, __LINE__, #cond); \
    } \
} while (0)
#endif

#ifndef EP_DEVICE_ASSERT
#define EP_DEVICE_ASSERT(cond) \
do { \
    if (not (cond)) { \
        printf("Assertion failed: %s:%d, condition: %s\n", __FILE__, __LINE__, #cond); \
        asm("trap;"); \
    } \
} while (0)
#endif
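// Editor's note (not part of the original header): a short usage sketch of the macros above;
// `ptr`, `num_bytes` and `stream` are placeholder names.
//
//     EP_HOST_ASSERT(num_ranks > 0);
//     CUDA_CHECK(cudaMemsetAsync(ptr, 0, num_bytes, stream));   // throws EPException on CUDA errors
//
// On the device side, `EP_DEVICE_ASSERT(cond)` prints the failing condition and traps the kernel.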
@ -0,0 +1,482 @@
// Portions derived from NVSHMEM (https://developer.nvidia.com/nvshmem)
// Copyright (c) NVIDIA Corporation.
// Licensed under the NVSHMEM Software License Agreement (version: September 3, 2019).
// See full license at: https://docs.nvidia.com/nvshmem/api/sla.html
//
// Modified from original source:
//  - nvshmem/src/include/non_abi/device/pt-to-pt/ibgda_device.cuh
#pragma once

#include "configs.cuh"
#include "exception.cuh"
#include "utils.cuh"

namespace deep_ep {

EP_STATIC_ASSERT(NVSHMEMI_IBGDA_MIN_QP_DEPTH >= 64, "Invalid QP minimum depth");

__device__ static __forceinline__
uint64_t HtoBE64(uint64_t x) {
    uint64_t ret;
    asm("{\n\t"
        ".reg .b32 ign;\n\t"
        ".reg .b32 lo;\n\t"
        ".reg .b32 hi;\n\t"
        ".reg .b32 new_lo;\n\t"
        ".reg .b32 new_hi;\n\t"
        "mov.b64 {lo,hi}, %1;\n\t"
        "prmt.b32 new_hi, lo, ign, 0x0123;\n\t"
        "prmt.b32 new_lo, hi, ign, 0x0123;\n\t"
        "mov.b64 %0, {new_lo,new_hi};\n\t"
        "}" : "=l"(ret) : "l"(x));
    return ret;
}

__device__ static __forceinline__
uint32_t HtoBE32(uint32_t x) {
    uint32_t ret;
    asm("{\n\t"
        ".reg .b32 ign;\n\t"
        "prmt.b32 %0, %1, ign, 0x0123;\n\t"
        "}" : "=r"(ret) : "r"(x));
    return ret;
}

__device__ static __forceinline__
uint16_t HtoBE16(uint16_t x) {
    // TODO: simplify PTX using 16-bit instructions
    auto a = static_cast<uint32_t>(x);
    uint32_t d;
    asm volatile(
        "{\n\t"
        ".reg .b32 mask;\n\t"
        ".reg .b32 ign;\n\t"
        "mov.b32 mask, 0x4401;\n\t"
        "mov.b32 ign, 0x0;\n\t"
        "prmt.b32 %0, %1, ign, mask;\n\t"
        "}"
        : "=r"(d)
        : "r"(a));
    return static_cast<uint16_t>(d);
}

typedef struct mlx5_wqe_ctrl_seg __attribute__((__aligned__(8))) ibgda_ctrl_seg_t;

typedef struct {
    uint32_t add_data;
    uint32_t field_boundary;
    uint64_t reserved;
} __attribute__((__packed__)) ibgda_atomic_32_masked_fa_seg_t;

__device__ static __forceinline__
nvshmemi_ibgda_device_state_t* ibgda_get_state() {
    return &nvshmemi_ibgda_device_state_d;
}

__device__ static __forceinline__
nvshmemi_ibgda_device_qp_t* ibgda_get_rc(int pe, int id) {
    auto state = ibgda_get_state();
    const auto num_rc_per_pe = ibgda_get_state()->num_rc_per_pe;
    return &state->globalmem.rcs[pe * num_rc_per_pe + id % num_rc_per_pe];
}

__device__ static __forceinline__
void ibgda_lock_acquire(int *lock) {
    while (atomicCAS(lock, 0, 1) == 1);

    // Prevent reordering before the lock is acquired
    memory_fence_cta();
}

__device__ static __forceinline__
void ibgda_lock_release(int *lock) {
    memory_fence_cta();

    // Prevent reordering before the lock is released
    st_na_relaxed(lock, 0);
}

__device__ static __forceinline__
void ibgda_update_dbr(nvshmemi_ibgda_device_qp_t *qp, uint32_t dbrec_head) {
    // `DBREC` contains the index of the next empty `WQEBB`
    __be32 dbrec_val;
    __be32 *dbrec_ptr = qp->tx_wq.dbrec;

    // This is equivalent to `WRITE_ONCE(dbrec_ptr, HtoBE32(dbrec_head & 0xffff))`
    asm("{\n\t"
        ".reg .b32 dbrec_head_16b;\n\t"
        ".reg .b32 ign;\n\t"
        "and.b32 dbrec_head_16b, %1, 0xffff;\n\t"
        "prmt.b32 %0, dbrec_head_16b, ign, 0x123;\n\t"
        "}"
        : "=r"(dbrec_val)
        : "r"(dbrec_head));
    st_na_release(dbrec_ptr, dbrec_val);
}

__device__ static __forceinline__
void ibgda_ring_db(nvshmemi_ibgda_device_qp_t *qp, uint16_t prod_idx) {
    auto bf_ptr = reinterpret_cast<uint64_t*>(qp->tx_wq.bf);
    ibgda_ctrl_seg_t ctrl_seg = {
        .opmod_idx_opcode = HtoBE32(prod_idx << 8),
        .qpn_ds = HtoBE32(qp->qpn << 8)
    };

    EP_STATIC_ASSERT(sizeof(decltype(&ctrl_seg)) == sizeof(uint64_t), "");
    st_na_release(bf_ptr, *(reinterpret_cast<uint64_t*>(&ctrl_seg)));
}

__device__ static __forceinline__
void ibgda_post_send(nvshmemi_ibgda_device_qp_t *qp, uint64_t new_prod_idx) {
    nvshmemi_ibgda_device_qp_management_t *mvars = &qp->mvars;
    uint64_t old_prod_idx;

    // Update `prod_idx` before ringing the doorbell, so that we know which index is needed in quiet/fence
    ibgda_lock_acquire(&mvars->post_send_lock);

    old_prod_idx = atomicMax(reinterpret_cast<unsigned long long int*>(&mvars->tx_wq.prod_idx), new_prod_idx);
    if (new_prod_idx > old_prod_idx) {
        ibgda_update_dbr(qp, new_prod_idx);
        ibgda_ring_db(qp, new_prod_idx);
    }
    ibgda_lock_release(&mvars->post_send_lock);
}

template <bool kAlwaysDoPostSend>
__device__ static __forceinline__
void ibgda_submit_requests(nvshmemi_ibgda_device_qp_t *qp, uint64_t base_wqe_idx,
                           uint32_t num_wqes, int message_idx = 0) {
    nvshmemi_ibgda_device_qp_management_t *mvars = &qp->mvars;
    uint64_t new_wqe_idx = base_wqe_idx + num_wqes;

    // WQE writes must be finished first
    __threadfence();

    // Wait for prior WQE slots to be filled first
    auto *ready_idx = reinterpret_cast<unsigned long long int*>(&mvars->tx_wq.ready_head);
    while (atomicCAS(ready_idx, base_wqe_idx, new_wqe_idx) != base_wqe_idx);

    // Either always post, or post in batches
    constexpr int kNumRequestInBatch = 4;
    if (kAlwaysDoPostSend or (message_idx + 1) % kNumRequestInBatch == 0)
        ibgda_post_send(qp, new_wqe_idx);
}

__device__ static __forceinline__ void
ibgda_write_rdma_write_inl_wqe(nvshmemi_ibgda_device_qp_t *qp, const uint32_t *val, uint64_t raddr,
                               __be32 rkey, uint16_t wqe_idx, void** out_wqes, uint32_t imm) {
    ibgda_ctrl_seg_t ctrl_seg;
    struct mlx5_wqe_raddr_seg raddr_seg;
    struct mlx5_wqe_inl_data_seg inl_seg;

    auto *ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
    auto *raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
    auto *inl_seg_ptr = reinterpret_cast<mlx5_wqe_inl_data_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
    auto *wqe_data_ptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(inl_seg_ptr) + sizeof(*inl_seg_ptr));

    raddr_seg.raddr = HtoBE64(raddr);
    raddr_seg.rkey = rkey;
    raddr_seg.reserved = 0;

    inl_seg.byte_count = HtoBE32(4 | MLX5_INLINE_SEG);

    // `imm == std::numeric_limits<uint32_t>::max()` means no imm writes
    ctrl_seg = {0};
    ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 3);
    ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
    ctrl_seg.opmod_idx_opcode = HtoBE32((wqe_idx << 8) | (imm != std::numeric_limits<uint32_t>::max() ? MLX5_OPCODE_RDMA_WRITE_IMM : MLX5_OPCODE_RDMA_WRITE));
    if (imm != std::numeric_limits<uint32_t>::max())
        ctrl_seg.imm = HtoBE32(imm);

    EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == 16, "sizeof(*ctrl_seg_ptr) == 16");
    EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == 16, "sizeof(*raddr_seg_ptr) == 16");
    EP_STATIC_ASSERT(sizeof(*inl_seg_ptr) == 4, "sizeof(*inl_seg_ptr) == 4");
    st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<const int4*>(&ctrl_seg));
    st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<const int4*>(&raddr_seg));
    st_na_relaxed(reinterpret_cast<uint32_t*>(inl_seg_ptr), *reinterpret_cast<const uint32_t*>(&inl_seg));
    st_na_relaxed(reinterpret_cast<uint32_t*>(wqe_data_ptr), *reinterpret_cast<const uint32_t*>(val));
}

__device__ static __forceinline__
uint64_t ibgda_get_lkey_and_rkey(uint64_t laddr, __be32 *lkey,
                                 uint64_t raddr, int dst_pe, uint64_t *out_raddr, __be32 *out_rkey) {
    auto state = ibgda_get_state();
    auto heap_start = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.heap_base);
    auto log2_cumem_granularity = state->log2_cumem_granularity;

    // Local key
    uint64_t idx = (laddr - heap_start) >> log2_cumem_granularity;
    auto device_key = state->constmem.lkeys[idx];
    auto lchunk_size = device_key.next_addr - laddr;
    *lkey = device_key.key;

    // Remote key
    uint64_t roffset = raddr - heap_start;
    idx = ((roffset >> log2_cumem_granularity) * nvshmemi_device_state_d.npes) + dst_pe;
    if (idx < NVSHMEMI_IBGDA_MAX_CONST_RKEYS) {
        device_key = state->constmem.rkeys[idx];
    } else {
        device_key = state->globalmem.rkeys[idx - NVSHMEMI_IBGDA_MAX_CONST_RKEYS];
    }
    *out_raddr = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.peer_heap_base_remote[dst_pe]) + roffset;
    *out_rkey = device_key.key;

    // Return the minimum of local and remote chunk sizes
    auto rchunk_size = device_key.next_addr - roffset;
    return min(lchunk_size, rchunk_size);
}

__device__ static __forceinline__ void
ibgda_get_rkey(uint64_t addr, int dst_pe, uint64_t *out_raddr, __be32 *out_rkey) {
    auto state = ibgda_get_state();
    auto heap_start = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.heap_base);

    uint64_t roffset = addr - heap_start;
    uint64_t idx = ((roffset >> state->log2_cumem_granularity) * nvshmemi_device_state_d.npes) + dst_pe;
    nvshmemi_ibgda_device_key_t device_key;
    if (idx < NVSHMEMI_IBGDA_MAX_CONST_RKEYS)
        device_key = state->constmem.rkeys[idx];
    else
        device_key = state->globalmem.rkeys[idx - NVSHMEMI_IBGDA_MAX_CONST_RKEYS];
    *out_raddr = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.peer_heap_base_remote[dst_pe]) + roffset;
    *out_rkey = device_key.key;
}

__device__ static __forceinline__ uint64_t
ibgda_reserve_wqe_slots(nvshmemi_ibgda_device_qp_t *qp, uint32_t num_wqes) {
    auto mvars = &qp->mvars;
    return atomicAdd(reinterpret_cast<unsigned long long*>(&mvars->tx_wq.resv_head), static_cast<unsigned long long>(num_wqes));
}

__device__ static __forceinline__ void*
ibgda_get_wqe_ptr(nvshmemi_ibgda_device_qp_t* qp, uint16_t wqe_idx) {
    uint16_t cnt = qp->tx_wq.nwqes;
    uint16_t idx = wqe_idx & (cnt - 1);
    return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(qp->tx_wq.wqe) + (idx << MLX5_SEND_WQE_SHIFT));
}

__device__ static __forceinline__ void
nvshmemi_ibgda_rma_p(int *rptr, const int value, int dst_pe, int qp_id, uint32_t imm = std::numeric_limits<uint32_t>::max()) {
    // Get rkey
    // NOTES: the `p` operation will not cross multiple remote chunks
    __be32 rkey;
    uint64_t raddr;
    ibgda_get_rkey(reinterpret_cast<uint64_t>(rptr), dst_pe, &raddr, &rkey);

    // Write WQEs
    auto qp = ibgda_get_rc(dst_pe, qp_id);
    uint64_t base_wqe_idx = ibgda_reserve_wqe_slots(qp, 1);
    void *wqe_ptrs;
    wqe_ptrs = ibgda_get_wqe_ptr(qp, base_wqe_idx);
    ibgda_write_rdma_write_inl_wqe(qp, reinterpret_cast<const uint32_t*>(&value), raddr, rkey, base_wqe_idx, &wqe_ptrs, imm);

    // Submit requests
    ibgda_submit_requests<true>(qp, base_wqe_idx, 1);
}

__device__ static __forceinline__ void
ibgda_write_rdma_write_wqe(nvshmemi_ibgda_device_qp_t *qp, uint64_t laddr, __be32 lkey,
                           uint64_t raddr, __be32 rkey, uint32_t bytes, uint16_t wqe_idx,
                           void** out_wqes) {
    ibgda_ctrl_seg_t ctrl_seg;
    struct mlx5_wqe_raddr_seg raddr_seg;
    struct mlx5_wqe_data_seg data_seg;

    auto *ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
    void *av_seg_ptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
    struct mlx5_wqe_raddr_seg *raddr_seg_ptr;
    struct mlx5_wqe_data_seg *data_seg_ptr;

    raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(av_seg_ptr));
    data_seg_ptr = reinterpret_cast<mlx5_wqe_data_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));

    raddr_seg.raddr = HtoBE64(raddr);
    raddr_seg.rkey = rkey;
    raddr_seg.reserved = 0;

    data_seg.byte_count = HtoBE32(bytes);
    data_seg.lkey = lkey;
    data_seg.addr = HtoBE64(laddr);

    ctrl_seg = {0};
    ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 3);
    ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
    ctrl_seg.opmod_idx_opcode = HtoBE32((wqe_idx << 8) | MLX5_OPCODE_RDMA_WRITE);

    EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == 16, "sizeof(*ctrl_seg_ptr) == 16");
    EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == 16, "sizeof(*raddr_seg_ptr) == 16");
    EP_STATIC_ASSERT(sizeof(*data_seg_ptr) == 16, "sizeof(*data_seg_ptr) == 16");
    st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<const int4*>(&ctrl_seg));
    st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<const int4*>(&raddr_seg));
    st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<const int4*>(&data_seg));
}

__device__ static __forceinline__ void
ibgda_write_empty_recv_wqe(void *out_wqe) {
    auto *data_seg_ptr = reinterpret_cast<struct mlx5_wqe_data_seg*>(out_wqe);
    struct mlx5_wqe_data_seg data_seg;

    // Make the first segment in the WQE invalid, then the entire list will be invalid
    data_seg.byte_count = 0;
    data_seg.lkey = HtoBE64(MLX5_INVALID_LKEY);
    data_seg.addr = 0;

    EP_STATIC_ASSERT(sizeof(mlx5_wqe_data_seg) == sizeof(int4), "Invalid data type length");
    st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<const int4*>(&data_seg));
}

template <bool kAlwaysDoPostSend = false>
__device__ static __forceinline__ void
nvshmemi_ibgda_put_nbi_warp(uint64_t req_rptr, uint64_t req_lptr, size_t bytes, int dst_pe, int qp_id, int lane_id, int message_idx) {
    // Get lkey and rkey, store them into lanes
    uint32_t num_wqes = 0;
    __be32 my_lkey = 0;
    uint64_t my_laddr = 0;
    __be32 my_rkey = 0;
    uint64_t my_raddr = 0;
    uint64_t my_chunk_size = 0;

    // Decide how many messages (theoretically 3 at maximum)
    auto remaining_bytes = bytes;
    while (remaining_bytes > 0) {
        if (lane_id == num_wqes)
            my_chunk_size = min(remaining_bytes, ibgda_get_lkey_and_rkey(my_laddr = req_lptr, &my_lkey, req_rptr, dst_pe, &my_raddr, &my_rkey));

        // Move one more message
        auto chunk_size = __shfl_sync(0xffffffff, my_chunk_size, static_cast<int>(num_wqes));
        remaining_bytes -= chunk_size;
        req_lptr += chunk_size;
        req_rptr += chunk_size;
        ++ num_wqes;
    }
    EP_DEVICE_ASSERT(num_wqes <= 32);

    // Process WQE
    auto qp = ibgda_get_rc(dst_pe, qp_id);
    uint64_t base_wqe_idx = 0;
    if (lane_id == 0)
        base_wqe_idx = ibgda_reserve_wqe_slots(qp, num_wqes);
    base_wqe_idx = __shfl_sync(0xffffffff, base_wqe_idx, 0);
    if (lane_id < num_wqes) {
        auto wqe_ptr = ibgda_get_wqe_ptr(qp, base_wqe_idx + lane_id);
        ibgda_write_rdma_write_wqe(qp, my_laddr, my_lkey, my_raddr, my_rkey, my_chunk_size,
                                   base_wqe_idx, &wqe_ptr);
    }
    __syncwarp();

    // Submit
    if (lane_id == 0)
        ibgda_submit_requests<kAlwaysDoPostSend>(qp, base_wqe_idx, num_wqes, message_idx);
    __syncwarp();
}

__device__ static __forceinline__ void ibgda_write_amo_add_wqe(
        nvshmemi_ibgda_device_qp_t *qp, const int &value,
        uint64_t laddr, __be32 lkey, uint64_t raddr, __be32 rkey,
        uint16_t wqe_idx, void** out_wqes) {
    ibgda_ctrl_seg_t ctrl_seg = {0};
    struct mlx5_wqe_raddr_seg raddr_seg;
    struct mlx5_wqe_atomic_seg atomic_seg_1;
    struct mlx5_wqe_data_seg data_seg;

    auto ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
    auto raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
    auto atomic_seg_ptr = reinterpret_cast<mlx5_wqe_atomic_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
    auto data_seg_ptr = reinterpret_cast<mlx5_wqe_data_seg*>(reinterpret_cast<uintptr_t>(atomic_seg_ptr) + sizeof(*atomic_seg_ptr));

    raddr_seg.raddr = HtoBE64(raddr);
    raddr_seg.rkey = rkey;
    raddr_seg.reserved = 0;

    // NOTES: `0x08000000` means `IBGDA_4_BYTE_EXT_AMO_OPMOD`
    ctrl_seg.opmod_idx_opcode = HtoBE32(MLX5_OPCODE_ATOMIC_MASKED_FA | (wqe_idx << 8) | 0x08000000);
    auto atomic_32_masked_fa_seg = reinterpret_cast<ibgda_atomic_32_masked_fa_seg_t*>(&atomic_seg_1);
    atomic_32_masked_fa_seg->add_data = HtoBE32(value);
    atomic_32_masked_fa_seg->field_boundary = 0;

    ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 4);
    ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;

    data_seg.byte_count = HtoBE32(sizeof(int));
    data_seg.lkey = lkey;
    data_seg.addr = HtoBE64(laddr);

    EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == sizeof(int4), "Invalid vectorization");
    EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == sizeof(int4), "Invalid vectorization");
    EP_STATIC_ASSERT(sizeof(*atomic_seg_ptr) == sizeof(int4), "Invalid vectorization");
    EP_STATIC_ASSERT(sizeof(*data_seg_ptr) == sizeof(int4), "Invalid vectorization");
    st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<int4*>(&ctrl_seg));
    st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<int4*>(&raddr_seg));
    st_na_relaxed(reinterpret_cast<int4*>(atomic_seg_ptr), *reinterpret_cast<int4*>(&atomic_seg_1));
    st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<int4*>(&data_seg));
}

__device__ __forceinline__ void nvshmemi_ibgda_amo_nonfetch_add(void *rptr, const int& value, int pe, int qp_id, bool is_local_copy = false) {
    if (is_local_copy) {
        atomicAdd(static_cast<unsigned long long*>(rptr), value);
    } else {
        nvshmemi_ibgda_device_qp_t *qp = ibgda_get_rc(pe, qp_id);

        __be32 rkey;
        uint64_t raddr;
        ibgda_get_rkey(reinterpret_cast<uint64_t>(rptr), pe, &raddr, &rkey);

        uint64_t my_wqe_idx = ibgda_reserve_wqe_slots(qp, 1);
        void *wqe_ptrs = ibgda_get_wqe_ptr(qp, my_wqe_idx);

        ibgda_write_amo_add_wqe(qp, value, reinterpret_cast<uint64_t>(qp->ibuf.buf),
                                qp->ibuf.lkey, raddr, rkey, my_wqe_idx, &wqe_ptrs);

        ibgda_submit_requests<true>(qp, my_wqe_idx, 1);
    }
}

__device__ __forceinline__ uint64_t nvshmemi_get_p2p_ptr(const uint64_t& ptr, const int& rank, const int& dst_rank) {
    // Local rank, no need for mapping
    if (rank == dst_rank)
        return ptr;
    auto peer_base = __ldg(reinterpret_cast<uint64_t*>(nvshmemi_device_state_d.peer_heap_base_p2p) + dst_rank);

    // RDMA connected
    if (peer_base == 0)
        return 0;

    // NVLink P2P is enabled
    return peer_base + (ptr - reinterpret_cast<uint64_t>(nvshmemi_device_state_d.heap_base));
}

// This is a simplified version of NVSHMEM's `ibgda_poll_cq`.
// Note that this implementation does not guarantee thread safety,
// so we must ensure that no other threads are concurrently using the same QP.
__device__ static __forceinline__ void
ibgda_poll_cq(nvshmemi_ibgda_device_cq_t *cq, uint64_t idx) {
    const auto cqe64 = static_cast<mlx5_cqe64*>(cq->cqe);
    const uint32_t ncqes = cq->ncqes;
    memory_fence_cta();

    // NOTES: this while loop is part of the do-while below.
    // `wqe_counter` is the HW consumer index. However, we always maintain `index + 1`,
    // so to compare with the index we need to use `wqe_counter + 1`.
    // Because `wqe_counter` is `uint16_t`, it may overflow. Still, we know for
    // sure that if `idx - wqe_counter - 1 < ncqes`, then `wqe_counter + 1` is less than
    // `idx`, and thus we need to wait. We don't need to wait when `idx == wqe_counter + 1`.
    // That's why we use `- 2` here to make this case overflow.
    uint16_t wqe_counter;
    do {
        wqe_counter = HtoBE16(ld_na_relaxed(&cqe64->wqe_counter));
    } while ((static_cast<uint16_t>(static_cast<uint16_t>(idx) - wqe_counter - static_cast<uint16_t>(2)) < ncqes));
    *cq->cons_idx = idx;

    // Prevent reordering of this function and later instructions
    memory_fence_cta();
}

// Wait until WQE `idx - 1` is completed.
__device__ static __forceinline__ void
nvshmemi_ibgda_quiet(int dst_pe, int qp_id) {
    auto qp = ibgda_get_rc(dst_pe, qp_id);
    uint64_t prod_idx = ld_na_relaxed(qp->tx_wq.prod_idx);
    ibgda_poll_cq(qp->tx_wq.cq, prod_idx);
}

} // namespace deep_ep
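// Editor's note (not part of the original header): the device-side send helpers above compose
// into a reserve -> build-WQE -> submit sequence; `nvshmemi_ibgda_rma_p` is the simplest
// instance of the pattern, roughly:
//
//     auto qp = ibgda_get_rc(dst_pe, qp_id);                 // pick an RC QP for the peer
//     uint64_t wqe_idx = ibgda_reserve_wqe_slots(qp, 1);     // atomically reserve a slot
//     void* wqe_ptr = ibgda_get_wqe_ptr(qp, wqe_idx);        // locate the WQE buffer
//     /* fill the control/raddr/data segments into wqe_ptr */
//     ibgda_submit_requests<true>(qp, wqe_idx, 1);           // update the DBR and ring the doorbell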
File diff suppressed because it is too large
@ -0,0 +1,584 @@
|
||||||
|
#include "configs.cuh"
|
||||||
|
#include "exception.cuh"
|
||||||
|
#include "launch.cuh"
|
||||||
|
#include "ibgda_device.cuh"
|
||||||
|
|
||||||
|
namespace deep_ep {
|
||||||
|
|
||||||
|
namespace internode_ll {
|
||||||
|
|
||||||
|
template <int kNumThreads> __launch_bounds__(kNumThreads, 1)
|
||||||
|
__global__ void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
|
||||||
|
int* clean_1, int num_clean_int_1) {
|
||||||
|
// Barrier before cleaning (in case of unfinished chunked EP)
|
||||||
|
nvshmemx_barrier_all_block();
|
||||||
|
|
||||||
|
// Clean
|
||||||
|
auto thread_id = static_cast<int>(threadIdx.x);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = thread_id; i < num_clean_int_0; i += kNumThreads)
|
||||||
|
clean_0[i] = 0;
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = thread_id; i < num_clean_int_1; i += kNumThreads)
|
||||||
|
clean_1[i] = 0;
|
||||||
|
|
||||||
|
// Barrier after cleaning (make sure the low-latency mode works fine)
|
||||||
|
nvshmemx_barrier_all_block();
|
||||||
|
}
|
||||||
|
|
||||||
|
void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
|
||||||
|
int* clean_1, int num_clean_int_1,
|
||||||
|
cudaStream_t stream) {
|
||||||
|
constexpr int kNumThreads = 256;
|
||||||
|
|
||||||
|
SETUP_LAUNCH_CONFIG(1, kNumThreads, stream);
|
||||||
|
LAUNCH_KERNEL(&cfg, clean_low_latency_buffer<kNumThreads>,
|
||||||
|
clean_0, num_clean_int_0, clean_1, num_clean_int_1);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <bool kUseFP8, bool kUseUE8M0, int kHidden>
|
||||||
|
__global__ __launch_bounds__(1024, 1) void
|
||||||
|
dispatch(void* packed_recv_x, void* packed_recv_x_scales,
|
||||||
|
int* packed_recv_src_info, int64_t* packed_recv_layout_range,
|
||||||
|
int* packed_recv_count,
|
||||||
|
int* cumulative_local_expert_recv_stats,
|
||||||
|
void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
|
||||||
|
const void* x, const int64_t* topk_idx,
|
||||||
|
int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert,
|
||||||
|
int* next_clean, int num_next_clean_int,
|
||||||
|
int num_tokens, int num_max_dispatch_tokens_per_rank,
|
||||||
|
int num_topk, int num_experts, int rank, int num_ranks,
|
||||||
|
int num_warp_groups, int num_warps_per_group,
|
||||||
|
bool round_scale, int phases) {
|
||||||
|
const auto sm_id = static_cast<int>(blockIdx.x);
|
||||||
|
const auto thread_id = static_cast<int>(threadIdx.x);
|
||||||
|
const auto warp_id = thread_id / 32, lane_id = get_lane_id();
|
||||||
|
const auto num_sms = static_cast<int>(gridDim.x);
|
||||||
|
const auto num_warps = num_warp_groups * num_warps_per_group;
|
||||||
|
const auto num_local_experts = num_experts / num_ranks;
|
||||||
|
const auto warp_group_id = warp_id / num_warps_per_group;
|
||||||
|
const auto sub_warp_id = warp_id % num_warps_per_group;
|
||||||
|
const auto responsible_expert_idx = sm_id * num_warp_groups + warp_group_id;
|
||||||
|
|
||||||
|
// May extract UE8M0 from the scales
|
||||||
|
using scale_t = std::conditional_t<kUseUE8M0, uint8_t, float>;
|
||||||
|
using packed_t = std::conditional_t<kUseUE8M0, uint32_t, float>;
|
||||||
|
EP_STATIC_ASSERT(sizeof(packed_t) % sizeof(scale_t) == 0, "Invalid vector length");
|
||||||
|
|
||||||
|
// FP8 staffs
|
||||||
|
constexpr int kNumPerChannels = 128;
|
||||||
|
const int num_scales = kHidden / kNumPerChannels;
|
||||||
|
const size_t hidden_bytes = kHidden * (kUseFP8 ? sizeof(__nv_fp8_storage_t) : sizeof(nv_bfloat16));
|
||||||
|
const size_t hidden_int4 = hidden_bytes / sizeof(int4);
|
||||||
|
|
||||||
|
// Message package: hidden data, FP8 scales, index at source
|
||||||
|
// NOTES: currently we have 3 reserved int fields for future use
|
||||||
|
using vec_t = typename std::conditional<kUseFP8, int2, int4>::type;
|
||||||
|
const size_t num_bytes_per_msg = sizeof(int4) + (kUseFP8 ? (kHidden + num_scales * sizeof(float)) : (kHidden * sizeof(nv_bfloat16)));
|
||||||
|
const size_t num_int4_per_msg = num_bytes_per_msg / sizeof(int4);
|
||||||
|
EP_DEVICE_ASSERT(num_bytes_per_msg % sizeof(int4) == 0);
|
||||||
|
|
||||||
|
// Expert counts
|
||||||
|
constexpr int kNumMaxWarpGroups = 32;
|
||||||
|
__shared__ int shared_num_tokens_sent_per_expert[kNumMaxWarpGroups];
|
||||||
|
|
||||||
|
// Sending phase
|
||||||
|
if ((phases & LOW_LATENCY_SEND_PHASE) == 0)
|
||||||
|
goto LOW_LATENCY_DISPATCH_RECV;
|
||||||
|
|
||||||
|
// There are 2 kinds of warps in this part:
|
||||||
|
// 1. The first-kind warps for FP8 cast and sending top-k tokens
|
||||||
|
// 2. The last warp for reading `topk_idx` and count for per-expert information
|
||||||
|
if (warp_id < num_warps - 1) {
|
||||||
|
constexpr int kNumElemsPerRead = sizeof(int4) / sizeof(nv_bfloat16);
|
||||||
|
EP_DEVICE_ASSERT(kHidden % kNumElemsPerRead == 0);
|
||||||
|
EP_STATIC_ASSERT(kNumElemsPerRead * 32 % kNumPerChannels == 0, "Invalid vectorization");
|
||||||
|
const auto num_threads = (num_warps - 1) * 32;
|
||||||
|
const size_t hidden_bf16_int4 = kHidden / kNumElemsPerRead;
|
||||||
|
|
||||||
|
for (int token_idx = sm_id; token_idx < num_tokens; token_idx += num_sms) {
|
||||||
|
const auto x_int4 = static_cast<const int4*>(x) + token_idx * hidden_bf16_int4;
|
||||||
|
const auto rdma_x_src_idx = reinterpret_cast<int*>(static_cast<uint8_t*>(rdma_x) + token_idx * num_bytes_per_msg);
|
||||||
|
const auto rdma_x_vec = reinterpret_cast<vec_t*>(reinterpret_cast<uint8_t*>(rdma_x_src_idx) + sizeof(int4));
|
||||||
|
const auto rdma_x_scales = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(rdma_x_vec) + hidden_bytes);
|
||||||
|
|
||||||
|
// Overlap top-k index read and source token index writes
|
||||||
|
auto dst_expert_idx = warp_id < num_topk ? static_cast<int>(__ldg(topk_idx + token_idx * num_topk + warp_id)) : -1;
|
||||||
|
thread_id == 0 ? (*rdma_x_src_idx = token_idx) : 0;
|
||||||
|
|
||||||
|
// FP8 cast
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = thread_id; i < hidden_bf16_int4; i += num_threads) {
|
||||||
|
// Read
|
||||||
|
auto int4_value = __ldg(x_int4 + i);
|
||||||
|
|
||||||
|
if constexpr (kUseFP8) {
|
||||||
|
// Calculate local amax
|
||||||
|
auto bf16_values = reinterpret_cast<nv_bfloat16*>(&int4_value);
|
||||||
|
float fp32_values[kNumElemsPerRead];
|
||||||
|
float amax = kFP8Margin, scale, scale_inv;
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < kNumElemsPerRead; ++ j) {
|
||||||
|
fp32_values[j] = static_cast<float>(bf16_values[j]);
|
||||||
|
amax = fmaxf(amax, fabsf(fp32_values[j]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reduce amax and scale
|
||||||
|
EP_STATIC_ASSERT(kNumElemsPerRead * 32 / kNumPerChannels == 2, "Invalid vectorization");
|
||||||
|
amax = half_warp_reduce_max(amax);
|
||||||
|
calculate_fp8_scales(amax, scale, scale_inv, round_scale);
|
||||||
|
if (lane_id == 0 or lane_id == 16)
|
||||||
|
rdma_x_scales[i * kNumElemsPerRead / 128] = scale_inv;
|
||||||
|
|
||||||
|
// Cast into send buffer
|
||||||
|
vec_t int2_value;
|
||||||
|
auto fp8x2_values = reinterpret_cast<__nv_fp8x2_storage_t*>(&int2_value);
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < kNumElemsPerRead; j += 2) {
|
||||||
|
float2 fp32x2 = {fp32_values[j] * scale, fp32_values[j + 1] * scale};
|
||||||
|
fp8x2_values[j / 2] = __nv_cvt_float2_to_fp8x2(fp32x2, __NV_SATFINITE, __NV_E4M3);
|
||||||
|
}
|
||||||
|
rdma_x_vec[i] = int2_value;
|
||||||
|
} else {
|
||||||
|
// Reinterpret-cast is for C++14 compatibility
|
||||||
|
rdma_x_vec[i] = *reinterpret_cast<vec_t*>(&int4_value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
asm volatile("bar.sync 1, %0;" :: "r"(num_threads));
|
||||||
|
|
||||||
|
// Issue IBGDA sends
|
||||||
|
if (dst_expert_idx >= 0) {
|
||||||
|
int slot_idx = lane_id == 0 ? atomicAdd(atomic_counter_per_expert + dst_expert_idx, 1) : 0;
|
||||||
|
slot_idx = __shfl_sync(0xffffffff, slot_idx, 0);
|
||||||
|
const auto dst_rank = dst_expert_idx / num_local_experts;
|
||||||
|
const auto dst_expert_local_idx = dst_expert_idx % num_local_experts;
|
||||||
|
const auto src_ptr = reinterpret_cast<uint64_t>(rdma_x_src_idx);
|
||||||
|
const auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_x) +
|
||||||
|
dst_expert_local_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
|
||||||
|
rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
|
||||||
|
slot_idx * num_bytes_per_msg;
|
||||||
|
const auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
|
||||||
|
if (dst_p2p_ptr == 0) {
|
||||||
|
nvshmemi_ibgda_put_nbi_warp(dst_ptr, src_ptr, num_bytes_per_msg, dst_rank, dst_expert_local_idx, lane_id, slot_idx);
|
||||||
|
} else {
|
||||||
|
// NOTES: only 2 load iterations for 7K hidden with 8 unrolls
|
||||||
|
const auto* src_int4_ptr = reinterpret_cast<const int4*>(src_ptr);
|
||||||
|
const auto* dst_int4_ptr = reinterpret_cast<int4*>(dst_p2p_ptr);
|
||||||
|
UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, dst_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Increase counter after finishing
|
||||||
|
__syncwarp();
|
||||||
|
lane_id == 0 ? atomic_add_release_global(atomic_finish_counter_per_expert + dst_expert_idx, 1) : 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (warp_id == num_warps - 1) {
|
||||||
|
EP_DEVICE_ASSERT(num_sms > 1);
|
||||||
|
if (sm_id == 0) {
|
||||||
|
// The first SM is also responsible for checking QPs
|
||||||
|
EP_DEVICE_ASSERT(ibgda_get_state()->num_rc_per_pe >= num_local_experts);
|
||||||
|
|
||||||
|
// The first SM is also responsible for cleaning the next buffer
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = lane_id; i < num_next_clean_int; i += 32)
|
||||||
|
next_clean[i] = 0;
|
||||||
|
|
||||||
|
// Notify before executing `int_p`
|
||||||
|
__syncwarp();
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = lane_id; i < num_experts; i += 32)
|
||||||
|
atomic_add_release_global(atomic_finish_counter_per_expert + i, FINISHED_SUM_TAG);
|
||||||
|
}
|
||||||
|
|
||||||
|
// This SM should be responsible for some destination experts, read `topk_idx` for them
|
||||||
|
int expert_count[kNumMaxWarpGroups] = {0};
|
||||||
|
const auto expert_begin_idx = sm_id * num_warp_groups;
|
||||||
|
const auto expert_end_idx = min(expert_begin_idx + num_warp_groups, num_experts);
|
||||||
|
|
||||||
|
// Per lane count
|
||||||
|
#pragma unroll 8
|
||||||
|
for (int i = lane_id; i < num_tokens * num_topk; i += 32) {
|
||||||
|
auto idx = static_cast<int>(__ldg(topk_idx + i));
|
||||||
|
if (idx >= expert_begin_idx and idx < expert_end_idx)
|
||||||
|
expert_count[idx - expert_begin_idx] ++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warp reduce
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = expert_begin_idx; i < expert_end_idx; ++ i) {
|
||||||
|
auto sum = warp_reduce_sum(expert_count[i - expert_begin_idx]);
|
||||||
|
if (lane_id == 0) {
|
||||||
|
shared_num_tokens_sent_per_expert[i - expert_begin_idx] = sum;
|
||||||
|
atomic_add_release_global(atomic_finish_counter_per_expert + i, FINISHED_SUM_TAG - sum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// Issue count sends
|
||||||
|
if (responsible_expert_idx < num_experts and sub_warp_id == 0 and lane_id == 0) {
|
||||||
|
const auto dst_rank = responsible_expert_idx / num_local_experts;
|
||||||
|
const auto dst_expert_local_idx = responsible_expert_idx % num_local_experts;
|
||||||
|
const auto num_tokens_sent = shared_num_tokens_sent_per_expert[responsible_expert_idx - sm_id * num_warp_groups];
|
||||||
|
|
||||||
|
// Wait local sends issued and send expert counts
|
||||||
|
while (ld_acquire_global(atomic_finish_counter_per_expert + responsible_expert_idx) != FINISHED_SUM_TAG * 2);
|
||||||
|
auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_count + dst_expert_local_idx * num_ranks + rank);
|
||||||
|
auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
|
||||||
|
if (dst_p2p_ptr == 0) {
|
||||||
|
nvshmemi_ibgda_amo_nonfetch_add(reinterpret_cast<int*>(dst_ptr), -num_tokens_sent - 1, dst_rank, dst_expert_local_idx);
|
||||||
|
} else {
|
||||||
|
st_release_sys_global(reinterpret_cast<int*>(dst_p2p_ptr), -num_tokens_sent - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean workspace for next use
|
||||||
|
atomic_counter_per_expert[responsible_expert_idx] = 0;
|
||||||
|
atomic_finish_counter_per_expert[responsible_expert_idx] = 0;
|
||||||
|
|
||||||
|
// Clean `packed_recv_count`
|
||||||
|
if (dst_rank == 0)
|
||||||
|
packed_recv_count[dst_expert_local_idx] = 0;
|
||||||
|
}
|
||||||
|
__syncwarp();
|
||||||
|
|
||||||
|
// Receiving phase
|
||||||
|
LOW_LATENCY_DISPATCH_RECV:
|
||||||
|
if ((phases & LOW_LATENCY_RECV_PHASE) == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// For send-and-recv kernels, we need a grid sync for making `packed_recv_count` visible
|
||||||
|
if (phases & LOW_LATENCY_SEND_PHASE)
|
||||||
|
cg::this_grid().sync();
|
||||||
|
|
||||||
|
// Receiving and packing
|
||||||
|
if (responsible_expert_idx < num_experts) {
|
||||||
|
const auto src_rank = responsible_expert_idx / num_local_experts;
|
||||||
|
const auto local_expert_idx = responsible_expert_idx % num_local_experts;
|
||||||
|
const auto rdma_recv_x_uint8 = static_cast<uint8_t*>(rdma_recv_x) +
|
||||||
|
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
|
||||||
|
src_rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg;
|
||||||
|
const auto recv_x_int4 = static_cast<int4*>(packed_recv_x) +
|
||||||
|
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * hidden_int4;
|
||||||
|
const auto recv_src_info = packed_recv_src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank;
|
||||||
|
const auto recv_range = packed_recv_layout_range + local_expert_idx * num_ranks;
|
||||||
|
const auto num_aligned_scales = align<int>(num_scales, sizeof(float) / sizeof(scale_t));
|
||||||
|
const auto recv_x_scales = static_cast<scale_t*>(packed_recv_x_scales) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_aligned_scales;
|
||||||
|
|
||||||
|
// Shared between sub-warps in warp groups
|
||||||
|
__shared__ int shared_num_recv_tokens[kNumMaxWarpGroups], shared_recv_token_begin_idx[kNumMaxWarpGroups];

// Wait for tokens to arrive
// NOTES: using sub-warp 1 to overlap with sub-warp 0
int num_recv_tokens, recv_token_begin_idx;
EP_DEVICE_ASSERT(num_warps_per_group > 1 and num_warp_groups < 15);
if (sub_warp_id == 1 and lane_id == 0) {
while ((num_recv_tokens = ld_acquire_sys_global(rdma_recv_count + local_expert_idx * num_ranks + src_rank)) == 0);
num_recv_tokens = -num_recv_tokens - 1;
recv_token_begin_idx = atomicAdd(packed_recv_count + local_expert_idx, num_recv_tokens);
shared_num_recv_tokens[warp_group_id] = num_recv_tokens;
shared_recv_token_begin_idx[warp_group_id] = recv_token_begin_idx;
recv_range[src_rank] = pack2<int, int64_t>(num_recv_tokens, recv_token_begin_idx);
if (cumulative_local_expert_recv_stats != nullptr)
atomicAdd(cumulative_local_expert_recv_stats + local_expert_idx, num_recv_tokens);
}
asm volatile("bar.sync %0, %1;" :: "r"(warp_group_id + 2), "r"(num_warps_per_group * 32));
num_recv_tokens = shared_num_recv_tokens[warp_group_id];
recv_token_begin_idx = shared_recv_token_begin_idx[warp_group_id];

// Copy tokens
EP_DEVICE_ASSERT(num_scales <= 64);
for (int i = sub_warp_id; i < num_recv_tokens; i += num_warps_per_group) {
// Copy source info
const auto src_src_idx = reinterpret_cast<int*>(rdma_recv_x_uint8 + i * num_bytes_per_msg);
if (lane_id == 0)
recv_src_info[recv_token_begin_idx + i] = ld_nc_global(src_src_idx);
__syncwarp();

// Copy data
// NOTES: only 2 load iterations for 7K hidden with 7 unrolls
const auto src_data = reinterpret_cast<int4*>(reinterpret_cast<uint8_t*>(src_src_idx) + sizeof(int4));
const auto dst_data = recv_x_int4 + (recv_token_begin_idx + i) * hidden_int4;
UNROLLED_WARP_COPY(7, lane_id, hidden_int4, dst_data, src_data, ld_nc_global, st_na_global);

// Copy scales
if constexpr (kUseFP8) {
// Equivalent CuTe layout:
// (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1))
const auto src_scales = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(src_data) + hidden_bytes);
const auto num_elems_per_pack = static_cast<int>(sizeof(packed_t) / sizeof(scale_t));
const auto token_idx = recv_token_begin_idx + i;
const auto token_stride = num_elems_per_pack;
const auto pack_stride = num_ranks * num_max_dispatch_tokens_per_rank * num_elems_per_pack;
if (lane_id < num_scales) {
const auto pack_idx = lane_id / num_elems_per_pack;
const auto elem_idx = lane_id % num_elems_per_pack;
auto scale = extract_required_scale_format<kUseUE8M0>(ld_nc_global(src_scales + lane_id));
recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
}
if (lane_id + 32 < num_scales) {
const auto pack_idx = (lane_id + 32) / num_elems_per_pack;
const auto elem_idx = (lane_id + 32) % num_elems_per_pack;
auto scale = extract_required_scale_format<kUseUE8M0>(ld_nc_global(src_scales + lane_id + 32));
recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
}
}
}
}
}

void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
int* packed_recv_src_info, int64_t* packed_recv_layout_range,
int* packed_recv_count,
int* cumulative_local_expert_recv_stats,
void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
const void* x, const int64_t* topk_idx,
int* next_clean, int num_next_clean_int,
int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
bool use_fp8, bool round_scale, bool use_ue8m0,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases) {
constexpr int kNumMaxTopK = 9;
const int num_warp_groups = ceil_div(num_experts, num_device_sms);
const int num_warps_per_group = 32 / num_warp_groups;
EP_HOST_ASSERT(num_warp_groups > 0 and num_warps_per_group > 0);
EP_HOST_ASSERT(kNumMaxTopK + 1 <= num_warp_groups * num_warps_per_group);

const auto num_warps = num_warp_groups * num_warps_per_group;
const auto num_sms = ceil_div(num_experts, num_warp_groups);
EP_HOST_ASSERT(num_topk <= kNumMaxTopK);

// Workspace checks
auto atomic_counter_per_expert = static_cast<int*>(workspace);
auto atomic_finish_counter_per_expert = atomic_counter_per_expert + num_experts;
EP_HOST_ASSERT(num_experts * sizeof(int) * 2 <= NUM_WORKSPACE_BYTES);

// FP8 checks
if (use_ue8m0)
EP_HOST_ASSERT(round_scale and "UE8M0 SF requires `round_scale=True`");

#define DISPATCH_LAUNCH_CASE(hidden) { \
auto dispatch_func = dispatch<false, false, hidden>; \
if (use_fp8 and not use_ue8m0) \
dispatch_func = dispatch<true, false, hidden>; \
if (use_fp8 and use_ue8m0) \
dispatch_func = dispatch<true, true, hidden>; \
LAUNCH_KERNEL(&cfg, dispatch_func, \
packed_recv_x, packed_recv_x_scales, \
packed_recv_src_info, packed_recv_layout_range, \
packed_recv_count, \
cumulative_local_expert_recv_stats, \
rdma_recv_x, rdma_recv_count, rdma_x, \
x, topk_idx, \
atomic_counter_per_expert, atomic_finish_counter_per_expert, \
next_clean, num_next_clean_int, \
num_tokens, num_max_dispatch_tokens_per_rank, \
num_topk, num_experts, rank, num_ranks, \
num_warp_groups, num_warps_per_group, \
round_scale, phases); } break

SETUP_LAUNCH_CONFIG(num_sms, num_warps * 32, stream);
SWITCH_HIDDEN(DISPATCH_LAUNCH_CASE);
#undef DISPATCH_LAUNCH_CASE
}

template <int kHidden, int kNumMaxTopk>
__global__ __launch_bounds__(1024, 1) void
combine(void* combined_x,
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
const void* x, const int64_t* topk_idx, const float* topk_weights,
const int* src_info, const int64_t* layout_range,
int* next_clean, int num_next_clean_int,
int* atomic_clean_flag,
int num_combined_tokens, int hidden, int num_topk,
int num_max_dispatch_tokens_per_rank,
int num_experts, int rank, int num_ranks,
int num_warp_groups, int num_warps_per_group,
int phases, bool zero_copy) {
const auto sm_id = static_cast<int>(blockIdx.x);
const auto num_sms = static_cast<int>(gridDim.x);
const auto thread_id = static_cast<int>(threadIdx.x);
const auto num_threads = static_cast<int>(blockDim.x);
const auto warp_id = thread_id / 32, lane_id = get_lane_id();
const auto num_local_experts = num_experts / num_ranks;
const auto warp_group_id = warp_id / num_warps_per_group;
const auto sub_warp_id = warp_id % num_warps_per_group;
const auto responsible_expert_idx = sm_id * num_warp_groups + warp_group_id;

// Data type stuff
constexpr int kNumElemsPerInt4 = sizeof(int4) / sizeof(nv_bfloat16);
const size_t hidden_bf16_int4 = kHidden / kNumElemsPerInt4;

// Message package
constexpr size_t num_bytes_per_slot = kHidden * sizeof(nv_bfloat16);
EP_STATIC_ASSERT(num_bytes_per_slot % sizeof(int4) == 0, "Invalid vectorization");

// Sending phase
if ((phases & LOW_LATENCY_SEND_PHASE) == 0)
goto LOW_LATENCY_COMBINE_RECV;

// Clean up next buffer
if (sm_id == 0 and warp_group_id == 0 and sub_warp_id == 0) {
#pragma unroll
for (int i = lane_id; i < num_next_clean_int; i += 32)
next_clean[i] = 0;

// Notify before executing `int_p`
__syncwarp();
if (lane_id == 0)
atomic_add_release_global(atomic_clean_flag, num_experts);
}

// Issue IBGDA sends
if (responsible_expert_idx < num_experts) {
const auto dst_rank = responsible_expert_idx / num_local_experts;
const auto local_expert_idx = responsible_expert_idx % num_local_experts;
const auto global_expert_idx = rank * num_local_experts + local_expert_idx;
const auto layout = __ldg(layout_range + local_expert_idx * num_ranks + dst_rank);
const auto local_x = static_cast<const int4*>(x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * hidden_bf16_int4;
const auto local_src_info = src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank;
const auto rdma_send_x_vec = static_cast<uint8_t*>(rdma_send_x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_slot;

// Unpack layout
int offset, num_tokens_to_send;
unpack2(layout, num_tokens_to_send, offset);

// Issue IBGDA send
for (int token_idx = offset + sub_warp_id; token_idx < offset + num_tokens_to_send; token_idx += num_warps_per_group) {
const auto x_int4 = local_x + token_idx * hidden_bf16_int4;
const auto rdma_send_type_row = reinterpret_cast<int*>(rdma_send_x_vec + token_idx * num_bytes_per_slot);
const auto rdma_send_x_vec_row = reinterpret_cast<uint8_t*>(rdma_send_type_row);

// Copy directly to local rank, or copy to buffer and issue RDMA
auto src_idx = __ldg(local_src_info + token_idx);
const auto buf_ptr = reinterpret_cast<int64_t>(rdma_send_x_vec_row);
const auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_x) + (global_expert_idx * num_max_dispatch_tokens_per_rank + src_idx) * num_bytes_per_slot;
const auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
if (dst_p2p_ptr == 0) {
const auto buf_int4_ptr = reinterpret_cast<int4*>(buf_ptr);
if (not zero_copy)
UNROLLED_WARP_COPY(7, lane_id, hidden_bf16_int4, buf_int4_ptr, x_int4, ld_nc_global, st_na_global);
nvshmemi_ibgda_put_nbi_warp(dst_ptr, buf_ptr, hidden * sizeof(nv_bfloat16), dst_rank, local_expert_idx, lane_id, token_idx - offset);
} else {
const auto dst_int4_ptr = reinterpret_cast<int4*>(dst_p2p_ptr);
UNROLLED_WARP_COPY(7, lane_id, hidden_bf16_int4, dst_int4_ptr, x_int4, ld_nc_global, st_na_global);
}
}

// Put the finishing flag
EP_DEVICE_ASSERT(num_warps_per_group > 1 and num_warp_groups < 16);
asm volatile("bar.sync %0, %1;" :: "r"(warp_group_id + 1), "r"(num_warps_per_group * 32));
if (sub_warp_id == 1 and lane_id == 0) {
while (ld_acquire_global(atomic_clean_flag) == 0);
auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_flag + global_expert_idx);
auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
if (dst_p2p_ptr == 0) {
nvshmemi_ibgda_amo_nonfetch_add(reinterpret_cast<int*>(dst_ptr), 1, dst_rank, local_expert_idx);
} else {
st_release_sys_global(reinterpret_cast<int*>(dst_p2p_ptr), 1);
}
atomic_add_release_global(atomic_clean_flag, -1);
}
__syncwarp();
}

// Receiving phase
LOW_LATENCY_COMBINE_RECV:
if ((phases & LOW_LATENCY_RECV_PHASE) == 0)
return;

// Wait for all ranks to arrive
if (responsible_expert_idx < num_experts) {
EP_DEVICE_ASSERT(num_warps_per_group > 1);
if (sub_warp_id == 0 and lane_id == 0) {
while (ld_acquire_sys_global(rdma_recv_flag + responsible_expert_idx) == 0);
}
}
cg::this_grid().sync();

// Reduce tokens
EP_DEVICE_ASSERT(num_topk <= 32 and hidden_bf16_int4 <= num_threads);
EP_STATIC_ASSERT(kHidden % (32 * kNumElemsPerInt4) == 0, "Invalid vectorization");
if (thread_id < hidden_bf16_int4) {
for (int token_idx = sm_id; token_idx < num_combined_tokens; token_idx += num_sms) {
// Read top-k indices and weights
int reg_topk_idx[kNumMaxTopk];
float reg_topk_weights[kNumMaxTopk];
#pragma unroll
for (int i = 0; i < num_topk; ++ i) {
reg_topk_idx[i] = static_cast<int>(__ldg(topk_idx + token_idx * num_topk + i));
reg_topk_weights[i] = __ldg(topk_weights + token_idx * num_topk + i);
}

float combined_values[kNumElemsPerInt4] = {0.0f};
#pragma unroll
for (int i = 0; i < num_topk; ++ i) if (reg_topk_idx[i] >= 0) {
// Read from sources
auto rdma_buffer_type = reinterpret_cast<const int*>(static_cast<uint8_t*>(rdma_recv_x) + (reg_topk_idx[i] * num_max_dispatch_tokens_per_rank + token_idx) * num_bytes_per_slot);
auto rdma_buffer_row = reinterpret_cast<const uint8_t*>(rdma_buffer_type);

// Reduce
auto x_vec = ld_nc_global(reinterpret_cast<const int4*>(rdma_buffer_row) + thread_id);
const auto x_bf16 = reinterpret_cast<nv_bfloat16*>(&x_vec);
#pragma unroll
for (int j = 0; j < kNumElemsPerInt4; ++ j)
combined_values[j] += static_cast<float>(x_bf16[j]) * reg_topk_weights[i];
}

// Write results
int4& combined_int4 = *reinterpret_cast<int4*>(combined_values);
auto combined_bf16 = reinterpret_cast<nv_bfloat16*>(&combined_values);
#pragma unroll
for (int j = 0; j < kNumElemsPerInt4; ++ j)
combined_bf16[j] = static_cast<nv_bfloat16>(combined_values[j]);
(static_cast<int4*>(combined_x) + token_idx * hidden_bf16_int4)[thread_id] = combined_int4;
}
}
}

void combine(void* combined_x,
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
const void* x, const int64_t* topk_idx, const float* topk_weights,
const int* src_info, const int64_t* layout_range,
int* next_clean, int num_next_clean_int,
int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases, bool zero_copy) {
constexpr int kNumMaxTopk = 9;
const int num_warp_groups = ceil_div(num_experts, num_device_sms);
const int num_warps_per_group = 32 / num_warp_groups;
EP_HOST_ASSERT(num_warp_groups > 0 and num_warps_per_group > 0);

const auto num_warps = num_warp_groups * num_warps_per_group;
const auto num_sms = ceil_div(num_experts, num_warp_groups);

// Check workspace
auto atomic_clean_flag = static_cast<int*>(workspace);
EP_HOST_ASSERT(sizeof(int) <= NUM_WORKSPACE_BYTES);
EP_HOST_ASSERT(num_topk <= kNumMaxTopk);

#define COMBINE_LAUNCH_CASE(hidden) { \
auto combine_func = combine<hidden, kNumMaxTopk>; \
LAUNCH_KERNEL(&cfg, combine_func, \
combined_x, \
rdma_recv_x, rdma_recv_flag, rdma_send_x, \
x, topk_idx, topk_weights, src_info, layout_range, \
next_clean, num_next_clean_int, \
atomic_clean_flag, \
num_combined_tokens, hidden, num_topk, \
num_max_dispatch_tokens_per_rank, \
num_experts, rank, num_ranks, \
num_warp_groups, num_warps_per_group, \
phases, zero_copy); } break

SETUP_LAUNCH_CONFIG(num_sms, num_warps * 32, stream);
SWITCH_HIDDEN(COMBINE_LAUNCH_CASE);
#undef COMBINE_LAUNCH_CASE
}

} // namespace internode_ll

} // namespace deep_ep
@ -0,0 +1,935 @@
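// NOTES: intranode kernels exchange data between ranks of a single node through peer-accessible
// buffers (`buffer_ptrs`) and synchronize with `barrier_block` on `barrier_signal_ptrs`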
#include "configs.cuh"
#include "buffer.cuh"
#include "exception.cuh"
#include "launch.cuh"
#include "utils.cuh"

namespace deep_ep {

namespace intranode {

template<int kNumRanks>
__global__ void
notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
int num_tokens, int num_channels, const bool* is_token_in_rank, int* channel_prefix_matrix,
int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank) {
auto sm_id = static_cast<int>(blockIdx.x);
auto thread_id = static_cast<int>(threadIdx.x), num_threads = static_cast<int>(blockDim.x);
auto lane_id = thread_id % 32, warp_id = thread_id / 32, num_warps = num_threads / 32;

if (sm_id == 0) {
// Barrier first
barrier_block<kNumRanks, true>(barrier_signal_ptrs, rank);

int *per_rank_buffer, *per_expert_buffer;
if (thread_id < kNumRanks) {
per_rank_buffer = static_cast<int*>(buffer_ptrs[thread_id]);
per_expert_buffer = per_rank_buffer + kNumRanks * kNumRanks;
}

// After this loop:
// - `per_rank_buffer[rank][i, j]` means the number of tokens from rank i to rank j
// - `per_expert_buffer[rank][i, j]` means the number of tokens from rank i to local expert j
int num_experts_per_rank = num_experts / kNumRanks;
if (thread_id < kNumRanks) {
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i)
per_rank_buffer[rank * kNumRanks + i] = num_tokens_per_rank[i];
#pragma unroll
for (int i = 0; i < num_experts_per_rank; ++ i)
per_expert_buffer[rank * num_experts_per_rank + i] = num_tokens_per_expert[thread_id * num_experts_per_rank + i];
}

// Wait for all ranks to be finished
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);

// Sum per-rank counts and return to CPU
// Also pre-compute the prefix sum for data sending
auto local_per_rank_buffer = static_cast<int*>(buffer_ptrs[rank]);
if (thread_id < kNumRanks) {
#pragma unroll
for (int i = 1; i < kNumRanks; ++ i)
local_per_rank_buffer[i * kNumRanks + thread_id] += local_per_rank_buffer[(i - 1) * kNumRanks + thread_id];
if (thread_id == rank)
*moe_recv_counter_mapped = local_per_rank_buffer[(kNumRanks - 1) * kNumRanks + rank];
}

// Sum per-expert counts and return to CPU
auto local_per_expert_buffer = local_per_rank_buffer + kNumRanks * kNumRanks;
if (thread_id < num_experts_per_rank) {
int sum = 0;
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i)
sum += local_per_expert_buffer[i * num_experts_per_rank + thread_id];
sum = (sum + expert_alignment - 1) / expert_alignment * expert_alignment;
moe_recv_expert_counter_mapped[thread_id] = sum;
}
__syncthreads();

// Copy rank size prefix matrix to another tensor
#pragma unroll
for (int i = thread_id; i < kNumRanks * kNumRanks; i += num_threads)
rank_prefix_matrix_copy[i] = local_per_rank_buffer[i];

// Extra memset for later communication queue
#pragma unroll
for (int i = thread_id; i < num_memset_int; i += num_threads)
local_per_expert_buffer[i] = 0;

// Barrier
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
} else {
int dst_rank = sm_id - 1;
for (int channel_id = warp_id; channel_id < num_channels; channel_id += num_warps) {
int token_start_idx, token_end_idx;
get_channel_task_range(num_tokens, num_channels, channel_id, token_start_idx, token_end_idx);

// Iterate over tokens
int count = 0;
for (int64_t i = token_start_idx + lane_id; i < token_end_idx; i += 32)
count += is_token_in_rank[i * kNumRanks + dst_rank];
count = warp_reduce_sum(count);
if (lane_id == 0)
channel_prefix_matrix[dst_rank * num_channels + channel_id] = count;
}
__syncthreads();

// Pre-compute prefix sum for all channels
if (thread_id == 0) {
#pragma unroll
for (int i = 1; i < num_channels; ++ i)
channel_prefix_matrix[dst_rank * num_channels + i] += channel_prefix_matrix[dst_rank * num_channels + i - 1];
}
}
}

void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
int num_tokens, const bool* is_token_in_rank, int* channel_prefix_matrix,
int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank,
cudaStream_t stream, int num_channels) {
#define NOTIFY_DISPATCH_LAUNCH_CASE(ranks) \
LAUNCH_KERNEL(&cfg, notify_dispatch<ranks>, \
num_tokens_per_rank, moe_recv_counter_mapped, \
num_tokens_per_expert, moe_recv_expert_counter_mapped, num_experts, \
num_tokens, num_channels, is_token_in_rank, channel_prefix_matrix, \
rank_prefix_matrix_copy, num_memset_int, expert_alignment, \
buffer_ptrs, barrier_signal_ptrs, rank); \
break

constexpr int kNumThreads = 128;
EP_HOST_ASSERT(num_experts % num_ranks == 0);
EP_HOST_ASSERT(num_experts / num_ranks <= kNumThreads and num_ranks <= kNumThreads);

SETUP_LAUNCH_CONFIG(1 + num_ranks, kNumThreads, stream);
SWITCH_RANKS(NOTIFY_DISPATCH_LAUNCH_CASE);
#undef NOTIFY_DISPATCH_LAUNCH_CASE
}

template<int kNumRanks>
__global__ void
cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank) {
// A simplified version for cached handles
barrier_block<kNumRanks, true>(barrier_signal_ptrs, rank);

// Copy and clean
auto thread_id = static_cast<int>(threadIdx.x), num_threads = static_cast<int>(blockDim.x);
auto ptr = static_cast<int*>(buffer_ptrs[rank]);
#pragma unroll
for (int i = thread_id; i < kNumRanks * kNumRanks; i += num_threads)
ptr[i] = rank_prefix_matrix[i];
#pragma unroll
for (int i = thread_id; i < num_memset_int; i += num_threads)
ptr[kNumRanks * kNumRanks + i] = 0;

// Barrier after cleaning
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
}

void cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
void** buffer_ptrs, int** barrier_signal_ptrs,
int rank, int num_ranks, cudaStream_t stream) {
#define CACHED_NOTIFY_DISPATCH_LAUNCH_CASE(ranks) \
LAUNCH_KERNEL(&cfg, cached_notify_dispatch<ranks>, \
rank_prefix_matrix, num_memset_int, buffer_ptrs, barrier_signal_ptrs, rank); \
break

SETUP_LAUNCH_CONFIG(1, 128, stream);
SWITCH_RANKS(CACHED_NOTIFY_DISPATCH_LAUNCH_CASE);
#undef CACHED_NOTIFY_DISPATCH_LAUNCH_CASE
}
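
// NOTES: `kNumTMABytesPerWarp` bounds the per-warp shared-memory staging area used by the TMA path
// below (checked by the device assert on `half_hidden_bytes + sizeof(uint64_t)`)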
template <int kNumRanks, int kNumThreads, int kNumTMABytesPerWarp>
__global__ void __launch_bounds__(kNumThreads, 1)
dispatch(int4* recv_x, float* recv_x_scales, int* recv_src_idx, int64_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
int* send_head, const int4* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
const bool* is_token_in_rank, const int* channel_prefix_matrix,
int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
int scale_token_stride, int scale_hidden_stride,
void** buffer_ptrs, int rank,
int num_max_send_tokens, int num_recv_buffer_tokens) {
const auto num_sms = static_cast<int>(gridDim.x), sm_id = static_cast<int>(blockIdx.x);
const auto thread_id = static_cast<int>(threadIdx.x), lane_id = get_lane_id();
const bool is_sender = sm_id % 2 == 0;
EP_DEVICE_ASSERT(num_sms % 2 == 0);

// Several warps are responsible for a single rank
const auto num_threads_per_rank = kNumThreads / kNumRanks;
const auto num_channels = num_sms / 2;
const auto responsible_rank = (static_cast<int>(thread_id)) / num_threads_per_rank;
// Even-numbered blocks for sending, odd-numbered blocks for receiving.
const auto responsible_channel = sm_id / 2;

int num_experts_per_rank = num_experts / kNumRanks;
EP_DEVICE_ASSERT(num_experts_per_rank > 0 or num_topk == 0);
EP_DEVICE_ASSERT(num_topk <= 32);
EP_DEVICE_ASSERT((topk_idx == nullptr) == (topk_weights == nullptr));
EP_DEVICE_ASSERT((recv_topk_idx == nullptr) == (recv_topk_weights == nullptr));

// Calculate pointers by the specific layout
// `rank_prefix_matrix`: kNumRanks * kNumRanks * sizeof(int)
auto ptr = reinterpret_cast<void*>(static_cast<int8_t*>(buffer_ptrs[is_sender ? responsible_rank : rank]) + kNumRanks * kNumRanks * sizeof(int));
int target_rank = is_sender ? rank : responsible_rank;
auto num_channels_total = num_channels * kNumRanks;
auto channel_rank_offset = responsible_channel * kNumRanks + target_rank;

// Channel buffer metadata
// Senders are responsible for tails, and receivers are responsible for heads
// Stored on the receiver side
// The retired signals are actually boolean flags, but to align with 16 bytes, we make it `int64_t`
// `start_offset`: kNumChannels * kNumRanks * sizeof(int)
// `end_offset`: kNumChannels * kNumRanks * sizeof(int)
// `head_idx`: kNumChannels * kNumRanks * sizeof(int)
// `tail_idx`: kNumChannels * kNumRanks * sizeof(int)
auto channel_start_offset = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_end_offset = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_head_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_tail_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);

// Channel data buffers, stored on the receiver side
// `x_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * hidden_int4 * sizeof(int4)
// `src_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * sizeof(int)
// `topk_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(int64_t)
// `topk_weights_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(float)
// `x_scales_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_scales * sizeof(float)
auto channel_x_buffers = Buffer<int4>(ptr, num_channels_total * num_recv_buffer_tokens * hidden_int4, channel_rank_offset * num_recv_buffer_tokens * hidden_int4);
auto channel_src_idx_buffers = Buffer<int>(ptr, num_channels_total * num_recv_buffer_tokens, channel_rank_offset * num_recv_buffer_tokens);
auto channel_topk_idx_buffers = Buffer<int64_t>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
auto channel_topk_weights_buffers = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
auto channel_x_scales_buffers = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_scales, channel_rank_offset * num_recv_buffer_tokens * num_scales);

// TMA stuffs
#ifndef DISABLE_SM90_FEATURES
extern __shared__ __align__(1024) uint8_t smem_buffer[];
auto half_hidden_int4 = hidden_int4 / 2;
auto half_hidden_bytes = half_hidden_int4 * static_cast<int>(sizeof(int4));
auto tma_buffer = smem_buffer + (thread_id / 32) * kNumTMABytesPerWarp;
auto tma_mbarrier = reinterpret_cast<uint64_t*>(tma_buffer + half_hidden_bytes);
uint32_t tma_phase = 0;
if (lane_id == 0) {
mbarrier_init(tma_mbarrier, 1);
fence_view_async_shared();
fence_barrier_init();
EP_DEVICE_ASSERT(hidden_int4 % 2 == 0 and half_hidden_bytes + sizeof(uint64_t) <= kNumTMABytesPerWarp);
}
__syncwarp();
#endif

if (is_sender) {
// Workers for sending
constexpr int num_send_warps = kNumThreads / 32;
constexpr int num_send_warps_per_rank = num_send_warps / kNumRanks;
const auto send_thread_id = thread_id;
const auto send_warp_id_in_rank = send_thread_id % num_threads_per_rank / 32;
EP_DEVICE_ASSERT(kNumRanks <= 32);
EP_DEVICE_ASSERT(num_send_warps % kNumRanks == 0);

// Send offset by `-value - 1`, e.g. 0 -> -1, 1 -> -2
// NOTES: this is for distinguishing zero tokens
if (lane_id == 0 and send_warp_id_in_rank == 0) {
int value = responsible_channel > 0 ? channel_prefix_matrix[responsible_rank * num_channels + responsible_channel - 1] : 0;
st_relaxed_sys_global(channel_start_offset.buffer(), -value - 1);
value = channel_prefix_matrix[responsible_rank * num_channels + responsible_channel];
st_relaxed_sys_global(channel_end_offset.buffer(), -value - 1);
}
__syncwarp();

// Get tasks
int token_start_idx, token_end_idx;
get_channel_task_range(num_tokens, num_channels, responsible_channel, token_start_idx, token_end_idx);

// Iterate over all tokens and send by chunks
int cached_channel_tail_idx = 0;
for (int64_t token_idx = token_start_idx; token_idx < token_end_idx; ) {
// Check destination queue emptiness, or wait for a buffer to be released (rare cases)
// NOTES: the head index received by different warps may not be the same
auto start_time = clock64();
while (lane_id == 0) {
// NOTES: we only consider the worst case, because counting the real numbers is time-consuming
int num_used_slots = cached_channel_tail_idx - ld_volatile_global(channel_head_idx.buffer());
if (num_recv_buffer_tokens - num_used_slots >= num_max_send_tokens)
break;

// Rare cases to loop again
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for dispatch senders, rank %d, responsible_channel = %d\n", rank, responsible_channel);
trap();
}
}
__syncwarp();

int chunk_token_idx = 0;
while (chunk_token_idx < num_max_send_tokens and token_idx < token_end_idx) {
// NOTES: for the same token, the warp assigned to save `send_head` may be different from the warp assigned to send the following data
if (lane_id == 0 and token_idx % num_send_warps_per_rank == send_warp_id_in_rank)
send_head[token_idx * kNumRanks + responsible_rank] = is_token_in_rank[token_idx * kNumRanks + responsible_rank] ? cached_channel_tail_idx : -1;

// Skip if not selected
if (not is_token_in_rank[token_idx * kNumRanks + responsible_rank]) {
token_idx ++;
continue;
}

// Get an empty slot
int dst_slot_idx = (cached_channel_tail_idx ++) % num_recv_buffer_tokens;
if (cached_channel_tail_idx % num_send_warps_per_rank == send_warp_id_in_rank) {
// Copy data
auto shifted_channel_x_buffers = channel_x_buffers.buffer() + dst_slot_idx * hidden_int4;
auto shifted_x = x + token_idx * hidden_int4;
UNROLLED_WARP_COPY(5, lane_id, hidden_int4, shifted_channel_x_buffers, shifted_x, __ldg, st_na_global);

// Copy source index
if (lane_id == 0)
channel_src_idx_buffers[dst_slot_idx] = static_cast<int>(token_idx);

// Copy `topk_idx` and `topk_weights` with transformed index
if (lane_id < num_topk) {
// Top-k index
int recv_expert_begin = responsible_rank * num_experts_per_rank, recv_expert_end = (responsible_rank + 1) * num_experts_per_rank;
auto idx_value = __ldg(topk_idx + token_idx * num_topk + lane_id);
idx_value = (idx_value >= recv_expert_begin and idx_value < recv_expert_end) ? idx_value - recv_expert_begin : -1;
channel_topk_idx_buffers[dst_slot_idx * num_topk + lane_id] = idx_value;

// Top-k weights
auto weight_value = __ldg(topk_weights + token_idx * num_topk + lane_id);
weight_value = (idx_value >= 0) ? weight_value : 0.0f;
channel_topk_weights_buffers[dst_slot_idx * num_topk + lane_id] = weight_value;
}

// Copy `x_scales`
#pragma unroll
for (int i = lane_id; i < num_scales; i += 32) {
auto offset = token_idx * scale_token_stride + i * scale_hidden_stride;
channel_x_scales_buffers[dst_slot_idx * num_scales + i] = __ldg(x_scales + offset);
}
}

// Move token index
chunk_token_idx ++, token_idx ++;
}

// Move tail index
// NOTES: here all warps should share the same new tail
asm volatile("bar.sync %0, %1;" :: "r"(responsible_rank), "r"(num_threads_per_rank));
if (send_warp_id_in_rank == 0 and lane_id == 0)
st_release_sys_global(channel_tail_idx.buffer(), cached_channel_tail_idx);
}
} else {
// Workers for receiving and copying into buffer
constexpr int num_recv_warps = kNumThreads / 32;
constexpr int num_recv_warps_per_rank = num_recv_warps / kNumRanks;
const auto recv_thread_id = thread_id;
const auto recv_thread_id_in_rank = recv_thread_id % num_threads_per_rank;
const auto recv_warp_id_in_rank = recv_thread_id_in_rank / 32;
EP_DEVICE_ASSERT(kNumRanks <= 32);
EP_DEVICE_ASSERT(recv_thread_id >= 0 and num_recv_warps % kNumRanks == 0);

// Calculate offset first
auto rank_prefix_matrix = static_cast<int*>(buffer_ptrs[rank]);
int rank_offset = responsible_rank > 0 ? rank_prefix_matrix[(responsible_rank - 1) * kNumRanks + rank] : 0;

// Receive channel offset
int total_offset, num_tokens_to_recv;
while (lane_id == 0 and (total_offset = ld_volatile_global(channel_start_offset.buffer())) == 0);
while (lane_id == 0 and (num_tokens_to_recv = ld_volatile_global(channel_end_offset.buffer())) == 0);
if (lane_id == 0) {
total_offset = -total_offset - 1, num_tokens_to_recv = -num_tokens_to_recv - 1;
if (recv_warp_id_in_rank == 0)
recv_channel_offset[responsible_rank * num_channels + responsible_channel] = total_offset;
num_tokens_to_recv -= total_offset;
}
total_offset = __shfl_sync(0xffffffff, total_offset, 0);
total_offset += rank_offset;
num_tokens_to_recv = __shfl_sync(0xffffffff, num_tokens_to_recv, 0);

// Shared tail indices for different warps
__shared__ volatile int shared_channel_tail_idx[kNumRanks];

auto start_time = clock64();
int cached_channel_head_idx = 0, cached_channel_tail_idx = 0;
while (num_tokens_to_recv > 0) {
// NOTES: unlike the sender, the receiver must ensure that the tail indices held by different warps are the same
while (recv_thread_id_in_rank == 0) {
cached_channel_tail_idx = ld_acquire_sys_global(channel_tail_idx.buffer());

// Ready to copy
if (cached_channel_head_idx != cached_channel_tail_idx) {
shared_channel_tail_idx[responsible_rank] = cached_channel_tail_idx;
break;
}

// Timeout check
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for dispatch receivers, rank %d, responsible_channel = %d, tokens remained: %d\n", rank, responsible_channel, num_tokens_to_recv);
trap();
}
}

// Synchronize queue tail
asm volatile("bar.sync %0, %1;" :: "r"(responsible_rank), "r"(num_threads_per_rank));
cached_channel_tail_idx = shared_channel_tail_idx[responsible_rank];

// Copy data
int num_recv_tokens = cached_channel_tail_idx - cached_channel_head_idx;
for (int chunk_idx = recv_warp_id_in_rank; chunk_idx < num_recv_tokens; chunk_idx += num_recv_warps_per_rank) {
int token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens;
auto shifted_buffer_x_int4 = channel_x_buffers.buffer() + token_idx_in_buffer * hidden_int4;
auto shifted_recv_x_int4 = recv_x + static_cast<int64_t>(total_offset + chunk_idx) * hidden_int4;
#ifndef DISABLE_SM90_FEATURES
#pragma unroll
for (int i = 0; i < 2; ++ i) if (lane_id == 0) {
tma_store_wait();
tma_load_1d(tma_buffer, shifted_buffer_x_int4 + i * half_hidden_int4, tma_mbarrier, half_hidden_bytes);
mbarrier_arrive_and_expect_tx(tma_mbarrier, half_hidden_bytes);
mbarrier_wait(tma_mbarrier, tma_phase);
tma_store_1d(tma_buffer, shifted_recv_x_int4 + i * half_hidden_int4, half_hidden_bytes, false);
}
__syncwarp();
#else
UNROLLED_WARP_COPY(5, lane_id, hidden_int4, shifted_recv_x_int4, shifted_buffer_x_int4,
ld_nc_global, st_na_global);
#endif
}

// Copy `src_idx`
#pragma unroll 4
for (int chunk_idx = cached_channel_head_idx + recv_thread_id_in_rank; chunk_idx < cached_channel_tail_idx; chunk_idx += 32 * num_recv_warps_per_rank)
recv_src_idx[total_offset + chunk_idx - cached_channel_head_idx] = ld_nc_global(channel_src_idx_buffers.buffer() + chunk_idx % num_recv_buffer_tokens);

// Copy `topk_idx` and `topk_weights`
#pragma unroll 4
for (int idx = recv_thread_id_in_rank; idx < num_recv_tokens * num_topk; idx += 32 * num_recv_warps_per_rank) {
int chunk_idx = idx / num_topk, token_topk_idx = idx % num_topk;
int token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens;
auto recv_idx = static_cast<int64_t>(total_offset + chunk_idx) * num_topk + token_topk_idx;
auto buffer_idx = token_idx_in_buffer * num_topk + token_topk_idx;
recv_topk_idx[recv_idx] = ld_nc_global(channel_topk_idx_buffers.buffer() + buffer_idx);
recv_topk_weights[recv_idx] = ld_nc_global(channel_topk_weights_buffers.buffer() + buffer_idx);
}

// Copy `x_scales`
#pragma unroll 4
for (int i = recv_thread_id_in_rank; i < num_recv_tokens * num_scales; i += 32 * num_recv_warps_per_rank) {
int chunk_idx = i / num_scales, scales_idx = i % num_scales;
int token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens;
recv_x_scales[static_cast<int64_t>(total_offset + chunk_idx) * num_scales + scales_idx] =
ld_nc_global(channel_x_scales_buffers.buffer() + token_idx_in_buffer * num_scales + scales_idx);
}

// Move queue
cached_channel_head_idx += num_recv_tokens;
total_offset += num_recv_tokens;
asm volatile("bar.sync %0, %1;" :: "r"(responsible_rank), "r"(num_threads_per_rank));
if (recv_warp_id_in_rank == num_recv_warps_per_rank - 1 and lane_id == 0)
st_relaxed_sys_global(channel_head_idx.buffer(), cached_channel_head_idx);

// Exit
num_tokens_to_recv -= num_recv_tokens;
}

// Make TMA store visible to the next kernel
#ifndef DISABLE_SM90_FEATURES
if (lane_id == 0)
tma_store_wait();
#endif
}

// Clean unused `recv_topk_idx` as -1
if (num_worst_tokens > 0) {
auto rank_prefix_matrix = static_cast<int*>(buffer_ptrs[rank]);
const auto num_recv_tokens = rank_prefix_matrix[(kNumRanks - 1) * kNumRanks + rank];
const auto clean_start = num_recv_tokens * num_topk + sm_id * kNumThreads;
const auto clean_end = num_worst_tokens * num_topk;
const auto clean_stride = num_sms * kNumThreads;
#pragma unroll
for (int i = clean_start + thread_id; i < clean_end; i += clean_stride)
recv_topk_idx[i] = -1;
}
}
|
||||||
|
|
||||||
|
void dispatch(void* recv_x, float* recv_x_scales, int* recv_src_idx, int64_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
|
||||||
|
int* send_head, const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
|
||||||
|
const bool* is_token_in_rank, const int* channel_prefix_matrix,
|
||||||
|
int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
|
||||||
|
int scale_token_stride, int scale_hidden_stride,
|
||||||
|
void** buffer_ptrs, int rank, int num_ranks,
|
||||||
|
cudaStream_t stream, int num_sms, int num_max_send_tokens, int num_recv_buffer_tokens) {
|
||||||
|
constexpr int kNumThreads = 768;
|
||||||
|
constexpr int kNumTMABytesPerWarp = 8192;
|
||||||
|
#ifndef DISABLE_SM90_FEATURES
|
||||||
|
constexpr int smem_size = kNumTMABytesPerWarp * (kNumThreads / 32);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Make sure never OOB
|
||||||
|
EP_HOST_ASSERT(static_cast<int64_t>(num_scales) * scale_hidden_stride < std::numeric_limits<int>::max());
|
||||||
|
|
||||||
|
#define DISPATCH_LAUNCH_CASE(ranks) { \
|
||||||
|
auto kernel = dispatch<ranks, kNumThreads, kNumTMABytesPerWarp>; \
|
||||||
|
SET_SHARED_MEMORY_FOR_TMA(kernel); \
|
||||||
|
LAUNCH_KERNEL(&cfg, kernel, \
|
||||||
|
reinterpret_cast<int4*>(recv_x), recv_x_scales, recv_src_idx, recv_topk_idx, recv_topk_weights, recv_channel_offset, \
|
||||||
|
send_head, reinterpret_cast<const int4*>(x), x_scales, topk_idx, topk_weights, \
|
||||||
|
is_token_in_rank, channel_prefix_matrix, \
|
||||||
|
num_tokens, num_worst_tokens, hidden_int4, num_topk, num_experts, num_scales, \
|
||||||
|
scale_token_stride, scale_hidden_stride, \
|
||||||
|
buffer_ptrs, rank, \
|
||||||
|
num_max_send_tokens, num_recv_buffer_tokens); \
|
||||||
|
} break
|
||||||
|
|
||||||
|
// Even-numbered blocks for sending, odd-numbered blocks for receiving.
|
||||||
|
EP_HOST_ASSERT(num_sms % 2 == 0);
|
||||||
|
SETUP_LAUNCH_CONFIG(num_sms, kNumThreads, stream);
|
||||||
|
SWITCH_RANKS(DISPATCH_LAUNCH_CASE);
|
||||||
|
#undef DISPATCH_LAUNCH_CASE
|
||||||
|
}
|
||||||
|
|
||||||
|
template<int kNumRanks>
|
||||||
|
__global__ void
|
||||||
|
cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels, int num_recv_tokens, int num_memset_int,
|
||||||
|
int** barrier_signal_ptrs, int rank) {
|
||||||
|
const auto sm_id = static_cast<int>(blockIdx.x);
|
||||||
|
if (sm_id == 0) {
|
||||||
|
// Barrier before cleaning
|
||||||
|
barrier_block<kNumRanks, true>(barrier_signal_ptrs, rank);
|
||||||
|
|
||||||
|
// Clean
|
||||||
|
auto thread_id = static_cast<int>(threadIdx.x), num_threads = static_cast<int>(blockDim.x);
|
||||||
|
auto ptr = static_cast<int*>(buffer_ptrs[rank]);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = thread_id; i < num_memset_int; i += num_threads)
|
||||||
|
ptr[i] = 0;
|
||||||
|
|
||||||
|
// Barrier after cleaning
|
||||||
|
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
|
||||||
|
} else {
|
||||||
|
const auto channel_id = sm_id - 1;
|
||||||
|
const auto thread_id = static_cast<int>(threadIdx.x);
|
||||||
|
const auto rank_id = thread_id / 32;
|
||||||
|
const auto lane_id = thread_id % 32;
|
||||||
|
if (rank_id >= kNumRanks)
|
||||||
|
return;
|
||||||
|
|
||||||
|
int token_start_idx, token_end_idx;
|
||||||
|
get_channel_task_range(num_recv_tokens, num_channels, channel_id, token_start_idx, token_end_idx);
|
||||||
|
|
||||||
|
// NOTES: `1 << 25` is a heuristic large number
|
||||||
|
int last_head = 1 << 25;
|
||||||
|
#pragma unroll
|
||||||
|
for (int token_idx_tail = token_end_idx - 1; token_idx_tail >= token_start_idx; token_idx_tail -= 32) {
|
||||||
|
int token_idx = token_idx_tail - lane_id, expected_head = 0;
|
||||||
|
auto current_head = (token_idx >= token_start_idx) ? __ldg(send_head + token_idx * kNumRanks + rank_id) : -1;
|
||||||
|
for (int i = 0; i < min(32, token_idx_tail - token_start_idx + 1); ++ i) {
|
||||||
|
const int head = __shfl_sync(0xffffffff, current_head, i);
|
||||||
|
if (head < 0) {
|
||||||
|
if (lane_id == i)
|
||||||
|
expected_head = -last_head - 1;
|
||||||
|
} else {
|
||||||
|
last_head = head;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (current_head < 0 and token_idx >= token_start_idx)
|
||||||
|
send_head[token_idx * kNumRanks + rank_id] = expected_head;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels,
|
||||||
|
int num_recv_tokens, int num_memset_int,
|
||||||
|
int** barrier_signal_ptrs, int rank, int num_ranks,
|
||||||
|
cudaStream_t stream) {
|
||||||
|
#define CACHED_NOTIFY_COMBINE(ranks) \
|
||||||
|
LAUNCH_KERNEL(&cfg, cached_notify_combine<ranks>, \
|
||||||
|
buffer_ptrs, send_head, num_channels, num_recv_tokens, num_memset_int, barrier_signal_ptrs, rank); \
|
||||||
|
break
|
||||||
|
|
||||||
|
const int num_threads = std::max(128, 32 * num_ranks);
|
||||||
|
EP_HOST_ASSERT(num_ranks <= num_threads);
|
||||||
|
EP_HOST_ASSERT(num_threads <= 1024);
|
||||||
|
EP_HOST_ASSERT(1 + num_channels <= num_channels * 2);
|
||||||
|
SETUP_LAUNCH_CONFIG(1 + num_channels, num_threads, stream);
|
||||||
|
SWITCH_RANKS(CACHED_NOTIFY_COMBINE);
|
||||||
|
#undef CACHED_NOTIFY_COMBINE
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename dtype_t, int kNumRanks, int kNumThreads, int kNumTMABytesPerWarp>
|
||||||
|
__global__ void __launch_bounds__(kNumThreads, 1)
|
||||||
|
combine(dtype_t* recv_x, float* recv_topk_weights,
|
||||||
|
const dtype_t* x, const float* topk_weights,
|
||||||
|
const dtype_t* bias_0, const dtype_t* bias_1,
|
||||||
|
const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
|
||||||
|
int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
|
||||||
|
void** buffer_ptrs, int rank,
|
||||||
|
int num_max_send_tokens, int num_recv_buffer_tokens) {
|
||||||
|
const auto num_sms = static_cast<int>(gridDim.x);
|
||||||
|
const auto thread_id = static_cast<int>(threadIdx.x);
|
||||||
|
const auto sm_id = static_cast<int>(blockIdx.x), lane_id = get_lane_id();
|
||||||
|
const auto num_channels = num_sms / 2;
|
||||||
|
const bool is_sender = sm_id % 2 == 0;
|
||||||
|
const int responsible_channel = sm_id / 2;
|
||||||
|
EP_DEVICE_ASSERT(num_topk <= 32);
|
||||||
|
|
||||||
|
constexpr int kDtypePerInt4 = sizeof(int4) / sizeof(dtype_t);
|
||||||
|
int hidden_int4 = hidden * sizeof(dtype_t) / sizeof(int4);
|
||||||
|
auto x_int4 = reinterpret_cast<const int4*>(x);
|
||||||
|
auto bias_0_int4 = reinterpret_cast<const int4*>(bias_0);
|
||||||
|
auto bias_1_int4 = reinterpret_cast<const int4*>(bias_1);
|
||||||
|
auto recv_int4 = reinterpret_cast<int4*>(recv_x);
|
||||||
|
|
||||||
|
// TMA stuffs
|
||||||
|
#ifndef DISABLE_SM90_FEATURES
|
||||||
|
extern __shared__ __align__(1024) uint8_t smem_buffer[];
|
||||||
|
auto tma_buffer = smem_buffer + (thread_id / 32) * kNumTMABytesPerWarp;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (is_sender) {
|
||||||
|
// Workers for sending
|
||||||
|
// Several warps are responsible for a single rank
|
||||||
|
constexpr int num_send_warps_per_rank = (kNumThreads / 32) / kNumRanks;
|
||||||
|
constexpr int num_send_warps = num_send_warps_per_rank * kNumRanks;
|
||||||
|
const auto num_threads_per_rank = num_send_warps_per_rank * 32;
|
||||||
|
const auto send_thread_id = thread_id;
|
||||||
|
const auto send_warp_id = send_thread_id / 32;
|
||||||
|
const auto send_rank_id = (responsible_channel + send_warp_id) % kNumRanks;
|
||||||
|
const auto send_warp_id_in_rank = send_warp_id / kNumRanks;
|
||||||
|
EP_STATIC_ASSERT(num_send_warps * 32 == kNumThreads, "Invalid warp count");
|
||||||
|
|
||||||
|
// Calculate pointers by the specific layout
|
||||||
|
auto ptr = reinterpret_cast<void*>(static_cast<int8_t*>(buffer_ptrs[send_rank_id]));
|
||||||
|
auto num_channels_total = num_channels * kNumRanks;
|
||||||
|
auto channel_rank_offset = responsible_channel * kNumRanks + rank;
|
||||||
|
|
||||||
|
// Channel meta data
|
||||||
|
// `head_idx`: kNumChannels * kNumRanks * sizeof(int)
|
||||||
|
// `tail_idx`: kNumChannels * kNumRanks * sizeof(int)
|
||||||
|
// `x_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * hidden_int4 * sizeof(int4)
|
||||||
|
// `src_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * sizeof(int)
|
||||||
|
// `topk_weights_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(float)
|
||||||
|
auto channel_head_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
|
||||||
|
auto channel_tail_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
|
||||||
|
auto channel_x_buffers = Buffer<int4>(ptr, num_channels_total * num_recv_buffer_tokens * hidden_int4, channel_rank_offset * num_recv_buffer_tokens * hidden_int4);
|
||||||
|
auto channel_src_idx_buffers = Buffer<int>(ptr, num_channels_total * num_recv_buffer_tokens, channel_rank_offset * num_recv_buffer_tokens);
|
||||||
|
auto channel_topk_weights_buffers = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
|
||||||
|
|
||||||
|
// Get tasks
|
||||||
|
// NOTES: `channel_offset` is already shifted
|
||||||
|
int rank_offset = send_rank_id > 0 ? rank_prefix_matrix[(send_rank_id - 1) * kNumRanks + rank] : 0;
|
||||||
|
int num_rank_tokens = rank_prefix_matrix[send_rank_id * kNumRanks + rank] - rank_offset;
|
||||||
|
int channel_offset = channel_prefix_matrix[send_rank_id * num_channels + responsible_channel];
|
||||||
|
int num_channel_tokens = (responsible_channel == num_channels - 1 ? num_rank_tokens : channel_prefix_matrix[send_rank_id * num_channels + responsible_channel + 1]) - channel_offset;
|
||||||
|
int token_start_idx = rank_offset + channel_offset, token_end_idx = rank_offset + channel_offset + num_channel_tokens;
|
||||||
|
|
||||||
|
// Iterate over all tokens and send by chunks
|
||||||
|
int current_channel_tail_idx = 0;
|
||||||
|
for (int64_t token_idx = token_start_idx; token_idx < token_end_idx; ) {
|
||||||
|
// Check destination queue emptiness, or wait a buffer to be released (rare cases)
|
||||||
|
auto start_time = clock64();
|
||||||
|
int num_round_tokens = min(num_max_send_tokens, token_end_idx - static_cast<int>(token_idx));
|
||||||
|
while (lane_id == 0) {
|
||||||
|
// NOTES: we only consider the worst case, because counting the real numbers are time-consuming
|
||||||
|
int num_used_slots = current_channel_tail_idx - ld_volatile_global(channel_head_idx.buffer());
|
||||||
|
if (num_recv_buffer_tokens - num_used_slots >= num_round_tokens)
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Rare cases to loop again
|
||||||
|
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
|
||||||
|
printf("DeepEP timeout for combine senders, rank %d, responsible_channel = %d\n", rank, responsible_channel);
|
||||||
|
trap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__syncwarp();
|
||||||
|
|
||||||
|
// Send by chunk
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = send_warp_id_in_rank; i < num_round_tokens; i += num_send_warps_per_rank) {
|
||||||
|
// Get an empty slot
|
||||||
|
int dst_slot_idx = (current_channel_tail_idx + i) % num_recv_buffer_tokens;
|
||||||
|
|
||||||
|
// Copy data
|
||||||
|
auto shifted_x_buffers = channel_x_buffers.buffer() + dst_slot_idx * hidden_int4;
|
||||||
|
auto shifted_x = x_int4 + (token_idx + i) * hidden_int4;
|
||||||
|
UNROLLED_WARP_COPY(4, lane_id, hidden_int4, shifted_x_buffers, shifted_x, ld_nc_global, st_na_global);
|
||||||
|
|
||||||
|
// Send source index
|
||||||
|
if (lane_id == 0)
|
||||||
|
channel_src_idx_buffers[dst_slot_idx] = __ldg(src_idx + token_idx + i);
|
||||||
|
|
||||||
|
// Send `topk_weights`
|
||||||
|
if (num_topk > 0 and lane_id < num_topk)
|
||||||
|
channel_topk_weights_buffers[dst_slot_idx * num_topk + lane_id] = __ldg(topk_weights + (token_idx + i) * num_topk + lane_id);
|
||||||
|
}
|
||||||
|
token_idx += num_round_tokens;
|
||||||
|
current_channel_tail_idx += num_round_tokens;
|
||||||
|
|
||||||
|
// Move tail index
|
||||||
|
asm volatile("bar.sync %0, %1;" :: "r"(send_rank_id), "r"(num_threads_per_rank));
|
||||||
|
if (lane_id == 0 and send_warp_id_in_rank == 0)
|
||||||
|
st_release_sys_global(channel_tail_idx.buffer(), current_channel_tail_idx);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Workers for receiving
|
||||||
|
// One warp for moving the queue head, others for reduction
|
||||||
|
constexpr int num_recv_warps = kNumThreads / 32;
|
||||||
|
const auto recv_warp_id = thread_id / 32;
|
||||||
|
EP_DEVICE_ASSERT(kNumRanks <= 32 and kNumThreads > 32);
|
||||||
|
EP_DEVICE_ASSERT(thread_id >= 0 and kNumThreads % 32 == 0);
|
||||||
|
|
||||||
|
// Shared head, tail and retired flags for receiver warps
|
||||||
|
__shared__ volatile int warp_channel_head_idx[num_recv_warps][kNumRanks];
|
||||||
|
__shared__ volatile int channel_tail_idx[kNumRanks];
|
||||||
|
__shared__ volatile bool warp_retired[num_recv_warps];
|
||||||
|
if (thread_id < num_recv_warps)
|
||||||
|
warp_retired[thread_id] = false;
|
||||||
|
if (lane_id < kNumRanks)
|
||||||
|
warp_channel_head_idx[recv_warp_id][lane_id] = 0;
|
||||||
|
if (thread_id < kNumRanks)
|
||||||
|
channel_tail_idx[thread_id] = 0;
|
||||||
|
asm volatile("bar.sync 0, %0;" :: "r"(kNumThreads));
|
||||||
|
|
||||||
|
if (thread_id < 32) {
|
||||||
|
int* channel_head_idx_ptr = static_cast<int*>(buffer_ptrs[rank]) + responsible_channel * kNumRanks + lane_id;
|
||||||
|
int* channel_tail_idx_ptr = channel_head_idx_ptr + num_channels * kNumRanks;
|
||||||
|
|
            // Queue head updater
            int last_head = 0;
            while (lane_id < kNumRanks) {
                // Check retired
                bool retired = true;
                #pragma unroll
                for (int i = 1; i < num_recv_warps; ++ i)
                    retired = retired and warp_retired[i];
                if (retired)
                    break;

                // Update queue tail
                channel_tail_idx[lane_id] = ld_acquire_sys_global(channel_tail_idx_ptr);

                // Update minimum head
                int min_head = std::numeric_limits<int>::max();
                #pragma unroll
                for (int i = 1; i < num_recv_warps; ++ i) if (not warp_retired[i])
                    min_head = min(min_head, warp_channel_head_idx[i][lane_id]);
                if (min_head != std::numeric_limits<int>::max() and min_head > last_head)
                    st_relaxed_sys_global(channel_head_idx_ptr, last_head = min_head);
            }
        } else {
            // Receivers
            // Channel metadata
            // All lanes will use data buffer, but only rank lane will use `head/tail/src_idx`
            Buffer<int4> channel_x_buffers[kNumRanks];
            Buffer<float> channel_topk_weights_buffers[kNumRanks];

            // Calculate pointers by the specific layout
            #pragma unroll
            for (int i = 0; i < kNumRanks; ++ i) {
                auto channel_rank_offset = responsible_channel * kNumRanks + i;
                auto num_channels_total = num_channels * kNumRanks;
                // `head_idx` & `tail_idx`: kNumChannels * kNumRanks * sizeof(int)
                auto ptr = reinterpret_cast<void*>(static_cast<int8_t*>(buffer_ptrs[rank]) + 2 * num_channels * kNumRanks * sizeof(int));

                // `x_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * hidden_int4 * sizeof(int4)
                channel_x_buffers[i] = Buffer<int4>(ptr, num_channels_total * num_recv_buffer_tokens * hidden_int4, channel_rank_offset * num_recv_buffer_tokens * hidden_int4);

                // `src_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * sizeof(int)
                ptr = reinterpret_cast<void*>(static_cast<int8_t*>(ptr) + num_channels_total * num_recv_buffer_tokens * sizeof(int));

                // `topk_weights_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(float)
                channel_topk_weights_buffers[i] = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
            }

            // The same tokens as the dispatch process
            int token_start_idx, token_end_idx;
            get_channel_task_range(num_recv_tokens, num_channels, responsible_channel, token_start_idx, token_end_idx);

            // Iterate over all tokens and combine
            for (int64_t token_idx = token_start_idx + recv_warp_id - 1; token_idx < token_end_idx; token_idx += num_recv_warps - 1) {
                // Read expected head
                int expected_head = -1;
                if (lane_id < kNumRanks)
                    expected_head = ld_nc_global(send_head + token_idx * kNumRanks + lane_id);

                auto start_time = clock64();
                while (__any_sync(0xffffffff, channel_tail_idx[lane_id] <= expected_head and expected_head >= 0)) {
                    // Timeout check
                    if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
                        printf("DeepEP timeout for combine receivers, rank %d, responsible_channel = %d, expect = %d\n", rank, responsible_channel, expected_head);
                        trap();
                    }
                }
                __syncwarp();

                // Broadcast current heads
                int num_topk_ranks = 0, topk_ranks[kNumRanks], slot_indices[kNumRanks];
                #pragma unroll
                for (int i = 0; i < kNumRanks; ++ i) {
                    auto expected_head_i = __shfl_sync(0xffffffff, expected_head, i);
                    if (expected_head_i >= 0) {
                        slot_indices[num_topk_ranks] = expected_head_i % num_recv_buffer_tokens;
                        topk_ranks[num_topk_ranks ++] = i;
                    }
                }

                // Wait shared memory release
                #ifndef DISABLE_SM90_FEATURES
                if (lane_id == 0)
                    tma_store_wait();
                __syncwarp();
                #endif

                // Reduce data with pipeline
                constexpr int kNumStages = 8;
                EP_STATIC_ASSERT(kNumStages * 32 * sizeof(int4) <= kNumTMABytesPerWarp, "Invalid count");
                #pragma unroll
                for (int i = lane_id; i < hidden_int4; i += 32) {
                    // Read bias
                    // TODO: make it as a template
                    int4 bias_0_value_int4 = bias_0_int4 != nullptr ? __ldg(bias_0_int4 + token_idx * hidden_int4 + i) : make_int4(0, 0, 0, 0);
                    int4 bias_1_value_int4 = bias_1_int4 != nullptr ? __ldg(bias_1_int4 + token_idx * hidden_int4 + i) : make_int4(0, 0, 0, 0);

                    // Read buffers
                    int4 recv_value_int4[kNumRanks];
                    #pragma unroll
                    for (int j = 0; j < num_topk_ranks; ++ j)
                        recv_value_int4[j] = ld_nc_global(channel_x_buffers[topk_ranks[j]].buffer() + slot_indices[j] * hidden_int4 + i);

                    // Reduce bias
                    float values[kDtypePerInt4];
                    auto bias_0_values = reinterpret_cast<const dtype_t*>(&bias_0_value_int4);
                    auto bias_1_values = reinterpret_cast<const dtype_t*>(&bias_1_value_int4);
                    #pragma unroll
                    for (int j = 0; j < kDtypePerInt4; ++ j)
                        values[j] = static_cast<float>(bias_0_values[j]) + static_cast<float>(bias_1_values[j]);

                    // Reduce all-to-all results
                    #pragma unroll
                    for (int j = 0; j < num_topk_ranks; ++ j) {
                        auto recv_value_dtypes = reinterpret_cast<const dtype_t*>(&recv_value_int4[j]);
                        #pragma unroll
                        for (int k = 0; k < kDtypePerInt4; ++ k)
                            values[k] += static_cast<float>(recv_value_dtypes[k]);
                    }

                    // Cast back to `dtype_t`
                    int4 out_int4;
                    auto out_dtypes = reinterpret_cast<dtype_t*>(&out_int4);
                    #pragma unroll
                    for (int j = 0; j < kDtypePerInt4; ++ j)
                        out_dtypes[j] = static_cast<dtype_t>(values[j]);

                    #ifndef DISABLE_SM90_FEATURES
                    // Wait TMA arrival
                    if (lane_id == 0)
                        tma_store_wait<kNumStages - 1>();
                    __syncwarp();

                    // Write into TMA buffer
                    auto tma_stage_idx = (i / 32) % kNumStages;
                    reinterpret_cast<int4*>(tma_buffer)[tma_stage_idx * 32 + lane_id] = out_int4;

                    // Issue TMA
                    tma_store_fence();
                    __syncwarp();
                    if (lane_id == 0) {
                        auto tma_bytes = min(32, hidden_int4 - i) * static_cast<int>(sizeof(int4));
                        tma_store_1d(reinterpret_cast<int4*>(tma_buffer) + tma_stage_idx * 32,
                                     recv_int4 + token_idx * hidden_int4 + i, tma_bytes, false);
                    }
                    __syncwarp();
                    #else
                    recv_int4[token_idx * hidden_int4 + i] = out_int4;
                    #endif
                }

                // Reduce `topk_weights`
                if (lane_id < num_topk) {
                    float value = 0;
                    #pragma unroll
                    for (int i = 0; i < num_topk_ranks; ++ i)
                        value += ld_nc_global(channel_topk_weights_buffers[topk_ranks[i]].buffer() + slot_indices[i] * num_topk + lane_id);
                    recv_topk_weights[token_idx * num_topk + lane_id] = value;
                }

                // Update head
                if (lane_id < kNumRanks)
                    warp_channel_head_idx[recv_warp_id][lane_id] = (expected_head < 0) ? -expected_head - 1 : expected_head + 1;
            }

            // Retired
            __syncwarp();
            if (lane_id == 0)
                warp_retired[recv_warp_id] = true;

            // Make TMA store visible to the next kernel
            #ifndef DISABLE_SM90_FEATURES
            if (lane_id == 0)
                tma_store_wait();
            #endif
        }
    }
}
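
The reduction loop above can be read element-wise: each output value is the sum of the two optional biases plus the partial result sent by every rank that contributed to this token. A minimal scalar sketch of that per-element math (illustrative only; `combine_one_element` is not part of the kernel, which works on int4-packed `dtype_t` vectors with float accumulation):

// Scalar view of the per-element combine reduction above (sketch only).
inline float combine_one_element(float bias_0, float bias_1,
                                 const float* rank_values, int num_topk_ranks) {
    float acc = bias_0 + bias_1;               // bias reduction
    for (int j = 0; j < num_topk_ranks; ++ j)
        acc += rank_values[j];                 // all-to-all partial results
    return acc;                                // cast back to dtype_t before storing
}
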

void combine(cudaDataType_t type,
             void* recv_x, float* recv_topk_weights,
             const void* x, const float* topk_weights,
             const void* bias_0, const void* bias_1,
             const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
             int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
             void** buffer_ptrs, int rank, int num_ranks,
             cudaStream_t stream, int num_sms,
             int num_max_send_tokens, int num_recv_buffer_tokens) {
    constexpr int kNumThreads = 768;
    constexpr int kNumTMABytesPerWarp = 4096;
#ifndef DISABLE_SM90_FEATURES
    constexpr int smem_size = kNumTMABytesPerWarp * (kNumThreads / 32);
#endif

#define COMBINE_LAUNCH_CASE(dtype, ranks) { \
    auto kernel = combine<dtype, ranks, kNumThreads, kNumTMABytesPerWarp>; \
    SET_SHARED_MEMORY_FOR_TMA(kernel); \
    LAUNCH_KERNEL(&cfg, kernel, \
                  reinterpret_cast<dtype*>(recv_x), recv_topk_weights, \
                  reinterpret_cast<const dtype*>(x), topk_weights, \
                  reinterpret_cast<const dtype*>(bias_0), reinterpret_cast<const dtype*>(bias_1), \
                  src_idx, rank_prefix_matrix, channel_prefix_matrix, \
                  send_head, num_tokens, num_recv_tokens, hidden, num_topk, \
                  buffer_ptrs, rank, \
                  num_max_send_tokens, num_recv_buffer_tokens); } \
    break
#define COMBINE_DTYPE_LAUNCH_CASE(dtype) SWITCH_RANKS_WITH_DTYPE(dtype, COMBINE_LAUNCH_CASE); break

    // Even-numbered blocks for sending, odd-numbered blocks for receiving
    EP_HOST_ASSERT(num_sms % 2 == 0);
    EP_HOST_ASSERT(kNumThreads >= num_ranks * 32);
    SETUP_LAUNCH_CONFIG(num_sms, kNumThreads, stream);
    SWITCH_TYPES(COMBINE_DTYPE_LAUNCH_CASE);
#undef COMBINE_DTYPE_LAUNCH_CASE
#undef COMBINE_LAUNCH_CASE
}

} // namespace intranode

} // namespace deep_ep

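For one concrete instantiation, the macro dispatch inside `combine()` above (assuming `type == CUDA_R_16BF` and `num_ranks == 8`; this is a sketch of the selected case body, not the literal preprocessor output) boils down to:

// Rough expansion of COMBINE_LAUNCH_CASE(nv_bfloat16, 8) inside combine():
auto kernel = combine<nv_bfloat16, 8, kNumThreads, kNumTMABytesPerWarp>;
SET_SHARED_MEMORY_FOR_TMA(kernel);
LAUNCH_KERNEL(&cfg, kernel,
              reinterpret_cast<nv_bfloat16*>(recv_x), recv_topk_weights,
              reinterpret_cast<const nv_bfloat16*>(x), topk_weights,
              reinterpret_cast<const nv_bfloat16*>(bias_0), reinterpret_cast<const nv_bfloat16*>(bias_1),
              src_idx, rank_prefix_matrix, channel_prefix_matrix,
              send_head, num_tokens, num_recv_tokens, hidden, num_topk,
              buffer_ptrs, rank,
              num_max_send_tokens, num_recv_buffer_tokens);
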
@@ -0,0 +1,89 @@
#pragma once

#include "configs.cuh"
#include "exception.cuh"

#ifndef SETUP_LAUNCH_CONFIG
#ifndef DISABLE_SM90_FEATURES
#define SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream) \
    cudaLaunchConfig_t cfg = {(num_sms), (num_threads), 0, stream, nullptr, 0}; \
    cudaLaunchAttribute attr[1]; \
    attr[0].id = cudaLaunchAttributeCooperative; \
    attr[0].val.cooperative = 1; \
    cfg.attrs = attr; \
    cfg.numAttrs = 1
#else
#define SETUP_LAUNCH_CONFIG(sms, threads, stream) \
    int __num_sms = (sms); \
    int __num_threads = (threads); \
    auto __stream = (stream)
#endif
#endif

#ifndef LAUNCH_KERNEL
#ifndef DISABLE_SM90_FEATURES
#define LAUNCH_KERNEL(config, kernel, ...) CUDA_CHECK(cudaLaunchKernelEx(config, kernel, ##__VA_ARGS__))
#else
#define LAUNCH_KERNEL(config, kernel, ...) \
do { \
    kernel<<<__num_sms, __num_threads, 0, __stream>>>(__VA_ARGS__); \
    cudaError_t e = cudaGetLastError(); \
    if (e != cudaSuccess) { \
        EPException cuda_exception("CUDA", __FILE__, __LINE__, cudaGetErrorString(e)); \
        fprintf(stderr, "%s\n", cuda_exception.what()); \
        throw cuda_exception; \
    } \
} while (0)
#endif
#endif

#ifndef SET_SHARED_MEMORY_FOR_TMA
#ifndef DISABLE_SM90_FEATURES
#define SET_SHARED_MEMORY_FOR_TMA(kernel) \
    EP_HOST_ASSERT(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size) == cudaSuccess); \
    cfg.dynamicSmemBytes = smem_size;
#else
#define SET_SHARED_MEMORY_FOR_TMA(kernel) void()
#endif
#endif

#define SWITCH_RANKS(case_macro) \
    switch (num_ranks) { \
        case 2: case_macro(2); \
        case 4: case_macro(4); \
        case 8: case_macro(8); \
        default: EP_HOST_ASSERT(false and "Unsupported ranks"); \
    } while (false)

#define SWITCH_RDMA_RANKS(case_macro) \
    switch (num_ranks / NUM_MAX_NVL_PEERS) { \
        case 2: case_macro(2); \
        case 4: case_macro(4); \
        case 8: case_macro(8); \
        case 16: case_macro(16); \
        default: EP_HOST_ASSERT(false and "Unsupported RDMA ranks"); \
    } while (false)

#define SWITCH_RANKS_WITH_DTYPE(dtype, case_macro) \
    switch (num_ranks) { \
        case 2: case_macro(dtype, 2); \
        case 4: case_macro(dtype, 4); \
        case 8: case_macro(dtype, 8); \
        default: EP_HOST_ASSERT(false && "Unsupported ranks"); \
    } while (false)

#define SWITCH_TYPES(case_macro) \
    switch (type) { \
        case CUDA_R_16BF: case_macro(nv_bfloat16); \
        default: EP_HOST_ASSERT(false && "Unsupported type"); \
    } while (false)

#define SWITCH_HIDDEN(case_macro) \
    switch (hidden) { \
        case 2048: case_macro(2048); \
        case 2560: case_macro(2560); \
        case 4096: case_macro(4096); \
        case 5120: case_macro(5120); \
        case 7168: case_macro(7168); \
        default: EP_HOST_ASSERT(false && "Unsupported hidden"); \
    } while (false)

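With `DISABLE_SM90_FEATURES` defined, a launcher written as `SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream); LAUNCH_KERNEL(&cfg, kernel, args...);` reduces to a plain triple-chevron launch plus an error check. In the sketch below `kernel` and `args` are placeholders, and the `config` argument of `LAUNCH_KERNEL` is simply ignored on this path:

// Sketch of the non-SM90 expansion of the two macros above:
int __num_sms = (num_sms);
int __num_threads = (num_threads);
auto __stream = (stream);
do {
    kernel<<<__num_sms, __num_threads, 0, __stream>>>(args);
    cudaError_t e = cudaGetLastError();
    if (e != cudaSuccess) {
        EPException cuda_exception("CUDA", __FILE__, __LINE__, cudaGetErrorString(e));
        fprintf(stderr, "%s\n", cuda_exception.what());
        throw cuda_exception;
    }
} while (0);
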
@@ -0,0 +1,136 @@
#include "configs.cuh"
#include "exception.cuh"
#include "launch.cuh"

namespace deep_ep {

namespace layout {

template <int kNumThreads, int kNumExpertsPerSM, int kNumRanksPerSM>
__global__ void __launch_bounds__(kNumThreads, 1)
get_dispatch_layout(const int64_t* topk_idx,
                    int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
                    int* num_tokens_per_expert, bool* is_token_in_rank,
                    int num_tokens, int num_topk, int num_ranks, int num_experts) {
    auto sm_id = static_cast<int>(blockIdx.x);
    auto thread_id = static_cast<int>(threadIdx.x);

    // Count expert statistics
    __shared__ int num_tokens_per_expert_per_thread[kNumThreads][kNumExpertsPerSM];
    int expert_begin_idx = sm_id * kNumExpertsPerSM, expert_end_idx = min(expert_begin_idx + kNumExpertsPerSM, num_experts);
    if (expert_begin_idx < expert_end_idx) {
        // Per-thread count
        #pragma unroll
        for (int i = 0; i < kNumExpertsPerSM; ++ i)
            num_tokens_per_expert_per_thread[thread_id][i] = 0;
        #pragma unroll
        for (int i = thread_id; i < num_tokens; i += kNumThreads) {
            auto shifted_topk_idx = topk_idx + i * num_topk;
            #pragma unroll
            for (int j = 0, expert_idx; j < num_topk; ++ j) {
                expert_idx = static_cast<int>(shifted_topk_idx[j]);
                if (expert_begin_idx <= expert_idx and expert_idx < expert_end_idx)
                    ++ num_tokens_per_expert_per_thread[thread_id][expert_idx - expert_begin_idx];
            }
        }
        __syncthreads();

        // Sum up
        EP_STATIC_ASSERT(kNumExpertsPerSM <= kNumThreads, "Too many experts per SM");
        if (expert_begin_idx + thread_id < expert_end_idx) {
            int sum = 0;
            #pragma unroll
            for (int i = 0; i < kNumThreads; ++ i)
                sum += num_tokens_per_expert_per_thread[i][thread_id];
            num_tokens_per_expert[expert_begin_idx + thread_id] = sum;
        }
        return;
    }

    if (num_tokens_per_rdma_rank != nullptr)
        EP_DEVICE_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0 and num_ranks > NUM_MAX_NVL_PEERS);

    // Count rank statistics
    constexpr int kNumRDMARanksPerSM = kNumRanksPerSM / NUM_MAX_NVL_PEERS;
    __shared__ int num_tokens_per_rank_per_thread[kNumThreads][kNumRanksPerSM];
    __shared__ int num_tokens_per_rdma_rank_per_thread[kNumThreads][kNumRDMARanksPerSM];
    auto sm_begin = (num_experts + kNumExpertsPerSM - 1) / kNumExpertsPerSM;
    int rank_begin_idx = (sm_id - sm_begin) * kNumRanksPerSM, rank_end_idx = min(rank_begin_idx + kNumRanksPerSM, num_ranks);
    int rdma_rank_begin_idx = rank_begin_idx / NUM_MAX_NVL_PEERS, rdma_rank_end_idx = rank_end_idx / NUM_MAX_NVL_PEERS;
    if (rank_begin_idx < rank_end_idx) {
        const auto num_expert_per_rank = num_experts / num_ranks;
        auto expert_begin = rank_begin_idx * num_expert_per_rank;
        auto expert_end = rank_end_idx * num_expert_per_rank;

        // Per-thread count
        #pragma unroll
        for (int i = 0; i < kNumRanksPerSM; ++ i)
            num_tokens_per_rank_per_thread[thread_id][i] = 0;
        #pragma unroll
        for (int i = 0; i < kNumRDMARanksPerSM; ++ i)
            num_tokens_per_rdma_rank_per_thread[thread_id][i] = 0;
        #pragma unroll
        for (int i = thread_id; i < num_tokens; i += kNumThreads) {
            auto shifted_topk_idx = topk_idx + i * num_topk;
            int is_in_rank[kNumRanksPerSM] = {0}, is_in_rdma_rank[kNumRDMARanksPerSM] = {0};
            #pragma unroll
            for (int j = 0, expert_idx, rank_idx; j < num_topk; ++ j) {
                expert_idx = static_cast<int>(shifted_topk_idx[j]);
                if (expert_begin <= expert_idx and expert_idx < expert_end) {
                    // Count single rank
                    rank_idx = expert_idx / num_expert_per_rank - rank_begin_idx;
                    is_in_rank[rank_idx] ++, is_in_rdma_rank[rank_idx / NUM_MAX_NVL_PEERS] ++;
                }
            }

            auto shifted_is_token_in_rank = is_token_in_rank + i * num_ranks;
            #pragma unroll
            for (int j = 0; j + rank_begin_idx < rank_end_idx; ++ j) {
                shifted_is_token_in_rank[j + rank_begin_idx] = (is_in_rank[j] > 0);
                num_tokens_per_rank_per_thread[thread_id][j] += (is_in_rank[j] > 0);
            }

            #pragma unroll
            for (int j = 0; j + rdma_rank_begin_idx < rdma_rank_end_idx; ++ j)
                num_tokens_per_rdma_rank_per_thread[thread_id][j] += (is_in_rdma_rank[j] > 0);
        }
        __syncthreads();

        // Sum up
        EP_STATIC_ASSERT(kNumRanksPerSM <= kNumThreads, "Too many ranks per SM");
        if (rank_begin_idx + thread_id < rank_end_idx) {
            int sum = 0;
            #pragma unroll
            for (int i = 0; i < kNumThreads; ++ i)
                sum += num_tokens_per_rank_per_thread[i][thread_id];
            num_tokens_per_rank[rank_begin_idx + thread_id] = sum;
        }

        if (num_tokens_per_rdma_rank != nullptr and rdma_rank_begin_idx + thread_id < rdma_rank_end_idx) {
            int sum = 0;
            #pragma unroll
            for (int i = 0; i < kNumThreads; ++ i)
                sum += num_tokens_per_rdma_rank_per_thread[i][thread_id];
            num_tokens_per_rdma_rank[rdma_rank_begin_idx + thread_id] = sum;
        }
    }
}

void get_dispatch_layout(const int64_t* topk_idx,
                         int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
                         int* num_tokens_per_expert, bool* is_token_in_rank,
                         int num_tokens, int num_topk, int num_ranks, int num_experts,
                         cudaStream_t stream) {
    constexpr int kNumThreads = 256, kNumExpertsPerSM = 32, kNumRanksPerSM = 8;
    int num_sms = ((num_experts + kNumExpertsPerSM - 1) / kNumExpertsPerSM) + (num_ranks + kNumRanksPerSM - 1) / kNumRanksPerSM;
    EP_STATIC_ASSERT(kNumExpertsPerSM % NUM_MAX_NVL_PEERS == 0, "Invalid number of experts per SM");

    SETUP_LAUNCH_CONFIG(num_sms, kNumThreads, stream);
    LAUNCH_KERNEL(&cfg, (get_dispatch_layout<kNumThreads, kNumExpertsPerSM, kNumRanksPerSM>),
                  topk_idx, num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank,
                  num_tokens, num_topk, num_ranks, num_experts);
}

} // namespace layout

} // namespace deep_ep

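A single-threaded CPU reference of what `get_dispatch_layout` computes may make the semantics clearer: per-expert counts are taken over all top-k selections, while per-rank counts count each token at most once per rank. The sketch assumes the same contiguous expert-to-rank assignment (`num_experts / num_ranks` experts per rank) as the kernel; `get_dispatch_layout_ref` is illustrative and not part of the repository:

// Reference (CPU, single thread) of the layout statistics; illustrative only.
void get_dispatch_layout_ref(const int64_t* topk_idx, int* num_tokens_per_rank,
                             int* num_tokens_per_expert, bool* is_token_in_rank,
                             int num_tokens, int num_topk, int num_ranks, int num_experts) {
    const int num_expert_per_rank = num_experts / num_ranks;
    for (int e = 0; e < num_experts; ++ e) num_tokens_per_expert[e] = 0;
    for (int r = 0; r < num_ranks; ++ r) num_tokens_per_rank[r] = 0;
    for (int t = 0; t < num_tokens; ++ t) {
        for (int r = 0; r < num_ranks; ++ r) is_token_in_rank[t * num_ranks + r] = false;
        for (int j = 0; j < num_topk; ++ j) {
            auto expert_idx = static_cast<int>(topk_idx[t * num_topk + j]);
            if (expert_idx < 0) continue;   // `-1` means no selection
            ++ num_tokens_per_expert[expert_idx];
            is_token_in_rank[t * num_ranks + expert_idx / num_expert_per_rank] = true;
        }
        // A token counts once per rank, no matter how many of its experts live there
        for (int r = 0; r < num_ranks; ++ r)
            num_tokens_per_rank[r] += is_token_in_rank[t * num_ranks + r] ? 1 : 0;
    }
}
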
@@ -0,0 +1,92 @@
#include <vector>
#include <cstring>

#include "configs.cuh"
#include "exception.cuh"
#include "launch.cuh"
#include "utils.cuh"

#ifndef DISABLE_NVSHMEM
#include "ibgda_device.cuh"
#endif

namespace deep_ep {

namespace intranode {

template<int kNumRanks>
__global__ void barrier(int** barrier_signal_ptrs, int rank) {
    barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
}

void barrier(int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream) {
#define BARRIER_LAUNCH_CASE(ranks) \
    LAUNCH_KERNEL(&cfg, barrier<ranks>, barrier_signal_ptrs, rank); \
    break

    SETUP_LAUNCH_CONFIG(1, 32, stream);
    SWITCH_RANKS(BARRIER_LAUNCH_CASE);
#undef BARRIER_LAUNCH_CASE
}

} // namespace intranode

namespace internode {

#ifndef DISABLE_NVSHMEM
nvshmem_team_t cpu_rdma_team = NVSHMEM_TEAM_INVALID;
nvshmem_team_config_t cpu_rdma_team_config;

std::vector<uint8_t> get_unique_id() {
    nvshmemx_uniqueid_t unique_id;
    nvshmemx_get_uniqueid(&unique_id);
    std::vector<uint8_t> result(sizeof(nvshmemx_uniqueid_t));
    std::memcpy(result.data(), &unique_id, sizeof(nvshmemx_uniqueid_t));
    return result;
}

int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks, bool low_latency_mode) {
    nvshmemx_uniqueid_t root_unique_id;
    nvshmemx_init_attr_t attr;
    std::memcpy(&root_unique_id, root_unique_id_val.data(), sizeof(nvshmemx_uniqueid_t));
    nvshmemx_set_attr_uniqueid_args(rank, num_ranks, &root_unique_id, &attr);
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr);

    // Create sub-RDMA teams
    // NOTES: if `num_ranks <= NUM_MAX_NVL_PEERS` then only low-latency kernels are used
    if (low_latency_mode and num_ranks > NUM_MAX_NVL_PEERS) {
        EP_HOST_ASSERT(cpu_rdma_team == NVSHMEM_TEAM_INVALID);
        EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
        EP_HOST_ASSERT(nvshmem_team_split_strided(NVSHMEM_TEAM_WORLD, rank % NUM_MAX_NVL_PEERS, NUM_MAX_NVL_PEERS,
                                                  num_ranks / NUM_MAX_NVL_PEERS, &cpu_rdma_team_config, 0, &cpu_rdma_team) == 0);
        EP_HOST_ASSERT(cpu_rdma_team != NVSHMEM_TEAM_INVALID);
    }

    nvshmem_barrier_all();
    return nvshmem_my_pe();
}

void* alloc(size_t size, size_t alignment) {
    return nvshmem_align(alignment, size);
}

void free(void* ptr) {
    nvshmem_free(ptr);
}

void barrier() {
    nvshmem_barrier_all();
}

void finalize() {
    if (cpu_rdma_team != NVSHMEM_TEAM_INVALID) {
        nvshmem_team_destroy(cpu_rdma_team);
        cpu_rdma_team = NVSHMEM_TEAM_INVALID;
    }
    nvshmem_finalize();
}
#endif

} // namespace internode

} // namespace deep_ep

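The helpers above imply a bootstrap order for the NVSHMEM runtime; the broadcast step is whatever CPU-side channel the caller provides (the Python `Buffer` in this repo gathers the unique ID with `torch.distributed`). A hedged sketch of the typical call order:

// Sketch only: typical cross-rank usage of the internode helpers above.
std::vector<uint8_t> uid;
if (rank == 0)
    uid = deep_ep::internode::get_unique_id();      // 1. root creates the unique ID
// 2. broadcast `uid` to every rank over an out-of-band CPU channel (not shown here)
int pe = deep_ep::internode::init(uid, rank, num_ranks, low_latency_mode);  // 3. collective init
// ... deep_ep::internode::alloc() / barrier() / free() as needed ...
deep_ep::internode::finalize();                     // 4. collective teardown
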
@ -0,0 +1,496 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "exception.cuh"
|
||||||
|
|
||||||
|
#define UNROLLED_WARP_COPY(UNROLL_FACTOR, LANE_ID, N, DST, SRC, LD_FUNC, ST_FUNC) \
|
||||||
|
{ \
|
||||||
|
constexpr int kLoopStride = 32 * (UNROLL_FACTOR); \
|
||||||
|
typename std::remove_reference<decltype(LD_FUNC((SRC) + 0))>::type unrolled_values[(UNROLL_FACTOR)]; \
|
||||||
|
auto __src = (SRC); \
|
||||||
|
auto __dst = (DST); \
|
||||||
|
for (int __i = (LANE_ID); __i < ((N) / kLoopStride) * kLoopStride; __i += kLoopStride) { \
|
||||||
|
_Pragma("unroll") \
|
||||||
|
for (int __j = 0; __j < (UNROLL_FACTOR); ++ __j) \
|
||||||
|
unrolled_values[__j] = LD_FUNC(__src + __i + __j * 32); \
|
||||||
|
_Pragma("unroll") \
|
||||||
|
for (int __j = 0; __j < (UNROLL_FACTOR); ++ __j) \
|
||||||
|
ST_FUNC(__dst + __i + __j * 32, unrolled_values[__j]); \
|
||||||
|
} \
|
||||||
|
for (int __i = ((N) / kLoopStride) * kLoopStride + (LANE_ID); __i < (N); __i += 32) \
|
||||||
|
ST_FUNC(__dst + __i, LD_FUNC(__src + __i)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace deep_ep {
|
||||||
|
|
||||||
|
template <int kBytes>
|
||||||
|
struct VecInt {};
|
||||||
|
template<> struct VecInt<1> { using vec_t = int8_t; };
|
||||||
|
template<> struct VecInt<2> { using vec_t = int16_t; };
|
||||||
|
template<> struct VecInt<4> { using vec_t = int; };
|
||||||
|
template<> struct VecInt<8> { using vec_t = int64_t; };
|
||||||
|
template<> struct VecInt<16> { using vec_t = int4; };
|
||||||
|
|
||||||
|
__device__ __forceinline__ void trap() {
|
||||||
|
asm("trap;");
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void memory_fence() {
|
||||||
|
asm volatile("fence.acq_rel.sys;":: : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void memory_fence_gpu() {
|
||||||
|
asm volatile("fence.acq_rel.gpu;":: : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void memory_fence_cta() {
|
||||||
|
asm volatile("fence.acq_rel.cta;":: : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_relaxed_sys_global(const int *ptr, int val) {
|
||||||
|
asm volatile("st.relaxed.sys.global.s32 [%0], %1;"::"l"(ptr), "r"(val) : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_release_sys_global(const int *ptr, int val) {
|
||||||
|
asm volatile("st.release.sys.global.s32 [%0], %1;"::"l"(ptr), "r"(val) : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_release_cta(const int *ptr, int val) {
|
||||||
|
asm volatile("st.release.cta.s32 [%0], %1;"::"l"(ptr), "r"(val) : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ int ld_acquire_sys_global(const int *ptr) {
|
||||||
|
int ret;
|
||||||
|
asm volatile("ld.acquire.sys.global.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ uint64_t ld_acquire_sys_global(const uint64_t *ptr) {
|
||||||
|
uint64_t ret;
|
||||||
|
asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ int ld_acquire_global(const int *ptr) {
|
||||||
|
int ret;
|
||||||
|
asm volatile("ld.acquire.gpu.global.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ int atomic_add_release_sys_global(const int* ptr, int value) {
|
||||||
|
int ret;
|
||||||
|
asm volatile("atom.add.release.sys.global.s32 %0, [%1], %2;" : "=r"(ret) : "l"(ptr), "r"(value));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ int atomic_add_release_global(const int* ptr, int value) {
|
||||||
|
int ret;
|
||||||
|
asm volatile("atom.add.release.gpu.global.s32 %0, [%1], %2;" : "=r"(ret) : "l"(ptr), "r"(value));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ int ld_acquire_cta(const int *ptr) {
|
||||||
|
int ret;
|
||||||
|
asm volatile("ld.acquire.cta.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ uint8_t ld_na_relaxed(const uint8_t *ptr) {
|
||||||
|
uint16_t ret;
|
||||||
|
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b8 %0, [%1];" : "=h"(ret) : "l"(ptr));
|
||||||
|
return static_cast<uint8_t>(ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ uint16_t ld_na_relaxed(const uint16_t *ptr) {
|
||||||
|
uint16_t ret;
|
||||||
|
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b16 %0, [%1];" : "=h"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ uint32_t ld_na_relaxed(const uint32_t *ptr) {
|
||||||
|
uint32_t ret;
|
||||||
|
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b32 %0, [%1];" : "=r"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ uint64_t ld_na_relaxed(const uint64_t *ptr) {
|
||||||
|
uint64_t ret;
|
||||||
|
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b64 %0, [%1];" : "=l"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ int ld_volatile_global(const int *ptr) {
|
||||||
|
int ret;
|
||||||
|
asm volatile("ld.volatile.global.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ float ld_volatile_global(const float *ptr) {
|
||||||
|
float ret;
|
||||||
|
asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ int64_t ld_volatile_global(const int64_t *ptr) {
|
||||||
|
int64_t ret;
|
||||||
|
asm volatile("ld.volatile.global.s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ int64_t ld_volatile_global(const uint64_t *ptr) {
|
||||||
|
int64_t ret;
|
||||||
|
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
|
||||||
|
#define LD_NC_FUNC "ld.global.nc.L1::no_allocate.L2::256B"
|
||||||
|
#else
|
||||||
|
#define LD_NC_FUNC "ld.volatile.global.L2::256B"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// `ld.global.nc.L1::no_allocate` will be translated into `LDG.E.NA.[width].CONSTANT` in SASS
|
||||||
|
template <typename dtype_t>
|
||||||
|
__device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
|
||||||
|
auto ret = ld_nc_global(reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(ptr));
|
||||||
|
return *reinterpret_cast<dtype_t*>(&ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ uint8_t ld_nc_global(const uint8_t *ptr) {
|
||||||
|
uint16_t ret;
|
||||||
|
// NOTES: we must use `uint16_t` as inline ASM does not support 8-bit constraint letter (`h` below means unsigned 16-bit)
|
||||||
|
asm volatile(LD_NC_FUNC ".u8 %0, [%1];" : "=h"(ret) : "l"(ptr));
|
||||||
|
return static_cast<uint8_t>(ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ int ld_nc_global(const int *ptr) {
|
||||||
|
int ret;
|
||||||
|
asm volatile(LD_NC_FUNC ".s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ int64_t ld_nc_global(const int64_t *ptr) {
|
||||||
|
int64_t ret;
|
||||||
|
asm volatile(LD_NC_FUNC ".s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ float ld_nc_global(const float *ptr) {
|
||||||
|
float ret;
|
||||||
|
asm volatile(LD_NC_FUNC ".f32 %0, [%1];" : "=f"(ret) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ int2 ld_nc_global(const int2 *ptr) {
|
||||||
|
int2 ret;
|
||||||
|
asm volatile(LD_NC_FUNC ".v2.s32 {%0, %1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ int4 ld_nc_global(const int4 *ptr) {
|
||||||
|
int4 ret;
|
||||||
|
asm volatile(LD_NC_FUNC ".v4.s32 {%0, %1, %2, %3}, [%4];"
|
||||||
|
: "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : "l"(ptr));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_na_relaxed(const uint8_t *ptr, uint8_t val) {
|
||||||
|
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b8 [%0], %1;" : : "l"(ptr), "h"(static_cast<uint16_t>(val)));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_na_relaxed(const uint16_t *ptr, uint16_t val) {
|
||||||
|
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b16 [%0], %1;" : : "l"(ptr), "h"(val));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_na_relaxed(const uint32_t *ptr, uint32_t val) {
|
||||||
|
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_na_relaxed(const int *ptr, int val) {
|
||||||
|
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_na_relaxed(const int4 *ptr, int4 val) {
|
||||||
|
asm volatile("st.relaxed.gpu.global.L1::no_allocate.v4.s32 [%0], {%1, %2, %3, %4};"
|
||||||
|
: : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_na_release(const int *ptr, int val) {
|
||||||
|
asm volatile("st.release.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_na_release(const uint32_t *ptr, uint32_t val) {
|
||||||
|
asm volatile("st.release.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st_na_release(const uint64_t *ptr, uint64_t val) {
|
||||||
|
asm volatile("st.release.gpu.global.L1::no_allocate.b64 [%0], %1;" : : "l"(ptr), "l"(val));
|
||||||
|
}
|
||||||
|
|
||||||
|
// `st.global.L1::no_allocate` will be translated into `ST.E.NA.[width]` in SASS
|
||||||
|
#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
|
||||||
|
#define ST_NA_FUNC "st.global.L1::no_allocate"
|
||||||
|
#else
|
||||||
|
#define ST_NA_FUNC "st.global"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename dtype_t>
|
||||||
|
__device__ __forceinline__ void st_na_global(const dtype_t *ptr, const dtype_t& value) {
|
||||||
|
st_na_global(reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(ptr),
|
||||||
|
*reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(&value));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ void st_na_global(const int *ptr, const int& value) {
|
||||||
|
asm volatile(ST_NA_FUNC ".s32 [%0], %1;" ::"l"(ptr), "r"(value));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ void st_na_global(const int64_t *ptr, const int64_t& value) {
|
||||||
|
asm volatile(ST_NA_FUNC ".s64 [%0], %1;" ::"l"(ptr), "l"(value));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ void st_na_global(const float *ptr, const float& value) {
|
||||||
|
asm volatile(ST_NA_FUNC ".f32 [%0], %1;" ::"l"(ptr), "f"(value));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ __forceinline__ void st_na_global(const int4 *ptr, const int4& value) {
|
||||||
|
asm volatile(ST_NA_FUNC ".v4.s32 [%0], {%1, %2, %3, %4};"
|
||||||
|
::"l"(ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w));
|
||||||
|
}
|
||||||
|
|
||||||
|
// TMA PTX instructions
|
||||||
|
#ifndef DISABLE_SM90_FEATURES
|
||||||
|
|
||||||
|
__device__ __forceinline__ void fence_view_async_shared() {
|
||||||
|
asm volatile("fence.proxy.async.shared::cta; \n" :: );
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void fence_barrier_init() {
|
||||||
|
asm volatile("fence.mbarrier_init.release.cluster; \n" :: );
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void mbarrier_init(uint64_t* mbar_ptr, uint32_t arrive_count) {
|
||||||
|
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
|
||||||
|
asm volatile("mbarrier.init.shared::cta.b64 [%1], %0;" :: "r"(arrive_count), "r"(mbar_int_ptr));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void mbarrier_wait(uint64_t* mbar_ptr, uint32_t& phase) {
|
||||||
|
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
|
||||||
|
asm volatile("{\n\t"
|
||||||
|
".reg .pred P1; \n\t"
|
||||||
|
"LAB_WAIT: \n\t"
|
||||||
|
"mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1, %2; \n\t"
|
||||||
|
"@P1 bra DONE; \n\t"
|
||||||
|
"bra LAB_WAIT; \n\t"
|
||||||
|
"DONE: \n\t"
|
||||||
|
"}" :: "r"(mbar_int_ptr), "r"(phase), "r"(0x989680));
|
||||||
|
phase ^= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void mbarrier_arrive_and_expect_tx(uint64_t* mbar_ptr, int num_bytes) {
|
||||||
|
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
|
||||||
|
asm volatile("mbarrier.arrive.expect_tx.shared::cta.b64 _, [%1], %0; \n\t" :: "r"(num_bytes), "r"(mbar_int_ptr));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void tma_store_fence() {
|
||||||
|
asm volatile ("fence.proxy.async.shared::cta;");
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr uint64_t kEvictFirst = 0x12f0000000000000;
|
||||||
|
constexpr uint64_t kEvictNormal = 0x1000000000000000;
|
||||||
|
|
||||||
|
__device__ __forceinline__ void tma_load_1d(const void* smem_ptr, const void* gmem_ptr, uint64_t* mbar_ptr, int num_bytes,
|
||||||
|
bool evict_first = true) {
|
||||||
|
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
|
||||||
|
auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
|
||||||
|
const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
|
||||||
|
asm volatile("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%0], [%1], %2, [%3], %4;\n"
|
||||||
|
:: "r"(smem_int_ptr), "l"(gmem_ptr), "r"(num_bytes), "r"(mbar_int_ptr), "l"(cache_hint) : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void tma_store_1d(const void* smem_ptr, const void* gmem_ptr, int num_bytes,
|
||||||
|
bool evict_first = true) {
|
||||||
|
auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
|
||||||
|
const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
|
||||||
|
asm volatile("cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%0], [%1], %2, %3;\n"
|
||||||
|
:: "l"(gmem_ptr), "r"(smem_int_ptr), "r"(num_bytes), "l"(cache_hint) : "memory");
|
||||||
|
asm volatile("cp.async.bulk.commit_group;");
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int N = 0>
|
||||||
|
__device__ __forceinline__ void tma_store_wait() {
|
||||||
|
asm volatile("cp.async.bulk.wait_group.read %0;" :: "n"(N) : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename dtype_t>
|
||||||
|
__host__ __device__ dtype_t ceil_div(dtype_t a, dtype_t b) {
|
||||||
|
return (a + b - 1) / b;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename dtype_t>
|
||||||
|
__host__ __device__ dtype_t align(dtype_t a, dtype_t b) {
|
||||||
|
return ceil_div<dtype_t>(a, b) * b;
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ void get_channel_task_range(int num_tokens, int num_sms, int sm_id,
|
||||||
|
int& token_start_idx, int& token_end_idx) {
|
||||||
|
int num_tokens_per_sm = ceil_div(num_tokens, num_sms);
|
||||||
|
token_start_idx = min(num_tokens_per_sm * sm_id, num_tokens);
|
||||||
|
token_end_idx = min(token_start_idx + num_tokens_per_sm, num_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename dtype_a_t, typename dtype_b_t>
|
||||||
|
__device__ __forceinline__ dtype_b_t pack2(const dtype_a_t& x, const dtype_a_t& y) {
|
||||||
|
EP_STATIC_ASSERT(sizeof(dtype_a_t) * 2 == sizeof(dtype_b_t), "Invalid dtypes");
|
||||||
|
dtype_b_t packed;
|
||||||
|
auto unpacked_ptr = reinterpret_cast<dtype_a_t*>(&packed);
|
||||||
|
unpacked_ptr[0] = x, unpacked_ptr[1] = y;
|
||||||
|
return packed;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename dtype_a_t, typename dtype_b_t>
|
||||||
|
__device__ __forceinline__ void unpack2(const dtype_b_t& packed, dtype_a_t& x, dtype_a_t& y) {
|
||||||
|
EP_STATIC_ASSERT(sizeof(dtype_a_t) * 2 == sizeof(dtype_b_t), "Invalid dtypes");
|
||||||
|
auto unpacked_ptr = reinterpret_cast<const dtype_a_t*>(&packed);
|
||||||
|
x = unpacked_ptr[0], y = unpacked_ptr[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename dtype_t>
|
||||||
|
__device__ __forceinline__ dtype_t broadcast(dtype_t& ptr, int src_lane_idx) {
|
||||||
|
EP_STATIC_ASSERT(sizeof(dtype_t) % sizeof(int) == 0, "");
|
||||||
|
auto send_int_values = reinterpret_cast<int*>(&ptr);
|
||||||
|
int recv_int_values[sizeof(dtype_t) / sizeof(int)];
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < sizeof(dtype_t) / sizeof(int); ++ i)
|
||||||
|
recv_int_values[i] = __shfl_sync(0xffffffff, send_int_values[i], src_lane_idx);
|
||||||
|
return *reinterpret_cast<dtype_t*>(recv_int_values);
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ int warp_reduce_sum(int value) {
|
||||||
|
value += __shfl_xor_sync(0xffffffff, value, 16);
|
||||||
|
value += __shfl_xor_sync(0xffffffff, value, 8);
|
||||||
|
value += __shfl_xor_sync(0xffffffff, value, 4);
|
||||||
|
value += __shfl_xor_sync(0xffffffff, value, 2);
|
||||||
|
value += __shfl_xor_sync(0xffffffff, value, 1);
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ float half_warp_reduce_max(float value) {
|
||||||
|
auto mask = __activemask();
|
||||||
|
// The mask be in `{0xffffffff, 0xffff}`
|
||||||
|
value = max(value, __shfl_xor_sync(mask, value, 8));
|
||||||
|
value = max(value, __shfl_xor_sync(mask, value, 4));
|
||||||
|
value = max(value, __shfl_xor_sync(mask, value, 2));
|
||||||
|
value = max(value, __shfl_xor_sync(mask, value, 1));
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ int get_lane_id() {
|
||||||
|
int lane_id;
|
||||||
|
asm("mov.s32 %0, %laneid;" : "=r"(lane_id));
|
||||||
|
return lane_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr float kFP8Margin = 1e-4;
|
||||||
|
constexpr float kFinfoAmaxE4M3 = 448.0f;
|
||||||
|
constexpr float kFinfoAmaxInvE4M3 = 1 / 448.0f;
|
||||||
|
|
||||||
|
__forceinline__ __device__ float fast_pow2(int x) {
|
||||||
|
// We can ensure `-126 <= x and x <= 127`
|
||||||
|
uint32_t bits_x = (x + 127) << 23;
|
||||||
|
return *reinterpret_cast<float*>(&bits_x);
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ int fast_log2_ceil(float x) {
|
||||||
|
auto bits_x = *reinterpret_cast<uint32_t*>(&x);
|
||||||
|
auto exp_x = (bits_x >> 23) & 0xff;
|
||||||
|
auto man_bits = bits_x & ((1 << 23) - 1);
|
||||||
|
return exp_x - 127 + (man_bits != 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ void calculate_fp8_scales(float amax, float& scale, float& scale_inv, bool round_scale) {
|
||||||
|
if (round_scale) {
|
||||||
|
auto exp_scale_inv = fast_log2_ceil(amax * kFinfoAmaxInvE4M3);
|
||||||
|
scale = fast_pow2(-exp_scale_inv);
|
||||||
|
scale_inv = fast_pow2(exp_scale_inv);
|
||||||
|
} else {
|
||||||
|
scale_inv = amax * kFinfoAmaxInvE4M3;
|
||||||
|
scale = kFinfoAmaxE4M3 / amax;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <bool kIsUE8M0, typename out_dtype_t = std::conditional_t<kIsUE8M0, uint8_t, float>>
|
||||||
|
__forceinline__ __device__ out_dtype_t extract_required_scale_format(float value) {
|
||||||
|
if constexpr (kIsUE8M0) {
|
||||||
|
return static_cast<uint8_t>((*reinterpret_cast<uint32_t*>(&value)) >> 23);
|
||||||
|
} else {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int kNumRanks, bool kSyncOnly = false>
|
||||||
|
__forceinline__ __device__ void
|
||||||
|
barrier_block(int** barrier_signal_ptrs, int rank) {
|
||||||
|
auto thread_id = static_cast<int>(threadIdx.x);
|
||||||
|
|
||||||
|
// For non-sync-only cases, the memory operations by other threads in the block must be visible to the `sys` scope
|
||||||
|
if constexpr (not kSyncOnly) {
|
||||||
|
memory_fence();
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add self-ranks, sub other ranks
|
||||||
|
if (thread_id < kNumRanks) {
|
||||||
|
atomicAdd_system(barrier_signal_ptrs[rank] + thread_id, FINISHED_SUM_TAG);
|
||||||
|
atomicSub_system(barrier_signal_ptrs[thread_id] + rank, FINISHED_SUM_TAG);
|
||||||
|
}
|
||||||
|
EP_DEVICE_ASSERT(kNumRanks <= blockDim.x);
|
||||||
|
|
||||||
|
// Check timeout
|
||||||
|
auto start_time = clock64();
|
||||||
|
while (true) {
|
||||||
|
auto value = thread_id < kNumRanks ? ld_volatile_global(barrier_signal_ptrs[rank] + thread_id) : 0;
|
||||||
|
if (__all_sync(0xffffffff, value <= 0))
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (clock64() - start_time > NUM_TIMEOUT_CYCLES and get_lane_id() == 0) {
|
||||||
|
printf("DeepEP timeout check failed: rank = %d, thread = %d)\n", rank, thread_id);
|
||||||
|
trap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ int atomic_cas_cta_acquire(int* addr, int x, int y) {
|
||||||
|
int ret;
|
||||||
|
asm volatile("atom.acquire.cta.shared::cta.cas.b32 %0, [%1], %2, %3;" : "=r"(ret) : "l"(addr), "r"(x), "r"(y) : "memory");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ int atomic_exch_cta_release(int* addr, int x) {
|
||||||
|
int ret;
|
||||||
|
asm volatile("atom.release.cta.shared::cta.exch.b32 %0, [%1], %2;" : "=r"(ret) : "l"(addr), "r"(x) : "memory");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ void acquire_lock(int* mutex) {
|
||||||
|
// To make later memory operations valid, we must use `acquire` for memory semantics
|
||||||
|
while (atomic_cas_cta_acquire(mutex, 0, 1) != 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ void release_lock(int* mutex) {
|
||||||
|
// To make previous memory operations visible to other threads, we must use `release` for memory semantics
|
||||||
|
atomic_exch_cta_release(mutex, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace deep_ep
|
||||||
|
|
@@ -0,0 +1,7 @@
import torch

from .utils import EventOverlap
from .buffer import Buffer

# noinspection PyUnresolvedReferences
from deep_ep_cpp import Config

@ -0,0 +1,617 @@
|
||||||
|
import os
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
from typing import Callable, List, Tuple, Optional, Union
|
||||||
|
|
||||||
|
# noinspection PyUnresolvedReferences
|
||||||
|
import deep_ep_cpp
|
||||||
|
# noinspection PyUnresolvedReferences
|
||||||
|
from deep_ep_cpp import Config, EventHandle
|
||||||
|
from .utils import EventOverlap, check_nvlink_connections
|
||||||
|
|
||||||
|
|
||||||
|
class Buffer:
|
||||||
|
"""
|
||||||
|
The core expert-parallel (EP) communication buffers for Mixture of Experts (MoE) model, which supports:
|
||||||
|
- high-throughput intranode all-to-all (dispatch and combine, using NVLink)
|
||||||
|
- high-throughput internode all-to-all (dispatch and combine, using RDMA and NVLink)
|
||||||
|
- low-latency all-to-all (dispatch and combine, using RDMA)
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
num_sms: the SMs used in high-throughput kernels.
|
||||||
|
rank: the local rank number.
|
||||||
|
group_size: the number of ranks in the group.
|
||||||
|
group: the communication group.
|
||||||
|
num_nvl_bytes: the buffer size for intranode NVLink communication.
|
||||||
|
num_rdma_bytes: the buffer size for internode (also for intranode with low-latency mode) RDMA communication.
|
||||||
|
runtime: the C++ runtime.
|
||||||
|
"""
|
||||||
|
|
||||||
|
num_sms: int = 20
|
||||||
|
|
||||||
|
def __init__(self, group: dist.ProcessGroup,
|
||||||
|
num_nvl_bytes: int = 0, num_rdma_bytes: int = 0,
|
||||||
|
low_latency_mode: bool = False, num_qps_per_rank: int = 24,
|
||||||
|
allow_nvlink_for_low_latency_mode: bool = True,
|
||||||
|
allow_mnnvl: bool = False) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the communication buffer.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
group: the communication group.
|
||||||
|
num_nvl_bytes: the buffer size for intranode NVLink communication.
|
||||||
|
num_rdma_bytes: the buffer size for internode (also for intranode with low-latency mode) RDMA communication.
|
||||||
|
low_latency_mode: whether to enable low-latency mode.
|
||||||
|
num_qps_per_rank: the number of QPs for RDMA, the low-latency mode requires that this number equals
|
||||||
|
to the number of local experts.
|
||||||
|
allow_nvlink_for_low_latency_mode: whether allow NVLink traffic for low-latency mode, you should notice
|
||||||
|
this is somehow incompatible with the hook-based overlapping.
|
||||||
|
Warning: PCIe connections may lead to errors due to memory ordering issues,
|
||||||
|
please make sure all connections are via NVLink.
|
||||||
|
allow_mnnvl: whether to allow MNNVL
|
||||||
|
"""
|
||||||
|
check_nvlink_connections(group)
|
||||||
|
|
||||||
|
# Initialize the CPP runtime
|
||||||
|
self.rank = group.rank()
|
||||||
|
self.group_size = group.size()
|
||||||
|
self.group = group
|
||||||
|
self.num_nvl_bytes = num_nvl_bytes
|
||||||
|
self.num_rdma_bytes = num_rdma_bytes
|
||||||
|
self.low_latency_mode = low_latency_mode
|
||||||
|
self.runtime = deep_ep_cpp.Buffer(self.rank, self.group_size, num_nvl_bytes, num_rdma_bytes, low_latency_mode)
|
||||||
|
|
||||||
|
# Synchronize device IDs
|
||||||
|
device_ids = [None, ] * self.group_size
|
||||||
|
local_device_id = self.runtime.get_local_device_id()
|
||||||
|
dist.all_gather_object(device_ids, local_device_id, group)
|
||||||
|
|
||||||
|
# Synchronize IPC handles
|
||||||
|
ipc_handles = [None, ] * self.group_size
|
||||||
|
local_ipc_handle = self.runtime.get_local_ipc_handle()
|
||||||
|
dist.all_gather_object(ipc_handles, local_ipc_handle, group)
|
||||||
|
|
||||||
|
# Synchronize NVSHMEM unique IDs
|
||||||
|
root_unique_id = None
|
||||||
|
if self.runtime.get_num_rdma_ranks() > 1 or low_latency_mode:
|
||||||
|
# Enable IBGDA
|
||||||
|
assert num_qps_per_rank > 0
|
||||||
|
os.environ['NVSHMEM_DISABLE_P2P'] = '0' if allow_nvlink_for_low_latency_mode else '1'
|
||||||
|
os.environ['NVSHMEM_IB_ENABLE_IBGDA'] = '1'
|
||||||
|
os.environ['NVSHMEM_IBGDA_NIC_HANDLER'] = 'gpu'
|
||||||
|
os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}'
|
||||||
|
# Make sure QP depth is always larger than the number of on-flight WRs, so that we can skip WQ slot check
|
||||||
|
os.environ['NVSHMEM_QP_DEPTH'] = '1024'
|
||||||
|
|
||||||
|
# Reduce gpu memory usage
|
||||||
|
# 6 default teams + 1 extra team
|
||||||
|
os.environ['NVSHMEM_MAX_TEAMS'] = '7'
|
||||||
|
# Disable NVLink SHArP
|
||||||
|
os.environ['NVSHMEM_DISABLE_NVLS'] = '1'
|
||||||
|
# NOTES: NVSHMEM initialization requires at least 256 MiB
|
||||||
|
os.environ['NVSHMEM_CUMEM_GRANULARITY'] = f'{2 ** 29}'
|
||||||
|
|
||||||
|
if not allow_mnnvl:
|
||||||
|
# Disable multi-node NVLink detection
|
||||||
|
os.environ['NVSHMEM_DISABLE_MNNVL'] = '1'
|
||||||
|
|
||||||
|
# Synchronize using the root ID
|
||||||
|
nvshmem_unique_ids = [None, ] * self.group_size
|
||||||
|
if (low_latency_mode and self.rank == 0) or (not low_latency_mode and self.runtime.get_rdma_rank() == 0):
|
||||||
|
root_unique_id = self.runtime.get_local_nvshmem_unique_id()
|
||||||
|
dist.all_gather_object(nvshmem_unique_ids, root_unique_id, group)
|
||||||
|
root_unique_id = nvshmem_unique_ids[0 if low_latency_mode else self.runtime.get_root_rdma_rank(True)]
|
||||||
|
|
||||||
|
# Make CPP runtime available
|
||||||
|
self.runtime.sync(device_ids, ipc_handles, root_unique_id)
|
||||||
|
assert self.runtime.is_available()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def is_sm90_compiled():
|
||||||
|
return deep_ep_cpp.is_sm90_compiled()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def set_num_sms(new_num_sms: int) -> None:
|
||||||
|
"""
|
||||||
|
Set the number of SMs to use in high-throughput kernels.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
new_num_sms: the new number to be set.
|
||||||
|
"""
|
||||||
|
|
||||||
|
assert new_num_sms % 2 == 0, 'The SM count must be even'
|
||||||
|
Buffer.num_sms = new_num_sms
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def capture() -> EventOverlap:
|
||||||
|
"""
|
||||||
|
Capture a CUDA event on the current stream, i.e. `torch.cuda.current_stream()`.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
event: the captured event.
|
||||||
|
"""
|
||||||
|
return EventOverlap(EventHandle())
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank: int, hidden: int, num_ranks: int, num_experts: int) -> int:
|
||||||
|
"""
|
||||||
|
Get a minimum size requirement for the RDMA buffer. The size calculation will be done with BF16.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
|
||||||
|
hidden: the hidden dimension of each token.
|
||||||
|
num_ranks: the number of EP group ranks.
|
||||||
|
num_experts: the number of all experts.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
size: the RDMA buffer size recommended.
|
||||||
|
"""
|
||||||
|
return deep_ep_cpp.get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts)
|
||||||
|
|
||||||
|
def get_comm_stream(self) -> torch.Stream:
|
||||||
|
"""
|
||||||
|
Get the communication stream.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
stream: the communication stream.
|
||||||
|
"""
|
||||||
|
ts: torch.Stream = self.runtime.get_comm_stream()
|
||||||
|
return torch.cuda.Stream(stream_id=ts.stream_id, device_index=ts.device_index, device_type=ts.device_type)
|
||||||
|
|
||||||
|
def get_local_buffer_tensor(self, dtype: torch.dtype, size: Optional[torch.Size] = None,
|
||||||
|
offset: int = 0, use_rdma_buffer: bool = False) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Get the raw buffer (slice supported) as a PyTorch tensor.
|
||||||
|
|
||||||
|
Argument:
|
||||||
|
dtype: the data type (PyTorch `dtype`) for the tensor.
|
||||||
|
size: the slice size (by elements) to get from the buffer.
|
||||||
|
offset: the offset of the beginning element.
|
||||||
|
use_rdma_buffer: whether to return the RDMA buffer.
|
||||||
|
"""
|
||||||
|
tensor = self.runtime.get_local_buffer_tensor(dtype, offset, use_rdma_buffer)
|
||||||
|
if size is None:
|
||||||
|
return tensor
|
||||||
|
|
||||||
|
assert tensor.numel() >= size.numel()
|
||||||
|
return tensor[:size.numel()].view(size)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _unpack_bias(bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]):
|
||||||
|
bias_0, bias_1 = None, None
|
||||||
|
if isinstance(bias, torch.Tensor):
|
||||||
|
bias_0 = bias
|
||||||
|
elif isinstance(bias, tuple):
|
||||||
|
assert len(bias) == 2
|
||||||
|
bias_0, bias_1 = bias
|
||||||
|
return bias_0, bias_1
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_dispatch_config(num_ranks: int) -> Config:
|
||||||
|
"""
|
||||||
|
Get a recommended dispatch config.
|
||||||
|
|
||||||
|
Argument:
|
||||||
|
num_ranks: the number of ranks.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
config: the recommended config.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# TODO: automatically tune
|
||||||
|
config_map = {
|
||||||
|
2: Config(Buffer.num_sms, 24, 256, 6, 128),
|
||||||
|
4: Config(Buffer.num_sms, 6, 256, 6, 128),
|
||||||
|
8: Config(Buffer.num_sms, 6, 256, 6, 128),
|
||||||
|
16: Config(Buffer.num_sms, 16, 288, 20, 128),
|
||||||
|
24: Config(Buffer.num_sms, 8, 288, 32, 128),
|
||||||
|
32: Config(Buffer.num_sms, 8, 288, 32, 128),
|
||||||
|
64: Config(Buffer.num_sms, 20, 288, 28, 128),
|
||||||
|
128: Config(Buffer.num_sms, 20, 560, 32, 128),
|
||||||
|
144: Config(Buffer.num_sms, 32, 720, 12, 128),
|
||||||
|
160: Config(Buffer.num_sms, 28, 720, 12, 128),
|
||||||
|
}
|
||||||
|
assert num_ranks in config_map, f'Unsupported number of EP ranks: {num_ranks}'
|
||||||
|
return config_map[num_ranks]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_combine_config(num_ranks: int) -> Config:
|
||||||
|
"""
|
||||||
|
Get a recommended combine config.
|
||||||
|
|
||||||
|
Argument:
|
||||||
|
num_ranks: the number of ranks.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
config: the recommended config.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# TODO: automatically tune
|
||||||
|
config_map = {
|
||||||
|
2: Config(Buffer.num_sms, 10, 256, 6, 128),
|
||||||
|
4: Config(Buffer.num_sms, 9, 256, 6, 128),
|
||||||
|
8: Config(Buffer.num_sms, 4, 256, 6, 128),
|
||||||
|
16: Config(Buffer.num_sms, 2, 288, 28, 128),
|
||||||
|
24: Config(Buffer.num_sms, 1, 288, 20, 128),
|
||||||
|
32: Config(Buffer.num_sms, 1, 288, 20, 128),
|
||||||
|
64: Config(Buffer.num_sms, 1, 288, 20, 128),
|
||||||
|
128: Config(Buffer.num_sms, 1, 560, 12, 128),
|
||||||
|
144: Config(Buffer.num_sms, 2, 720, 8, 128),
|
||||||
|
160: Config(Buffer.num_sms, 2, 720, 8, 128),
|
||||||
|
}
|
||||||
|
assert num_ranks in config_map, f'Unsupported number of EP ranks: {num_ranks}'
|
||||||
|
return config_map[num_ranks]
|
||||||
|
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
def get_dispatch_layout(self, topk_idx: torch.Tensor, num_experts: int,
|
||||||
|
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
|
||||||
|
allocate_on_comm_stream: bool = False) -> \
|
||||||
|
Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, EventOverlap]:
|
||||||
|
"""
|
||||||
|
Calculate the layout required for later communication.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
topk_idx: `[num_tokens, num_topk]`, dtype must be `torch.int64`, the expert indices selected by each token,
|
||||||
|
`-1` means no selections.
|
||||||
|
num_experts: the number of experts.
|
||||||
|
previous_event: the event to wait before actually executing the kernel.
|
||||||
|
async_finish: the current stream will not wait for the communication kernels to be finished if set.
|
||||||
|
allocate_on_comm_stream: control whether all the allocated tensors' ownership to be on the communication stream.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
num_tokens_per_rank: `[num_ranks]` with `torch.int`, the number of tokens to be sent to each rank.
|
||||||
|
num_tokens_per_rdma_rank: `[num_rdma_ranks]` with `torch.int`, the number of tokens to be sent to each RDMA
|
||||||
|
rank (with the same GPU index), return `None` for intranode settings.
|
||||||
|
num_tokens_per_expert: `[num_experts]` with `torch.int`, the number of tokens to be sent to each expert.
|
||||||
|
is_token_in_rank: `[num_tokens, num_ranks]` with `torch.bool`, whether a token be sent to a rank.
|
||||||
|
event: the event after executing the kernel (valid only if `async_finish` is set).
|
||||||
|
"""
|
||||||
|
num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, event = \
|
||||||
|
self.runtime.get_dispatch_layout(topk_idx, num_experts, getattr(previous_event, 'event', None),
|
||||||
|
async_finish, allocate_on_comm_stream)
|
||||||
|
return num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, EventOverlap(event)
|
||||||
|
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
def dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
|
||||||
|
handle: Optional[Tuple] = None,
|
||||||
|
num_tokens_per_rank: Optional[torch.Tensor] = None, num_tokens_per_rdma_rank: Optional[torch.Tensor] = None,
|
||||||
|
is_token_in_rank: Optional[torch.Tensor] = None, num_tokens_per_expert: Optional[torch.Tensor] = None,
|
||||||
|
topk_idx: Optional[torch.Tensor] = None, topk_weights: Optional[torch.Tensor] = None,
|
||||||
|
expert_alignment: int = 1, num_worst_tokens: int = 0,
|
||||||
|
config: Optional[Config] = None,
|
||||||
|
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
|
||||||
|
allocate_on_comm_stream: bool = False) -> \
|
||||||
|
Tuple[Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], Optional[torch.Tensor],
|
||||||
|
Optional[torch.Tensor], List[int], Tuple, EventOverlap]:
|
||||||
|
"""
|
||||||
|
Dispatch tokens to different ranks, both intranode and internode settings are supported.
|
||||||
|
Intranode kernels require all the ranks should be visible via NVLink.
|
||||||
|
Internode kernels require the ranks in a node should be visible via NVLink, while the ranks with the same GPU
|
||||||
|
index should be visible via RDMA.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
x: `torch.Tensor` or tuple of `torch.Tensor`, for the first type, the shape must be `[num_tokens, hidden]`,
|
||||||
|
and type must be `torch.bfloat16`; for the second type, the first element of the tuple must be shaped as
|
||||||
|
`[num_tokens, hidden]` with type `torch.float8_e4m3fn`, the second must be `[num_tokens, hidden // 128]`
|
||||||
|
(requiring divisible) with type `torch.float`.
|
||||||
|
handle: an optional communication handle, if set, the CPU will reuse the layout information to save some time.
|
||||||
|
num_tokens_per_rank: `[num_ranks]` with `torch.int`, the number of tokens to be sent to each rank.
|
||||||
|
num_tokens_per_rdma_rank: `[num_rdma_ranks]` with `torch.int`, the number of tokens to be sent to each RDMA
|
||||||
|
rank (with the same GPU index), return `None` for intranode settings.
|
||||||
|
is_token_in_rank: `[num_tokens, num_ranks]` with `torch.bool`, whether a token be sent to a rank.
|
||||||
|
num_tokens_per_expert: `[num_experts]` with `torch.int`, the number of tokens to be sent to each expert.
|
||||||
|
topk_idx: `[num_tokens, num_topk]` with `torch.int64`, the expert indices selected by each token,
|
||||||
|
`-1` means no selections.
|
||||||
|
topk_weights: `[num_tokens, num_topk]` with `torch.float`, the expert weights of each token to dispatch.
|
||||||
|
expert_alignment: align the number of tokens received by each local expert to this variable.
|
||||||
|
num_worst_tokens: the worst number of tokens to receive, if specified, there will be no CPU sync, and it
|
||||||
|
will be CUDA-graph compatible. Please also notice that this flag is for intranode only.
|
||||||
|
config: the performance tuning config.
|
||||||
|
previous_event: the event to wait before actually executing the kernel.
|
||||||
|
async_finish: the current stream will not wait for the communication kernels to be finished if set.
|
||||||
|
allocate_on_comm_stream: control whether all the allocated tensors' ownership to be on the communication stream.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
recv_x: received tokens, the same type and tuple as the input `x`, but the number of tokens equals to the
|
||||||
|
received token count.
|
||||||
|
recv_topk_idx: received expert indices.
|
||||||
|
recv_topk_weights: received expert weights.
|
||||||
|
num_recv_tokens_per_expert_list: Python list shaped `[num_local_experts]`, the received token count by
|
||||||
|
each local expert, aligned to the input `expert_alignment`. If `num_worst_tokens` is specified, the list
|
||||||
|
will be empty.
|
||||||
|
handle: the returned communication handle.
|
||||||
|
event: the event after executing the kernel (valid only if `async_finish` is set).
|
||||||
|
"""
|
||||||
|
# Default config
|
||||||
|
config = self.get_dispatch_config(self.group_size) if config is None else config
|
||||||
|
|
||||||
|
# Internode
|
||||||
|
if self.runtime.get_num_rdma_ranks() > 1:
|
||||||
|
assert num_worst_tokens == 0, 'Internode dispatch does not support `num_worst_tokens > 0`'
|
||||||
|
return self.internode_dispatch(x, handle, num_tokens_per_rank, num_tokens_per_rdma_rank, is_token_in_rank, num_tokens_per_expert,
|
||||||
|
topk_idx, topk_weights, expert_alignment, config, previous_event, async_finish, allocate_on_comm_stream)
|
||||||
|
|
||||||
|
# Launch the kernel with cached or non-cached mode
|
||||||
|
x, x_scales = x if isinstance(x, tuple) else (x, None)
|
||||||
|
if handle is not None:
|
||||||
|
assert topk_idx is None and topk_weights is None
|
||||||
|
rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head = handle
|
||||||
|
num_recv_tokens = recv_src_idx.size(0)
|
||||||
|
recv_x, recv_x_scales, _, _, _, _, _, _, _, _, event = self.runtime.intranode_dispatch(
|
||||||
|
x, x_scales, None, None,
|
||||||
|
None, is_token_in_rank, None, num_recv_tokens, rank_prefix_matrix, channel_prefix_matrix,
|
||||||
|
expert_alignment, num_worst_tokens, config,
|
||||||
|
getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
|
||||||
|
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, None, None, None, None, EventOverlap(event)
|
||||||
|
else:
|
||||||
|
assert num_tokens_per_rank is not None and is_token_in_rank is not None and num_tokens_per_expert is not None
|
||||||
|
recv_x, recv_x_scales, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, send_head, event = \
|
||||||
|
self.runtime.intranode_dispatch(x, x_scales, topk_idx, topk_weights,
|
||||||
|
num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, 0, None, None,
|
||||||
|
expert_alignment, num_worst_tokens, config,
|
||||||
|
getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
|
||||||
|
handle = (rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head)
|
||||||
|
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, EventOverlap(event)
|
||||||
|
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
def combine(self, x: torch.Tensor, handle: Tuple,
|
||||||
|
topk_weights: Optional[torch.Tensor] = None,
|
||||||
|
bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||||
|
config: Optional[Config] = None,
|
||||||
|
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
|
||||||
|
allocate_on_comm_stream: bool = False) -> \
|
||||||
|
Tuple[torch.Tensor, Optional[torch.Tensor], EventOverlap]:
|
||||||
|
"""
|
||||||
|
Combine (reduce) tokens (addition **without** weights) from different ranks, both intranode and internode
|
||||||
|
settings are supported.
|
||||||
|
Intranode kernels require all the ranks should be visible via NVLink.
|
||||||
|
Internode kernels require the ranks in a node should be visible via NVLink, while the ranks with the same GPU
|
||||||
|
index should be visible via RDMA.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
x: `[num_tokens, hidden]` with `torch.bfloat16`, the tokens to send for reducing to its original ranks.
|
||||||
|
handle: a must-set communication handle, you can obtain this from the dispatch function.
|
||||||
|
topk_weights: `[num_tokens, num_topk]` with `torch.float`, the tokens' top-k weights for reducing to its original ranks.
|
||||||
|
config: the performance tuning config.
|
||||||
|
previous_event: the event to wait before actually executing the kernel.
|
||||||
|
async_finish: the current stream will not wait for the communication kernels to be finished if set.
|
||||||
|
allocate_on_comm_stream: control whether all the allocated tensors' ownership to be on the communication stream.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
recv_x: the reduced token from its dispatched ranks.
|
||||||
|
recv_topk_weights: the reduced top-k weights from its dispatch ranks.
|
||||||
|
event: the event after executing the kernel (valid only if `async_finish` is set).
|
||||||
|
"""
|
||||||
|
# Default config
|
||||||
|
config = self.get_combine_config(self.group_size) if config is None else config
|
||||||
|
|
||||||
|
# Internode
|
||||||
|
if self.runtime.get_num_rdma_ranks() > 1:
|
||||||
|
return self.internode_combine(x, handle, topk_weights, bias, config, previous_event, async_finish, allocate_on_comm_stream)
|
||||||
|
|
||||||
|
# NOTES: the second `_` is for the sending side, so we should use the third one
|
||||||
|
rank_prefix_matrix, _, channel_prefix_matrix, src_idx, is_recv_token_in_rank, send_head = handle
|
||||||
|
bias_0, bias_1 = Buffer._unpack_bias(bias)
|
||||||
|
|
||||||
|
# Launch the kernel
|
||||||
|
recv_x, recv_topk_weights, event = self.runtime.intranode_combine(
|
||||||
|
x, topk_weights, bias_0, bias_1,
|
||||||
|
src_idx, rank_prefix_matrix, channel_prefix_matrix, send_head, config,
|
||||||
|
getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
|
||||||
|
return recv_x, recv_topk_weights, EventOverlap(event)
|
||||||
|
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
def internode_dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
|
||||||
|
handle: Optional[Tuple] = None,
|
||||||
|
num_tokens_per_rank: Optional[torch.Tensor] = None, num_tokens_per_rdma_rank: Optional[torch.Tensor] = None,
|
||||||
|
is_token_in_rank: Optional[torch.Tensor] = None, num_tokens_per_expert: Optional[torch.Tensor] = None,
|
||||||
|
topk_idx: Optional[torch.Tensor] = None, topk_weights: Optional[torch.Tensor] = None, expert_alignment: int = 1,
|
||||||
|
config: Optional[Config] = None,
|
||||||
|
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
|
||||||
|
allocate_on_comm_stream: bool = False) -> \
|
||||||
|
Tuple[Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], Optional[torch.Tensor],
|
||||||
|
Optional[torch.Tensor], List[int], Tuple, EventOverlap]:
|
||||||
|
"""
|
||||||
|
Internode dispatch implementation, for more details, please refer to the `dispatch` docs.
|
||||||
|
Normally, you should not directly call this function.
|
||||||
|
"""
|
||||||
|
assert config is not None
|
||||||
|
|
||||||
|
# Launch the kernel with cached or non-cached mode
|
||||||
|
x, x_scales = x if isinstance(x, tuple) else (x, None)
|
||||||
|
if handle is not None:
|
||||||
|
assert topk_idx is None and topk_weights is None
|
||||||
|
is_token_in_rank, \
|
||||||
|
rdma_channel_prefix_matrix, gbl_channel_prefix_matrix, \
|
||||||
|
recv_rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, recv_gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum, \
|
||||||
|
recv_src_meta, send_rdma_head, send_nvl_head = handle
|
||||||
|
num_recv_tokens = recv_src_meta.size(0)
|
||||||
|
num_rdma_recv_tokens = send_nvl_head.size(0)
|
||||||
|
recv_x, recv_x_scales, _, _, _, _, _, _, _, _, _, _, _, _, event = self.runtime.internode_dispatch(
|
||||||
|
x, x_scales, topk_idx, topk_weights,
|
||||||
|
None, None, is_token_in_rank, None,
|
||||||
|
num_recv_tokens, num_rdma_recv_tokens,
|
||||||
|
rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum,
|
||||||
|
expert_alignment, config, getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
|
||||||
|
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, None, None, None, None, EventOverlap(event)
|
||||||
|
else:
|
||||||
|
assert num_tokens_per_rank is not None and is_token_in_rank is not None and num_tokens_per_expert is not None
|
||||||
|
recv_x, recv_x_scales, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, \
|
||||||
|
rdma_channel_prefix_matrix, gbl_channel_prefix_matrix, \
|
||||||
|
recv_rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, \
|
||||||
|
recv_gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum, \
|
||||||
|
recv_src_meta, send_rdma_head, send_nvl_head, event = self.runtime.internode_dispatch(
|
||||||
|
x, x_scales, topk_idx, topk_weights,
|
||||||
|
num_tokens_per_rank, num_tokens_per_rdma_rank, is_token_in_rank, num_tokens_per_expert,
|
||||||
|
0, 0, None, None, None, None,
|
||||||
|
expert_alignment, config, getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
|
||||||
|
handle = (is_token_in_rank,
|
||||||
|
rdma_channel_prefix_matrix, gbl_channel_prefix_matrix,
|
||||||
|
recv_rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, recv_gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum,
|
||||||
|
recv_src_meta, send_rdma_head, send_nvl_head)
|
||||||
|
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, EventOverlap(event)
|
||||||
|
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
def internode_combine(self, x: torch.Tensor, handle: Union[tuple, list],
|
||||||
|
topk_weights: Optional[torch.Tensor] = None,
|
||||||
|
bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||||
|
config: Optional[Config] = None,
|
||||||
|
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
|
||||||
|
allocate_on_comm_stream: bool = False) -> \
|
||||||
|
Tuple[torch.Tensor, Optional[torch.Tensor], EventOverlap]:
|
||||||
|
"""
|
||||||
|
Internode combine implementation, for more details, please refer to the `combine` docs.
|
||||||
|
Normally, you should not directly call this function.
|
||||||
|
"""
|
||||||
|
assert config is not None
|
||||||
|
|
||||||
|
# Unpack handle and bias
|
||||||
|
is_combined_token_in_rank, \
|
||||||
|
_, _, \
|
||||||
|
rdma_channel_prefix_matrix, rdma_rank_prefix_sum, gbl_channel_prefix_matrix, gbl_rank_prefix_sum, \
|
||||||
|
src_meta, send_rdma_head, send_nvl_head = handle
|
||||||
|
bias_0, bias_1 = Buffer._unpack_bias(bias)
|
||||||
|
|
||||||
|
# Launch the kernel
|
||||||
|
combined_x, combined_topk_weights, event = self.runtime.internode_combine(
|
||||||
|
x, topk_weights, bias_0, bias_1,
|
||||||
|
src_meta, is_combined_token_in_rank,
|
||||||
|
rdma_channel_prefix_matrix, rdma_rank_prefix_sum, gbl_channel_prefix_matrix,
|
||||||
|
send_rdma_head, send_nvl_head, config, getattr(previous_event, 'event', None),
|
||||||
|
async_finish, allocate_on_comm_stream)
|
||||||
|
return combined_x, combined_topk_weights, EventOverlap(event)
|
||||||
|
|
||||||
|
def clean_low_latency_buffer(self, num_max_dispatch_tokens_per_rank: int, hidden: int, num_experts: int) -> None:
|
||||||
|
"""
|
||||||
|
As low-latency kernels require part of the buffer to be zero-initialized, so it is vital to clean the buffer
|
||||||
|
if the buffer is dirty at some time.
|
||||||
|
For example, after running the normal dispatch/combine, you must run this function before executing any
|
||||||
|
low-latency kernel.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
|
||||||
|
hidden: the hidden dimension of each token.
|
||||||
|
num_experts: the number of all experts.
|
||||||
|
"""
|
||||||
|
self.runtime.clean_low_latency_buffer(num_max_dispatch_tokens_per_rank, hidden, num_experts)
|
||||||
|
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
|
||||||
|
num_max_dispatch_tokens_per_rank: int, num_experts: int,
|
||||||
|
cumulative_local_expert_recv_stats: Optional[torch.Tensor] = None,
|
||||||
|
use_fp8: bool = True, round_scale: bool = False, use_ue8m0: bool = False,
|
||||||
|
async_finish: bool = False, return_recv_hook: bool = False) -> \
|
||||||
|
Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor, Tuple, EventOverlap, Callable]:
|
||||||
|
"""
|
||||||
|
A low-latency implementation for dispatching with IBGDA.
|
||||||
|
This kernel requires all the ranks (no matter intranode or internode) should be visible via RDMA
|
||||||
|
(specifically, IBGDA must be enabled).
|
||||||
|
Warning: as there are only two buffers, and the returned tensors reuse the buffer, you cannot hold more than 2
|
||||||
|
low-latency kernels' result tensors at a single moment.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
x: `torch.Tensor` with `torch.bfloat16`, shaped as `[num_tokens, hidden]`, only several hidden shapes are
|
||||||
|
supported. The number of tokens to be dispatched must be less than `num_max_dispatch_tokens_per_rank`.
|
||||||
|
topk_idx: `torch.Tensor` with `torch.int64`, shaped as `[num_tokens, num_topk]`, only several top-k shapes
|
||||||
|
are supported. `-1` indices (not selecting any expert) are supported.
|
||||||
|
num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
|
||||||
|
num_experts: the number of all experts.
|
||||||
|
cumulative_local_expert_recv_stats: a cumulative expert count tensor for statistics, which should have shape
|
||||||
|
`[num_local_experts]` and be typed as `torch.int`. This is useful for online service EP load balance
|
||||||
|
monitoring.
|
||||||
|
use_fp8: whether to enable FP8 casting, with this, the received data will be a tuple of FP8 tensor and scaling factors.
|
||||||
|
round_scale: whether round the scaling factors into power of 2.
|
||||||
|
use_ue8m0: whether use UE8M0 as scaling factor format (available only with `round_scale=True`).
|
||||||
|
async_finish: the current stream will not wait for the communication kernels to be finished if set.
|
||||||
|
return_recv_hook: return a receiving hook if set. If set, the kernel will just do the RDMA request issues,
|
||||||
|
but **without actually receiving the data**. You must call the received hook to make sure the data's arrival.
|
||||||
|
If you do not set this flag, the kernel will ensure the data's arrival.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
recv_x: a tensor or tuple with received tokens for each expert.
|
||||||
|
With `use_fp8=True`: the first element is a `torch.Tensor` shaped as
|
||||||
|
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.float8_e4m3fn`.
|
||||||
|
The second tensor is the corresponding scales for the first element with shape
|
||||||
|
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden // 128]` with `torch.float`,
|
||||||
|
if `use_ue8m0=False`. With `use_ue8m0=True`, the second one is packed and shaped as
|
||||||
|
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden // 512]` with type `torch.int`.
|
||||||
|
Notice that, the last-two-dimension of the scaling tensors are in column-major for TMA compatibility.
|
||||||
|
With `use_fp8=False`, the result would be a tensor shaped as
|
||||||
|
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.bfloat16`.
|
||||||
|
Moreover, not all tokens are valid, only some of the `num_max_dispatch_tokens_per_rank * num_ranks` are,
|
||||||
|
as we do not synchronize CPU received count with GPU (also not incompatible with CUDA graph if synced).
|
||||||
|
recv_count: a tensor shaped `[num_local_experts]` with type `torch.int`, indicating how many tokens each
|
||||||
|
expert receives. As mentioned before, not all tokens are valid in `recv_x`.
|
||||||
|
handle: the communication handle to be used in the `low_latency_combine` function.
|
||||||
|
event: the event after executing the kernel (valid only if `async_finish` is set).
|
||||||
|
hook: the receiving hook function (valid only if `return_recv_hook` is set).
|
||||||
|
"""
|
||||||
|
packed_recv_x, packed_recv_x_scales, packed_recv_count, packed_recv_src_info, packed_recv_layout_range, event, hook = \
|
||||||
|
self.runtime.low_latency_dispatch(x, topk_idx,
|
||||||
|
cumulative_local_expert_recv_stats,
|
||||||
|
num_max_dispatch_tokens_per_rank, num_experts,
|
||||||
|
use_fp8, round_scale, use_ue8m0,
|
||||||
|
async_finish, return_recv_hook)
|
||||||
|
handle = (packed_recv_src_info, packed_recv_layout_range, num_max_dispatch_tokens_per_rank, x.size(1), num_experts)
|
||||||
|
tensors_to_record = (x, topk_idx,
|
||||||
|
packed_recv_x, packed_recv_x_scales, packed_recv_count,
|
||||||
|
packed_recv_src_info, packed_recv_layout_range,
|
||||||
|
cumulative_local_expert_recv_stats)
|
||||||
|
return (packed_recv_x, packed_recv_x_scales) if use_fp8 else packed_recv_x, packed_recv_count, handle, \
|
||||||
|
EventOverlap(event, tensors_to_record if async_finish else None), hook
|
||||||
|
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
def low_latency_combine(self, x: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor,
|
||||||
|
handle: tuple, zero_copy: bool = False, async_finish: bool = False,
|
||||||
|
return_recv_hook: bool = False, out: Optional[torch.Tensor] = None) -> \
|
||||||
|
Tuple[torch.Tensor, EventOverlap, Callable]:
|
||||||
|
"""
|
||||||
|
A low-latency implementation for combining tokens (reduce **with weights**) with IBGDA.
|
||||||
|
This kernel requires all the ranks (no matter intranode or internode) should be visible via RDMA
|
||||||
|
(specifically, IBGDA must be enabled).
|
||||||
|
Warning: as there are only two buffers, and the returned tensors reuse the buffer, you cannot hold more than 2
|
||||||
|
low-latency kernels' result tensors at a single moment.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
x: `[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.bfloat16`,
|
||||||
|
the local calculated tokens to be sent to this original rank and reduced.
|
||||||
|
topk_idx: `[num_combined_tokens, num_topk]` with `torch.int64`, the expert indices selected by the dispatched
|
||||||
|
tokens. `-1` indices (not selecting any expert) are supported. Note that, `num_combined_tokens` equals
|
||||||
|
to the number of dispatched tokens.
|
||||||
|
topk_weights: `[num_combined_tokens, num_topk]` with `torch.float`, the expert weights selected by the dispatched
|
||||||
|
tokens. The received tokens will be reduced with the weights in this tensor.
|
||||||
|
handle: the communication handle given by the `dispatch` function.
|
||||||
|
zero_copy: whether the tensor is already copied into the RDMA buffer, should be cooperative
|
||||||
|
with `get_next_low_latency_combine_buffer`.
|
||||||
|
async_finish: the current stream will not wait for the communication kernels to be finished if set.
|
||||||
|
return_recv_hook: return a receiving hook if set. If set, the kernel will just do the RDMA request issues,
|
||||||
|
but **without actually receiving the data**. You must call the received hook to make sure the data's arrival.
|
||||||
|
If you do not set this flag, the kernel will ensure the data's arrival.
|
||||||
|
out: the in-place output tensor, if set, the kernel will write the result to this tensor and return it directly.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
combined_x: the reduced token tensor, with shape `[num_combined_tokens, hidden]` and type `torch.bfloat16`.
|
||||||
|
event: the event after executing the kernel (valid only if `async_finish` is set).
|
||||||
|
hook: the receiving hook function (valid only if `return_recv_hook` is set).
|
||||||
|
"""
|
||||||
|
src_info, layout_range, num_max_dispatch_tokens_per_rank, hidden, num_experts = handle
|
||||||
|
combined_x, event, hook = self.runtime.low_latency_combine(x, topk_idx, topk_weights, src_info, layout_range,
|
||||||
|
num_max_dispatch_tokens_per_rank, num_experts,
|
||||||
|
zero_copy, async_finish, return_recv_hook, out)
|
||||||
|
tensors_to_record = (x, topk_idx, topk_weights, src_info, layout_range, combined_x)
|
||||||
|
return combined_x, EventOverlap(event, tensors_to_record if async_finish else None), hook
|
||||||
|
|
||||||
|
def get_next_low_latency_combine_buffer(self, handle: object):
|
||||||
|
"""
|
||||||
|
Get the raw registered RDMA buffer tensor for next low-latency combine, so that the next combine kernel can skip the copying.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
handle: the communication handle given by the `dispatch` function.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
buffer: the raw RDMA low-latency buffer as a BF16 PyTorch tensor with shape
|
||||||
|
`[num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, hidden]`, you should fill this buffer
|
||||||
|
by yourself.
|
||||||
|
"""
|
||||||
|
src_info, layout_range, num_max_dispatch_tokens_per_rank, hidden, num_experts = handle
|
||||||
|
return self.runtime.get_next_low_latency_combine_buffer(num_max_dispatch_tokens_per_rank, hidden, num_experts)
|
||||||
|
|
@ -0,0 +1,101 @@
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
from typing import Any, Optional, Tuple
|
||||||
|
|
||||||
|
# noinspection PyUnresolvedReferences
|
||||||
|
from deep_ep_cpp import Config, EventHandle
|
||||||
|
|
||||||
|
|
||||||
|
class EventOverlap:
|
||||||
|
"""
|
||||||
|
A wrapper class to manage CUDA events, also for better overlapping convenience.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
event: the CUDA event captured.
|
||||||
|
extra_tensors: an easier way to simulate PyTorch tensor `record_stream`, may be useful with CUDA graph.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, event: Optional[EventHandle] = None,
|
||||||
|
extra_tensors: Optional[Tuple[torch.Tensor]] = None) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the class.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
event: the CUDA event captured.
|
||||||
|
extra_tensors: an easier way to simulate PyTorch tensor `record_stream`, may be useful with CUDA graph.
|
||||||
|
"""
|
||||||
|
self.event = event
|
||||||
|
|
||||||
|
# NOTES: we use extra tensors to achieve stream recording, otherwise,
|
||||||
|
# stream recording will be incompatible with CUDA graph.
|
||||||
|
self.extra_tensors = extra_tensors
|
||||||
|
|
||||||
|
def current_stream_wait(self) -> None:
|
||||||
|
"""
|
||||||
|
The current stream `torch.cuda.current_stream()` waits for the event to be finished.
|
||||||
|
"""
|
||||||
|
assert self.event is not None
|
||||||
|
self.event.current_stream_wait()
|
||||||
|
|
||||||
|
def __enter__(self) -> Any:
|
||||||
|
"""
|
||||||
|
Utility for overlapping and Python `with` syntax.
|
||||||
|
|
||||||
|
You can overlap the kernels on the current stream with the following example:
|
||||||
|
```python
|
||||||
|
event_overlap = event_after_all_to_all_kernels()
|
||||||
|
with event_overlap():
|
||||||
|
do_something_on_current_stream()
|
||||||
|
# After exiting the `with` scope, the current stream with wait the event to be finished.
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
|
||||||
|
"""
|
||||||
|
Utility for overlapping and Python `with` syntax.
|
||||||
|
|
||||||
|
Please follow the example in the `__enter__` function.
|
||||||
|
"""
|
||||||
|
if self.event is not None:
|
||||||
|
self.event.current_stream_wait()
|
||||||
|
|
||||||
|
|
||||||
|
def check_nvlink_connections(group: dist.ProcessGroup):
|
||||||
|
"""
|
||||||
|
Check NVLink connection between every pair of GPUs.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
group: the communication group.
|
||||||
|
"""
|
||||||
|
# Check NVLink connection
|
||||||
|
# NOTES: some A100 PCIE GPUs only have pairwise NVLink connection, so that we can only use EP2
|
||||||
|
# TODO: check all cases, all local-node GPUs in the group should be connected via NVLink
|
||||||
|
if 'PCIE' in torch.cuda.get_device_name():
|
||||||
|
assert group.size() <= 2, 'PCIe GPUs only have pairwise NVLink connections'
|
||||||
|
|
||||||
|
# noinspection PyUnresolvedReferences
|
||||||
|
import pynvml
|
||||||
|
pynvml.nvmlInit()
|
||||||
|
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
devices = os.environ.get('CUDA_VISIBLE_DEVICES', '0,1,2,3,4,5,6,7').strip(',').split(',')
|
||||||
|
physical_device_idx = int(devices[torch.cuda.current_device()])
|
||||||
|
physical_device_indices = [0, ] * group.size()
|
||||||
|
dist.all_gather_object(physical_device_indices, physical_device_idx, group)
|
||||||
|
|
||||||
|
# Check whether they are all connected via NVLink
|
||||||
|
# Reference: https://github.com/vllm-project/vllm/blob/b8e809a057765c574726a6077fd124db5077ce1f/vllm/platforms/cuda.py#L438
|
||||||
|
handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_indices]
|
||||||
|
for i, handle in enumerate(handles):
|
||||||
|
for j, peer_handle in enumerate(handles):
|
||||||
|
if i >= j:
|
||||||
|
continue
|
||||||
|
status = pynvml.nvmlDeviceGetP2PStatus(handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
|
||||||
|
assert status == pynvml.NVML_P2P_STATUS_OK,\
|
||||||
|
f'GPU {physical_device_indices[i]} and GPU {physical_device_indices[j]} are not connected via NVLink'
|
||||||
|
|
||||||
|
# Close NVML
|
||||||
|
pynvml.nvmlShutdown()
|
||||||
|
|
@ -0,0 +1,12 @@
|
||||||
|
# Change current directory into project root
|
||||||
|
original_dir=$(pwd)
|
||||||
|
script_dir=$(dirname "$0")
|
||||||
|
cd "$script_dir"
|
||||||
|
|
||||||
|
# Remove old dist file, build, and install
|
||||||
|
rm -rf dist
|
||||||
|
python setup.py bdist_wheel
|
||||||
|
pip install dist/*.whl
|
||||||
|
|
||||||
|
# Open users' original directory
|
||||||
|
cd "$original_dir"
|
||||||
|
|
@ -0,0 +1,107 @@
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import setuptools
|
||||||
|
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
nvshmem_dir = os.getenv('NVSHMEM_DIR', None)
|
||||||
|
disable_nvshmem = nvshmem_dir is None
|
||||||
|
if disable_nvshmem:
|
||||||
|
print('Warning: `NVSHMEM_DIR` is not specified, all internode and low-latency features are disabled\n')
|
||||||
|
else:
|
||||||
|
assert os.path.exists(nvshmem_dir), f'Failed to find NVSHMEM: {nvshmem_dir}'
|
||||||
|
|
||||||
|
cxx_flags = ['-O3', '-Wno-deprecated-declarations', '-Wno-unused-variable',
|
||||||
|
'-Wno-sign-compare', '-Wno-reorder', '-Wno-attributes']
|
||||||
|
nvcc_flags = ['-O3', '-Xcompiler', '-O3']
|
||||||
|
sources = ['csrc/deep_ep.cpp', 'csrc/kernels/runtime.cu', 'csrc/kernels/layout.cu', 'csrc/kernels/intranode.cu']
|
||||||
|
include_dirs = ['csrc/']
|
||||||
|
library_dirs = []
|
||||||
|
nvcc_dlink = []
|
||||||
|
extra_link_args = []
|
||||||
|
|
||||||
|
# NVSHMEM flags
|
||||||
|
if disable_nvshmem:
|
||||||
|
cxx_flags.append('-DDISABLE_NVSHMEM')
|
||||||
|
nvcc_flags.append('-DDISABLE_NVSHMEM')
|
||||||
|
else:
|
||||||
|
sources.extend(['csrc/kernels/internode.cu', 'csrc/kernels/internode_ll.cu'])
|
||||||
|
include_dirs.extend([f'{nvshmem_dir}/include'])
|
||||||
|
library_dirs.extend([f'{nvshmem_dir}/lib'])
|
||||||
|
nvcc_dlink.extend(['-dlink', f'-L{nvshmem_dir}/lib', '-lnvshmem'])
|
||||||
|
extra_link_args.extend(['-l:libnvshmem.a', '-l:nvshmem_bootstrap_uid.so', f'-Wl,-rpath,{nvshmem_dir}/lib'])
|
||||||
|
|
||||||
|
if int(os.getenv('DISABLE_SM90_FEATURES', 0)):
|
||||||
|
# Prefer A100
|
||||||
|
os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '8.0')
|
||||||
|
|
||||||
|
# Disable some SM90 features: FP8, launch methods, and TMA
|
||||||
|
cxx_flags.append('-DDISABLE_SM90_FEATURES')
|
||||||
|
nvcc_flags.append('-DDISABLE_SM90_FEATURES')
|
||||||
|
|
||||||
|
# Disable internode and low-latency kernels
|
||||||
|
assert disable_nvshmem
|
||||||
|
else:
|
||||||
|
# Prefer H800 series
|
||||||
|
os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '9.0')
|
||||||
|
|
||||||
|
# CUDA 12 flags
|
||||||
|
nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])
|
||||||
|
|
||||||
|
# Disable LD/ST tricks, as some CUDA version does not support `.L1::no_allocate`
|
||||||
|
if os.environ['TORCH_CUDA_ARCH_LIST'].strip() != '9.0':
|
||||||
|
assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1
|
||||||
|
os.environ['DISABLE_AGGRESSIVE_PTX_INSTRS'] = '1'
|
||||||
|
|
||||||
|
# Disable aggressive PTX instructions
|
||||||
|
if int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', '0')):
|
||||||
|
cxx_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
|
||||||
|
nvcc_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
|
||||||
|
|
||||||
|
# Put them together
|
||||||
|
extra_compile_args = {
|
||||||
|
'cxx': cxx_flags,
|
||||||
|
'nvcc': nvcc_flags,
|
||||||
|
}
|
||||||
|
if len(nvcc_dlink) > 0:
|
||||||
|
extra_compile_args['nvcc_dlink'] = nvcc_dlink
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
print(f'Build summary:')
|
||||||
|
print(f' > Sources: {sources}')
|
||||||
|
print(f' > Includes: {include_dirs}')
|
||||||
|
print(f' > Libraries: {library_dirs}')
|
||||||
|
print(f' > Compilation flags: {extra_compile_args}')
|
||||||
|
print(f' > Link flags: {extra_link_args}')
|
||||||
|
print(f' > Arch list: {os.environ["TORCH_CUDA_ARCH_LIST"]}')
|
||||||
|
print(f' > NVSHMEM path: {nvshmem_dir}')
|
||||||
|
print()
|
||||||
|
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
cmd = ['git', 'rev-parse', '--short', 'HEAD']
|
||||||
|
revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
|
||||||
|
except Exception as _:
|
||||||
|
revision = ''
|
||||||
|
|
||||||
|
setuptools.setup(
|
||||||
|
name='deep_ep',
|
||||||
|
version='1.1.0' + revision,
|
||||||
|
packages=setuptools.find_packages(
|
||||||
|
include=['deep_ep']
|
||||||
|
),
|
||||||
|
ext_modules=[
|
||||||
|
CUDAExtension(
|
||||||
|
name='deep_ep_cpp',
|
||||||
|
include_dirs=include_dirs,
|
||||||
|
library_dirs=library_dirs,
|
||||||
|
sources=sources,
|
||||||
|
extra_compile_args=extra_compile_args,
|
||||||
|
extra_link_args=extra_link_args
|
||||||
|
)
|
||||||
|
],
|
||||||
|
cmdclass={
|
||||||
|
'build_ext': BuildExtension
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
@ -0,0 +1,254 @@
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
|
||||||
|
# noinspection PyUnresolvedReferences
|
||||||
|
import deep_ep
|
||||||
|
from utils import init_dist, bench, calc_diff, create_grouped_scores, inplace_unique, per_token_cast_to_fp8, per_token_cast_back
|
||||||
|
|
||||||
|
# Test compatibility with low latency functions
|
||||||
|
import test_low_latency
|
||||||
|
|
||||||
|
|
||||||
|
def test_main(num_sms: int, local_rank: int, num_local_ranks: int, num_ranks: int, num_nodes: int, rank: int, buffer: deep_ep.Buffer, group: dist.ProcessGroup):
|
||||||
|
# Settings
|
||||||
|
num_tokens, hidden, num_topk_groups, num_topk, num_experts = 4096, 7168, min(num_nodes, 4), 8, (256 // num_ranks) * num_ranks
|
||||||
|
assert num_experts % num_ranks == 0 and num_local_ranks == 8
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[config] num_tokens={num_tokens}, hidden={hidden}, num_topk_groups={num_topk_groups}, num_topk={num_topk}', flush=True)
|
||||||
|
|
||||||
|
# Random data
|
||||||
|
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * rank
|
||||||
|
x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
|
||||||
|
x_e4m3 = per_token_cast_to_fp8(x)
|
||||||
|
x_e4m3 = (x_e4m3[0], x_e4m3[1].T.contiguous().T)
|
||||||
|
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
|
||||||
|
group_scores = scores.view(num_tokens, num_nodes, -1).amax(dim=-1)
|
||||||
|
group_idx = torch.topk(group_scores, k=num_topk_groups, dim=-1, sorted=False).indices
|
||||||
|
masked_scores = create_grouped_scores(scores, group_idx, num_nodes)
|
||||||
|
topk_idx = torch.topk(masked_scores, num_topk, dim=-1, largest=True, sorted=False)[1]
|
||||||
|
topk_weights = torch.ones((num_tokens, num_topk), dtype=torch.float32, device='cuda') * rank
|
||||||
|
topk_weights_pure_rand = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda')
|
||||||
|
rank_idx = topk_idx // (num_experts // num_ranks)
|
||||||
|
rank_idx.masked_fill_(topk_idx == -1, -1)
|
||||||
|
inplace_unique(rank_idx, num_ranks)
|
||||||
|
rdma_rank_idx = rank_idx // num_local_ranks
|
||||||
|
rdma_rank_idx.masked_fill_(rank_idx == -1, -1)
|
||||||
|
inplace_unique(rdma_rank_idx, num_nodes)
|
||||||
|
|
||||||
|
# RDMA dispatch counts
|
||||||
|
rdma_idx = topk_idx // (num_experts // num_nodes)
|
||||||
|
rdma_idx.masked_fill_(topk_idx == -1, -1)
|
||||||
|
inplace_unique(rdma_idx, num_nodes)
|
||||||
|
num_rdma_token_sent = rdma_idx.ne(-1).sum().item()
|
||||||
|
|
||||||
|
# Expert meta
|
||||||
|
num_tokens_per_expert = torch.zeros((num_experts, ), dtype=torch.int, device='cuda')
|
||||||
|
for i in range(num_experts):
|
||||||
|
num_tokens_per_expert[i] = (topk_idx == i).sum()
|
||||||
|
gbl_num_tokens_per_expert = num_tokens_per_expert.clone()
|
||||||
|
dist.all_reduce(gbl_num_tokens_per_expert, group=group)
|
||||||
|
|
||||||
|
# Rank layout meta
|
||||||
|
num_tokens_per_rank = torch.empty((num_ranks, ), dtype=torch.int, device='cuda')
|
||||||
|
num_tokens_per_rdma_rank = torch.empty((num_nodes, ), dtype=torch.int, device='cuda')
|
||||||
|
token_idx_in_rank = torch.full((num_ranks, num_tokens), -1, dtype=torch.long, device='cuda')
|
||||||
|
for i in range(num_ranks):
|
||||||
|
num_tokens_per_rank[i] = (rank_idx == i).sum()
|
||||||
|
token_sel = (rank_idx == i).max(dim=-1)[0]
|
||||||
|
count = token_sel.sum().item()
|
||||||
|
tokens = torch.sort(token_sel.to(torch.int), descending=True)[1]
|
||||||
|
tokens[:count] = torch.sort(tokens[:count])[0]
|
||||||
|
token_idx_in_rank[i][tokens[:count]] = torch.arange(count, dtype=torch.long, device='cuda')
|
||||||
|
for i in range(num_nodes):
|
||||||
|
num_tokens_per_rdma_rank[i] = (rdma_rank_idx == i).sum()
|
||||||
|
token_idx_in_rank = token_idx_in_rank.T.contiguous().to(torch.int)
|
||||||
|
is_token_in_rank = token_idx_in_rank >= 0
|
||||||
|
gbl_num_tokens_per_rank = num_tokens_per_rank.clone()
|
||||||
|
dist.all_reduce(gbl_num_tokens_per_rank, group=group)
|
||||||
|
|
||||||
|
ref_num_tokens_per_rank, ref_num_tokens_per_rdma_rank, ref_num_tokens_per_expert, ref_is_token_in_rank, _ = \
|
||||||
|
buffer.get_dispatch_layout(topk_idx, num_experts)
|
||||||
|
assert torch.allclose(ref_num_tokens_per_rank, num_tokens_per_rank)
|
||||||
|
assert torch.allclose(ref_num_tokens_per_rdma_rank, num_tokens_per_rdma_rank)
|
||||||
|
assert torch.allclose(ref_num_tokens_per_expert, num_tokens_per_expert)
|
||||||
|
assert torch.allclose(ref_is_token_in_rank, is_token_in_rank)
|
||||||
|
t = bench(lambda: buffer.get_dispatch_layout(topk_idx, num_experts))[0]
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[layout] Kernel performance: {t * 1000:.3f} ms', flush=True)
|
||||||
|
print('', flush=True)
|
||||||
|
group.barrier()
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
# Config
|
||||||
|
rdma_buffer_size, nvl_buffer_size = 128, (720 if num_ranks in (144, 160) else 512)
|
||||||
|
config = deep_ep.Config(num_sms, 8, nvl_buffer_size, 16, rdma_buffer_size)
|
||||||
|
|
||||||
|
# Test dispatch
|
||||||
|
# noinspection PyShadowingNames
|
||||||
|
def check_data(check_x, recv_gbl_rank_prefix_sum):
|
||||||
|
assert torch.allclose(check_x.amin(dim=1), check_x.amax(dim=1))
|
||||||
|
check_start = 0
|
||||||
|
for i in range(num_ranks):
|
||||||
|
check_end = recv_gbl_rank_prefix_sum[i].item()
|
||||||
|
assert (check_x[check_start:check_end, :].int() - i).sum().item() == 0
|
||||||
|
check_start = check_end
|
||||||
|
|
||||||
|
for previous_mode in (False, True):
|
||||||
|
for async_mode in (False, True):
|
||||||
|
for current_x in (x_pure_rand, x, x_e4m3):
|
||||||
|
for with_topk in (False, True):
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, {"with" if with_topk else "without"} top-k (async={async_mode}, previous={previous_mode}) ...', flush=True, end='')
|
||||||
|
dispatch_args = {'x': current_x, 'num_tokens_per_rank': num_tokens_per_rank, 'num_tokens_per_rdma_rank': num_tokens_per_rdma_rank, 'is_token_in_rank': is_token_in_rank,
|
||||||
|
'num_tokens_per_expert': num_tokens_per_expert, 'config': config, 'async_finish': async_mode}
|
||||||
|
if with_topk:
|
||||||
|
dispatch_args.update({'topk_idx': topk_idx, 'topk_weights': topk_weights_pure_rand if current_x is x_pure_rand else topk_weights})
|
||||||
|
if previous_mode:
|
||||||
|
dispatch_args.update({'previous_event': buffer.capture()})
|
||||||
|
recv_x, recv_topk_idx, recv_topk_weights, recv_num_tokens_per_expert_list, handle, event = buffer.dispatch(**dispatch_args)
|
||||||
|
event.current_stream_wait() if async_mode else ()
|
||||||
|
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
|
||||||
|
|
||||||
|
# Checks
|
||||||
|
recv_gbl_rank_prefix_sum = handle[-4]
|
||||||
|
assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(0), f'{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}'
|
||||||
|
assert gbl_num_tokens_per_expert.view(num_ranks, -1)[rank].tolist() == recv_num_tokens_per_expert_list
|
||||||
|
if current_x is not x_pure_rand:
|
||||||
|
check_data(recv_x, recv_gbl_rank_prefix_sum)
|
||||||
|
if with_topk:
|
||||||
|
# Check `topk_idx`
|
||||||
|
assert (recv_topk_idx.eq(-1) | ((recv_topk_idx >= 0) & (recv_topk_idx < (num_experts // num_ranks)))).sum().item() == recv_topk_idx.numel()
|
||||||
|
for i, count in enumerate(recv_num_tokens_per_expert_list):
|
||||||
|
assert recv_topk_idx.eq(i).sum().item() == count
|
||||||
|
|
||||||
|
# Check `topk_weights`
|
||||||
|
if current_x is not x_pure_rand:
|
||||||
|
recv_topk_weights[recv_topk_idx.eq(-1)] = recv_topk_weights.amax(dim=1, keepdim=True).expand_as(recv_topk_weights)[recv_topk_idx.eq(-1)]
|
||||||
|
check_data(recv_topk_weights, recv_gbl_rank_prefix_sum)
|
||||||
|
|
||||||
|
# Test cached dispatch (must without top-k staffs)
|
||||||
|
if not with_topk:
|
||||||
|
dispatch_args = {'x': current_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
|
||||||
|
if previous_mode:
|
||||||
|
dispatch_args.update({'previous_event': buffer.capture()})
|
||||||
|
recv_x, _, _, _, _, event = buffer.dispatch(**dispatch_args)
|
||||||
|
event.current_stream_wait() if async_mode else ()
|
||||||
|
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
|
||||||
|
if current_x is not x_pure_rand:
|
||||||
|
check_data(recv_x, recv_gbl_rank_prefix_sum)
|
||||||
|
|
||||||
|
# Test combine
|
||||||
|
bias_0 = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
|
||||||
|
bias_1 = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
|
||||||
|
combine_args = {'x': recv_x, 'bias': (bias_0, bias_1), 'handle': handle, 'config': config, 'async_finish': async_mode}
|
||||||
|
if with_topk:
|
||||||
|
combine_args.update({'topk_weights': recv_topk_weights})
|
||||||
|
if previous_mode:
|
||||||
|
combine_args.update({'previous_event': buffer.capture()})
|
||||||
|
combined_x, combined_topk_weights, event = buffer.combine(**combine_args)
|
||||||
|
event.current_stream_wait() if async_mode else ()
|
||||||
|
check_x = (combined_x.float() - bias_0.float() - bias_1.float()) / is_token_in_rank.sum(dim=1).unsqueeze(1)
|
||||||
|
ref_x = x_pure_rand if current_x is x_pure_rand else x
|
||||||
|
assert calc_diff(check_x, ref_x) < 5e-6
|
||||||
|
if with_topk:
|
||||||
|
check_topk_weights = combined_topk_weights if (current_x is x_pure_rand) else (combined_topk_weights / is_token_in_rank.sum(dim=1).unsqueeze(1))
|
||||||
|
ref_topk_weights = topk_weights_pure_rand if current_x is x_pure_rand else topk_weights
|
||||||
|
assert calc_diff(check_topk_weights, ref_topk_weights) < 1e-9
|
||||||
|
|
||||||
|
# For later tuning
|
||||||
|
dispatch_bf16_rdma_send_bytes = num_rdma_token_sent * hidden * 2
|
||||||
|
dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2
|
||||||
|
combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes
|
||||||
|
combine_bf16_rdma_recv_bytes = dispatch_bf16_rdma_send_bytes
|
||||||
|
|
||||||
|
if local_rank == 0:
|
||||||
|
print(' passed', flush=True)
|
||||||
|
if local_rank == 0:
|
||||||
|
print('', flush=True)
|
||||||
|
|
||||||
|
# Tune dispatch performance
|
||||||
|
best_dispatch_results = None
|
||||||
|
fp8_factor = (1 + 4 / 128) / 2
|
||||||
|
for current_x in (x_e4m3, x):
|
||||||
|
best_time, best_results = 1e10, None
|
||||||
|
rdma_send_bytes = (dispatch_bf16_rdma_send_bytes * fp8_factor) if isinstance(current_x, tuple) else dispatch_bf16_rdma_send_bytes
|
||||||
|
nvl_recv_bytes = (dispatch_bf16_nvl_recv_bytes * fp8_factor) if isinstance(current_x, tuple) else dispatch_bf16_nvl_recv_bytes
|
||||||
|
for nvl_chunk_size in range(4, 33, 4):
|
||||||
|
for rdma_chunk_size in range(4, 33, 4):
|
||||||
|
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size, rdma_chunk_size, rdma_buffer_size)
|
||||||
|
tune_args = {'x': current_x, 'handle': handle, 'config': config}
|
||||||
|
t = bench(lambda: buffer.dispatch(**tune_args))[0]
|
||||||
|
if t < best_time:
|
||||||
|
best_time, best_results = t, (num_sms, nvl_chunk_size, rdma_chunk_size)
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {rdma_send_bytes / 1e9 / t:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL) ', flush=True)
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {rdma_send_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL)', flush=True)
|
||||||
|
print('', flush=True)
|
||||||
|
|
||||||
|
if isinstance(current_x, tuple):
|
||||||
|
# Gather FP8 the best config from rank 0
|
||||||
|
best_dispatch_results = torch.tensor([best_results[0], best_results[1], best_results[2]], dtype=torch.int32, device='cuda')
|
||||||
|
all_best_fp8_results_list = [torch.zeros_like(best_dispatch_results) for _ in range(torch.distributed.get_world_size())]
|
||||||
|
dist.all_gather(all_best_fp8_results_list, best_dispatch_results, group=group)
|
||||||
|
best_dispatch_results = all_best_fp8_results_list[0].tolist()
|
||||||
|
dispatch_config = deep_ep.Config(best_dispatch_results[0], best_dispatch_results[1], nvl_buffer_size, best_dispatch_results[2], rdma_buffer_size)
|
||||||
|
|
||||||
|
dispatch_args = {'x': x, 'num_tokens_per_rank': num_tokens_per_rank, 'num_tokens_per_rdma_rank': num_tokens_per_rdma_rank,
|
||||||
|
'is_token_in_rank': is_token_in_rank, 'num_tokens_per_expert': num_tokens_per_expert,
|
||||||
|
'config': dispatch_config if dispatch_config is not None else config}
|
||||||
|
recv_x, _, _, _, handle, _ = buffer.dispatch(**dispatch_args)
|
||||||
|
|
||||||
|
# Tune combine performance
|
||||||
|
best_time, best_results = 1e10, None
|
||||||
|
for nvl_chunk_size in range(1, 5, 1):
|
||||||
|
for rdma_chunk_size in range(8, 33, 4):
|
||||||
|
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size, rdma_chunk_size, rdma_buffer_size)
|
||||||
|
tune_args = {'x': recv_x, 'handle': handle, 'config': config}
|
||||||
|
t = bench(lambda: buffer.combine(**tune_args))[0]
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {combine_bf16_rdma_recv_bytes / 1e9 / t:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL) ', flush=True)
|
||||||
|
if t < best_time:
|
||||||
|
best_time, best_results = t, (num_sms, nvl_chunk_size, rdma_chunk_size)
|
||||||
|
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {combine_bf16_rdma_recv_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL)', flush=True)
|
||||||
|
print('', flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
# noinspection PyUnboundLocalVariable
|
||||||
|
def test_loop(local_rank: int, num_local_ranks: int):
|
||||||
|
num_nodes = int(os.getenv('WORLD_SIZE', 1))
|
||||||
|
rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
|
||||||
|
test_ll_compatibility = os.getenv('EP_TEST_LL_COMPATIBILITY', False)
|
||||||
|
if test_ll_compatibility:
|
||||||
|
ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
|
||||||
|
|
||||||
|
num_sms = 24
|
||||||
|
num_qps_per_rank = max(num_sms, ll_num_experts // num_ranks if test_ll_compatibility else 0)
|
||||||
|
|
||||||
|
buffer = deep_ep.Buffer(group, int(1e9), int(1e9), low_latency_mode=test_ll_compatibility,
|
||||||
|
num_qps_per_rank=num_qps_per_rank)
|
||||||
|
assert num_local_ranks == 8 and num_ranks > 8
|
||||||
|
torch.manual_seed(rank)
|
||||||
|
|
||||||
|
for i in (num_sms, ):
|
||||||
|
test_main(i, local_rank, num_local_ranks, num_ranks, num_nodes, rank, buffer, group)
|
||||||
|
if local_rank == 0:
|
||||||
|
print('', flush=True)
|
||||||
|
|
||||||
|
# Test compatibility with low latency functions
|
||||||
|
if test_ll_compatibility:
|
||||||
|
buffer.clean_low_latency_buffer(ll_num_tokens, ll_hidden, ll_num_experts)
|
||||||
|
test_low_latency.test_main(ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk, rank, num_ranks, group, buffer, seed=1)
|
||||||
|
|
||||||
|
# Destroy the communication group
|
||||||
|
dist.barrier()
|
||||||
|
dist.destroy_process_group()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
num_processes = 8
|
||||||
|
torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)
|
||||||
|
|
@ -0,0 +1,256 @@
|
||||||
|
import time
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
|
||||||
|
# noinspection PyUnresolvedReferences
|
||||||
|
import deep_ep
|
||||||
|
from utils import init_dist, bench, calc_diff, inplace_unique, per_token_cast_to_fp8, per_token_cast_back
|
||||||
|
|
||||||
|
# Test compatibility with low latency functions
|
||||||
|
import test_low_latency
|
||||||
|
|
||||||
|
|
||||||
|
def test_main(num_sms: int, local_rank: int, num_ranks: int, rank: int, buffer: deep_ep.Buffer, group: dist.ProcessGroup):
|
||||||
|
# Settings
|
||||||
|
num_tokens, hidden, num_topk, num_experts = 4096, 7168, 8, (256 // num_ranks) * num_ranks
|
||||||
|
assert num_experts % num_ranks == 0
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[config] num_tokens={num_tokens}, hidden={hidden}, num_topk={num_topk}', flush=True)
|
||||||
|
|
||||||
|
# Random data
|
||||||
|
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * rank
|
||||||
|
x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
|
||||||
|
x_e4m3 = per_token_cast_to_fp8(x) if deep_ep.Buffer.is_sm90_compiled() else None
|
||||||
|
x_e4m3 = (x_e4m3[0], x_e4m3[1].T.contiguous().T) if x_e4m3 is not None else None
|
||||||
|
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
|
||||||
|
topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)[1]
|
||||||
|
topk_weights = torch.ones((num_tokens, num_topk), dtype=torch.float32, device='cuda') * rank
|
||||||
|
topk_weights_pure_rand = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda')
|
||||||
|
rank_idx = topk_idx // (num_experts // num_ranks)
|
||||||
|
rank_idx.masked_fill_(topk_idx == -1, -1)
|
||||||
|
inplace_unique(rank_idx, num_ranks)
|
||||||
|
|
||||||
|
# Expert meta
|
||||||
|
num_tokens_per_expert = torch.zeros((num_experts, ), dtype=torch.int, device='cuda')
|
||||||
|
for i in range(num_experts):
|
||||||
|
num_tokens_per_expert[i] = (topk_idx == i).sum()
|
||||||
|
gbl_num_tokens_per_expert = num_tokens_per_expert.clone()
|
||||||
|
dist.all_reduce(gbl_num_tokens_per_expert, group=group)
|
||||||
|
|
||||||
|
# Rank layout meta
|
||||||
|
num_tokens_per_rank = torch.empty((num_ranks, ), dtype=torch.int, device='cuda')
|
||||||
|
token_idx_in_rank = torch.full((num_ranks, num_tokens), -1, dtype=torch.long, device='cuda')
|
||||||
|
for i in range(num_ranks):
|
||||||
|
num_tokens_per_rank[i] = (rank_idx == i).sum()
|
||||||
|
token_sel = (rank_idx == i).max(dim=-1)[0]
|
||||||
|
count = token_sel.sum().item()
|
||||||
|
tokens = torch.sort(token_sel.to(torch.int), descending=True)[1]
|
||||||
|
tokens[:count] = torch.sort(tokens[:count])[0]
|
||||||
|
token_idx_in_rank[i][tokens[:count]] = torch.arange(count, dtype=torch.long, device='cuda')
|
||||||
|
token_idx_in_rank = token_idx_in_rank.T.contiguous().to(torch.int)
|
||||||
|
is_token_in_rank = token_idx_in_rank >= 0
|
||||||
|
gbl_num_tokens_per_rank = num_tokens_per_rank.clone()
|
||||||
|
dist.all_reduce(gbl_num_tokens_per_rank, group=group)
|
||||||
|
|
||||||
|
ref_num_tokens_per_rank, _, ref_num_tokens_per_expert, ref_is_token_in_rank, _ = \
|
||||||
|
buffer.get_dispatch_layout(topk_idx, num_experts)
|
||||||
|
assert torch.allclose(ref_num_tokens_per_rank, num_tokens_per_rank)
|
||||||
|
assert torch.allclose(ref_num_tokens_per_expert, num_tokens_per_expert)
|
||||||
|
assert torch.allclose(ref_is_token_in_rank, is_token_in_rank)
|
||||||
|
t = bench(lambda: buffer.get_dispatch_layout(topk_idx, num_experts))[0]
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[layout] Kernel performance: {t * 1000:.3f} ms', flush=True)
|
||||||
|
print('', flush=True)
|
||||||
|
group.barrier()
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
# Config
|
||||||
|
nvl_buffer_size = 256
|
||||||
|
config = deep_ep.Config(num_sms, 8, nvl_buffer_size)
|
||||||
|
|
||||||
|
# Test dispatch
|
||||||
|
# noinspection PyShadowingNames
|
||||||
|
def check_data(check_x, rank_prefix_matrix):
|
||||||
|
assert torch.allclose(check_x.amin(dim=1), check_x.amax(dim=1))
|
||||||
|
check_start = 0
|
||||||
|
for i in range(num_ranks):
|
||||||
|
check_end = rank_prefix_matrix[i][rank].item()
|
||||||
|
assert (check_x[check_start:check_end, :].int() - i).sum().item() == 0
|
||||||
|
check_start = check_end
|
||||||
|
|
||||||
|
for previous_mode in (False, True):
|
||||||
|
for async_mode in (False, True):
|
||||||
|
for current_x in filter(lambda elem: elem is not None, (x_pure_rand, x, x_e4m3)):
|
||||||
|
for with_topk in (False, True):
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, {"with" if with_topk else "without"} top-k (async={async_mode}, previous={previous_mode}) ...', flush=True, end='')
|
||||||
|
dispatch_args = {'x': current_x, 'num_tokens_per_rank': num_tokens_per_rank, 'is_token_in_rank': is_token_in_rank,
|
||||||
|
'num_tokens_per_expert': num_tokens_per_expert, 'config': config, 'async_finish': async_mode}
|
||||||
|
if with_topk:
|
||||||
|
dispatch_args.update({'topk_idx': topk_idx, 'topk_weights': topk_weights_pure_rand if current_x is x_pure_rand else topk_weights})
|
||||||
|
if previous_mode:
|
||||||
|
dispatch_args.update({'previous_event': buffer.capture()})
|
||||||
|
recv_x, recv_topk_idx, recv_topk_weights, recv_num_tokens_per_expert_list, handle, event = buffer.dispatch(**dispatch_args)
|
||||||
|
event.current_stream_wait() if async_mode else ()
|
||||||
|
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
|
||||||
|
|
||||||
|
# Checks
|
||||||
|
rank_prefix_matrix = handle[0]
|
||||||
|
assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(0), f'{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}'
|
||||||
|
assert gbl_num_tokens_per_expert.view(num_ranks, -1)[rank].tolist() == recv_num_tokens_per_expert_list
|
||||||
|
if current_x is not x_pure_rand:
|
||||||
|
check_data(recv_x, rank_prefix_matrix)
|
||||||
|
recv_topk_weights_clone = None
|
||||||
|
if with_topk:
|
||||||
|
# Check `topk_idx`
|
||||||
|
assert (recv_topk_idx.eq(-1) | ((recv_topk_idx >= 0) & (recv_topk_idx < (num_experts // num_ranks)))).sum().item() == recv_topk_idx.numel()
|
||||||
|
for i, count in enumerate(recv_num_tokens_per_expert_list):
|
||||||
|
assert recv_topk_idx.eq(i).sum().item() == count
|
||||||
|
|
||||||
|
# Check `topk_weights`
|
||||||
|
recv_topk_weights_clone = recv_topk_weights.clone()
|
||||||
|
if current_x is not x_pure_rand:
|
||||||
|
recv_topk_weights[recv_topk_idx.eq(-1)] = recv_topk_weights.amax(dim=1, keepdim=True).expand_as(recv_topk_weights)[recv_topk_idx.eq(-1)]
|
||||||
|
check_data(recv_topk_weights, rank_prefix_matrix)
|
||||||
|
|
||||||
|
# Test `num_worst_tokens != 0`
|
||||||
|
if with_topk:
|
||||||
|
num_worst_tokens = num_tokens * num_ranks
|
||||||
|
dispatch_args.update({'num_worst_tokens': num_worst_tokens})
|
||||||
|
recv_worst_x, recv_worst_topk_idx, recv_worst_topk_weights, empty_list, _, event = buffer.dispatch(**dispatch_args)
|
||||||
|
event.current_stream_wait() if async_mode else ()
|
||||||
|
recv_worst_x = per_token_cast_back(*recv_worst_x) if isinstance(recv_worst_x, tuple) else recv_worst_x
|
||||||
|
assert len(empty_list) == 0
|
||||||
|
assert num_worst_tokens == recv_worst_x.size(0)
|
||||||
|
assert num_worst_tokens == recv_worst_topk_idx.size(0)
|
||||||
|
assert num_worst_tokens == recv_worst_topk_weights.size(0)
|
||||||
|
assert torch.equal(recv_x, recv_worst_x[:recv_x.size(0)])
|
||||||
|
assert torch.equal(recv_topk_idx, recv_worst_topk_idx[:recv_x.size(0)])
|
||||||
|
assert torch.equal(recv_topk_weights_clone, recv_worst_topk_weights[:recv_x.size(0)])
|
||||||
|
assert torch.all(recv_worst_topk_idx[recv_x.size(0):] == -1).item()
|
||||||
|
|
||||||
|
# Test cached dispatch (must be without top-k stuff)
|
||||||
|
if not with_topk:
|
||||||
|
dispatch_args = {'x': current_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
|
||||||
|
if previous_mode:
|
||||||
|
dispatch_args.update({'previous_event': buffer.capture()})
|
||||||
|
recv_x, _, _, _, _, event = buffer.dispatch(**dispatch_args)
|
||||||
|
event.current_stream_wait() if async_mode else ()
|
||||||
|
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
|
||||||
|
if current_x is not x_pure_rand:
|
||||||
|
check_data(recv_x, rank_prefix_matrix)
|
||||||
|
|
||||||
|
# Test combine
|
||||||
|
combine_args = {'x': recv_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
|
||||||
|
if with_topk:
|
||||||
|
combine_args.update({'topk_weights': recv_topk_weights})
|
||||||
|
if previous_mode:
|
||||||
|
combine_args.update({'previous_event': buffer.capture()})
|
||||||
|
combined_x, combined_topk_weights, event = buffer.combine(**combine_args)
|
||||||
|
event.current_stream_wait() if async_mode else ()
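# Combine sums one copy of each token per receiving rank, so divide by the per-token rank count to recover the original values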
|
||||||
|
check_x = combined_x.float() / is_token_in_rank.sum(dim=1).unsqueeze(1)
|
||||||
|
ref_x = x_pure_rand if current_x is x_pure_rand else x
|
||||||
|
assert calc_diff(check_x, ref_x) < 5e-6
|
||||||
|
if with_topk:
|
||||||
|
check_topk_weights = combined_topk_weights if (current_x is x_pure_rand) else (combined_topk_weights / is_token_in_rank.sum(dim=1).unsqueeze(1))
|
||||||
|
ref_topk_weights = topk_weights_pure_rand if current_x is x_pure_rand else topk_weights
|
||||||
|
assert calc_diff(check_topk_weights, ref_topk_weights) < 1e-9
|
||||||
|
|
||||||
|
# For later tuning
|
||||||
|
dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2
|
||||||
|
combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes
|
||||||
|
|
||||||
|
if local_rank == 0:
|
||||||
|
print(' passed', flush=True)
|
||||||
|
if local_rank == 0:
|
||||||
|
print('', flush=True)
|
||||||
|
|
||||||
|
# Tune dispatch performance
|
||||||
|
best_dispatch_results = None
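# FP8 sends 1 data byte per element plus a 4-byte scale shared by 128 elements, versus 2 bytes per BF16 element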
|
||||||
|
fp8_factor = (1 + 4 / 128) / 2
|
||||||
|
for current_x in filter(lambda elem: elem is not None, (x_e4m3, x)):
|
||||||
|
best_time, best_results = 1e10, None
|
||||||
|
nvl_recv_bytes = (dispatch_bf16_nvl_recv_bytes * fp8_factor) if isinstance(current_x, tuple) else dispatch_bf16_nvl_recv_bytes
|
||||||
|
for nvl_chunk_size in tuple(range(4, 33, 2)) + (0, ):
|
||||||
|
if nvl_chunk_size > 0:
|
||||||
|
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size)
|
||||||
|
else:
|
||||||
|
# Test default config as well
|
||||||
|
deep_ep.Buffer.set_num_sms(num_sms)
|
||||||
|
config = deep_ep.Buffer.get_dispatch_config(num_ranks)
|
||||||
|
tune_args = {'x': current_x, 'handle': handle, 'config': config}
|
||||||
|
t = bench(lambda: buffer.dispatch(**tune_args))[0]
|
||||||
|
if t < best_time and nvl_chunk_size > 0:
|
||||||
|
best_time, best_results = t, (num_sms, nvl_chunk_size)
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size if nvl_chunk_size else "default"}: '
|
||||||
|
f'{nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL), avg_t: {t * 1e6:.2f} us', flush=True)
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL), t: {best_time * 1e6:.2f} us', flush=True)
|
||||||
|
print('', flush=True)
|
||||||
|
|
||||||
|
# Gather the best config from rank 0 and the first test setting
|
||||||
|
if best_dispatch_results is None:
|
||||||
|
best_dispatch_results = torch.tensor([best_results[0], best_results[1]], dtype=torch.int32, device='cuda')
|
||||||
|
all_best_fp8_results_list = [torch.zeros_like(best_dispatch_results) for _ in range(torch.distributed.get_world_size())]
|
||||||
|
dist.all_gather(all_best_fp8_results_list, best_dispatch_results, group=group)
|
||||||
|
best_dispatch_results = all_best_fp8_results_list[0].tolist()
|
||||||
|
dispatch_config = deep_ep.Config(best_dispatch_results[0], best_dispatch_results[1], nvl_buffer_size)
|
||||||
|
|
||||||
|
dispatch_args = {'x': x, 'num_tokens_per_rank': num_tokens_per_rank,
|
||||||
|
'is_token_in_rank': is_token_in_rank, 'num_tokens_per_expert': num_tokens_per_expert,
|
||||||
|
'config': dispatch_config if dispatch_config is not None else config}
|
||||||
|
recv_x, _, _, _, handle, _ = buffer.dispatch(**dispatch_args)
|
||||||
|
|
||||||
|
# Tune combine performance
|
||||||
|
best_time, best_results = 1e10, None
|
||||||
|
for nvl_chunk_size in tuple(range(1, 17, 1)) + (0, ):
|
||||||
|
if nvl_chunk_size > 0:
|
||||||
|
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size)
|
||||||
|
else:
|
||||||
|
# Test default config as well
|
||||||
|
deep_ep.Buffer.set_num_sms(num_sms)
|
||||||
|
config = deep_ep.Buffer.get_combine_config(num_ranks)
|
||||||
|
tune_args = {'x': recv_x, 'handle': handle, 'config': config}
|
||||||
|
t = bench(lambda: buffer.combine(**tune_args))[0]
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size if nvl_chunk_size else "default"}: '
|
||||||
|
f'{combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL), avg_t: {t * 1e6:.2f} us', flush=True)
|
||||||
|
if t < best_time and nvl_chunk_size > 0:
|
||||||
|
best_time, best_results = t, (num_sms, nvl_chunk_size)
|
||||||
|
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}: {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL), t: {best_time * 1e6:.2f} us', flush=True)
|
||||||
|
print('', flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
# noinspection PyUnboundLocalVariable
|
||||||
|
def test_loop(local_rank: int, num_local_ranks: int):
|
||||||
|
rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
|
||||||
|
test_ll_compatibility, num_rdma_bytes = False, 0
|
||||||
|
if test_ll_compatibility:
|
||||||
|
ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
|
||||||
|
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(ll_num_tokens, ll_hidden, num_ranks, ll_num_experts)
|
||||||
|
|
||||||
|
buffer = deep_ep.Buffer(group, int(2e9), num_rdma_bytes, low_latency_mode=test_ll_compatibility,
|
||||||
|
num_qps_per_rank=(ll_num_experts // num_ranks if test_ll_compatibility else 1))
|
||||||
|
torch.manual_seed(rank)
|
||||||
|
|
||||||
|
for i in (24, ):
|
||||||
|
test_main(i, local_rank, num_ranks, rank, buffer, group)
|
||||||
|
if local_rank == 0:
|
||||||
|
print('', flush=True)
|
||||||
|
|
||||||
|
# Test compatibility with low latency functions
|
||||||
|
if test_ll_compatibility:
|
||||||
|
buffer.clean_low_latency_buffer(ll_num_tokens, ll_hidden, ll_num_experts)
|
||||||
|
test_low_latency.test_main(ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk, rank, num_ranks, group, buffer, seed=1)
|
||||||
|
|
||||||
|
# Destroy the communication group
|
||||||
|
dist.barrier()
|
||||||
|
dist.destroy_process_group()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
num_processes = 8
|
||||||
|
torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)
|
||||||
|
|
@ -0,0 +1,187 @@
|
||||||
|
import random
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import deep_ep
|
||||||
|
from utils import init_dist, bench, bench_kineto, calc_diff, hash_tensor, per_token_cast_back
|
||||||
|
|
||||||
|
|
||||||
|
def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
|
||||||
|
rank: int, num_ranks: int, group: dist.ProcessGroup, buffer: deep_ep.Buffer, seed: int = 0):
|
||||||
|
torch.manual_seed(seed + rank)
|
||||||
|
random.seed(seed + rank)
|
||||||
|
|
||||||
|
assert num_experts % num_ranks == 0
|
||||||
|
num_local_experts = num_experts // num_ranks
|
||||||
|
|
||||||
|
# NOTES: integers greater than 256 exceed the BF16 precision limit
|
||||||
|
rank_offset = 128
|
||||||
|
assert num_ranks - rank_offset < 257, 'Too many ranks (exceeding test precision limit)'
|
||||||
|
|
||||||
|
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * (rank - rank_offset)
|
||||||
|
x[:, -128:] = torch.arange(num_tokens, device='cuda').to(torch.bfloat16).view(-1, 1)
|
||||||
|
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
|
||||||
|
topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=True)[1]
|
||||||
|
topk_weights = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda').abs()
|
||||||
|
|
||||||
|
# Randomly mask some positions
|
||||||
|
for i in range(10):
|
||||||
|
topk_idx[random.randint(0, num_tokens - 1), random.randint(0, num_topk - 1)] = -1
|
||||||
|
|
||||||
|
# Check dispatch correctness
|
||||||
|
do_check = True
|
||||||
|
hash_value, num_times = 0, 0
|
||||||
|
for return_recv_hook in (False, True):
|
||||||
|
for dispatch_use_fp8 in (False, True):
|
||||||
|
for round_scale in (False, True) if dispatch_use_fp8 else (False, ):
|
||||||
|
for use_ue8m0 in (False, True) if round_scale else (False, ):
|
||||||
|
num_times += 1
|
||||||
|
for i in range((num_times % 2) + 1):
|
||||||
|
cumulative_local_expert_recv_stats = torch.zeros((num_local_experts, ), dtype=torch.int, device='cuda')
|
||||||
|
packed_recv_x, packed_recv_count, handle, event, hook = \
|
||||||
|
buffer.low_latency_dispatch(x, topk_idx, num_tokens, num_experts,
|
||||||
|
use_fp8=dispatch_use_fp8, round_scale=round_scale, use_ue8m0=use_ue8m0,
|
||||||
|
cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
|
||||||
|
async_finish=not return_recv_hook, return_recv_hook=return_recv_hook)
|
||||||
|
hook() if return_recv_hook else event.current_stream_wait()
|
||||||
|
packed_recv_x = (packed_recv_x[0], packed_recv_x[1].contiguous()) if dispatch_use_fp8 else packed_recv_x
|
||||||
|
simulated_gemm_x = per_token_cast_back(packed_recv_x[0].view(-1, hidden), packed_recv_x[1].view(-1, hidden // 128)).view(packed_recv_x[0].shape) \
|
||||||
|
if dispatch_use_fp8 else packed_recv_x.clone()
|
||||||
|
all_topk_idx = torch.empty((num_ranks, num_tokens, num_topk), dtype=topk_idx.dtype, device='cuda')
|
||||||
|
dist.all_gather_into_tensor(all_topk_idx, topk_idx, group=group)
|
||||||
|
for i in range(num_local_experts if do_check else 0):
|
||||||
|
expert_id = rank * num_local_experts + i
|
||||||
|
recv_x = per_token_cast_back(packed_recv_x[0][i], packed_recv_x[1][i]) if dispatch_use_fp8 else packed_recv_x[i]
|
||||||
|
recv_count, recv_src_info, recv_layout_range = packed_recv_count[i], handle[0][i], handle[1][i]
|
||||||
|
|
||||||
|
# Check expert indices
|
||||||
|
int_mask = (2 ** 32) - 1
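# `recv_layout_range[j]` packs (begin_idx << 32) | token_count for source rank j; `int_mask` extracts the low 32-bit count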
|
||||||
|
num_valid_tokens = recv_count.item()
|
||||||
|
assert cumulative_local_expert_recv_stats[i].item() == num_valid_tokens, f'{cumulative_local_expert_recv_stats[i].item()} != {num_valid_tokens}'
|
||||||
|
assert num_valid_tokens == (recv_layout_range & int_mask).sum().item(), f'{num_valid_tokens} != {(recv_layout_range & int_mask).sum().item()}'
|
||||||
|
assert num_valid_tokens == (all_topk_idx == expert_id).sum().item(), f'{num_valid_tokens} != {(all_topk_idx == expert_id).sum().item()}'
|
||||||
|
|
||||||
|
# Check received data
|
||||||
|
recv_x = recv_x[:num_valid_tokens]
|
||||||
|
recv_x_amin = recv_x[:, :-128].amin(dim=-1)
|
||||||
|
recv_src_info = recv_src_info[:num_valid_tokens]
|
||||||
|
assert torch.equal(recv_x_amin, recv_x[:, :-128].amax(dim=-1))
|
||||||
|
if round_scale:
|
||||||
|
assert calc_diff(recv_x[:, -1], recv_src_info.view(-1)) < 0.007
|
||||||
|
else:
|
||||||
|
assert (recv_x[:, -128:] - recv_src_info.view(-1, 1) % num_tokens).sum().item() == 0
|
||||||
|
for j in range(num_ranks):
|
||||||
|
begin_idx, count = (recv_layout_range[j] >> 32).item(), (recv_layout_range[j] & int_mask).item()
|
||||||
|
if not round_scale:
|
||||||
|
assert (recv_x_amin == j - rank_offset).sum().item() == (all_topk_idx[j] == expert_id).sum().item()
|
||||||
|
assert (recv_x[begin_idx:begin_idx + count][:-128] - j).sum().item() == 0
|
||||||
|
if dispatch_use_fp8:
|
||||||
|
hash_value ^= hash_tensor(packed_recv_x[0][i, :num_valid_tokens])
|
||||||
|
hash_value ^= hash_tensor(packed_recv_x[1][i, :num_valid_tokens])
|
||||||
|
else:
|
||||||
|
hash_value ^= hash_tensor(packed_recv_x[i, :num_valid_tokens])
|
||||||
|
|
||||||
|
# Check combine correctness
|
||||||
|
for zero_copy in (False, True):
|
||||||
|
if zero_copy:
|
||||||
|
buffer.get_next_low_latency_combine_buffer(handle)[:, :, :] = simulated_gemm_x
|
||||||
|
out = torch.empty((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
|
||||||
|
combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x, topk_idx, topk_weights, handle,
|
||||||
|
async_finish=not return_recv_hook, zero_copy=zero_copy,
|
||||||
|
return_recv_hook=return_recv_hook, out=out)
|
||||||
|
hook() if return_recv_hook else event.current_stream_wait()
|
||||||
|
if do_check:
|
||||||
|
diff = calc_diff(x * topk_weights.masked_fill(topk_idx == -1, 0).sum(dim=1).view(-1, 1), combined_x)
|
||||||
|
assert torch.isnan(combined_x).sum().item() == 0
|
||||||
|
assert diff < (7e-4 if round_scale else 1e-5), f'Error: {diff=}, {zero_copy=}'
|
||||||
|
hash_value ^= hash_tensor(combined_x)
|
||||||
|
|
||||||
|
def create_test_cast_with_outliers(num_outliers):
|
||||||
|
tmp = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
|
||||||
|
tmp /= tmp.abs().amax(dim=1).view(-1, 1)
|
||||||
|
assert tmp.abs().amax().item() <= 1
|
||||||
|
|
||||||
|
# Create some amax outliers
|
||||||
|
for i in range(num_outliers):
|
||||||
|
tmp[random.randint(0, num_tokens - 1)] *= 1e3
|
||||||
|
return tmp
|
||||||
|
|
||||||
|
# noinspection PyShadowingNames
|
||||||
|
def large_gemm_with_hook(hook):
|
||||||
|
mat_0 = torch.randn((8192, 8192), dtype=torch.float)
|
||||||
|
mat_1 = torch.randn((8192, 8192), dtype=torch.float)
|
||||||
|
mat_0 @ mat_1
|
||||||
|
hook()
|
||||||
|
|
||||||
|
# noinspection PyShadowingNames
|
||||||
|
def test_func(zero_copy: bool, return_recv_hook: bool):
|
||||||
|
recv_x, recv_count, handle, event, hook = \
|
||||||
|
buffer.low_latency_dispatch(x, topk_idx, num_tokens, num_experts,
|
||||||
|
cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
|
||||||
|
use_fp8=True, async_finish=False, return_recv_hook=return_recv_hook)
|
||||||
|
large_gemm_with_hook(hook) if return_recv_hook else None
|
||||||
|
if zero_copy:
|
||||||
|
buffer.get_next_low_latency_combine_buffer(handle)[:, :, :] = simulated_gemm_x
|
||||||
|
combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x, topk_idx, topk_weights, handle,
|
||||||
|
zero_copy=zero_copy, return_recv_hook=return_recv_hook)
|
||||||
|
large_gemm_with_hook(hook) if return_recv_hook else None
|
||||||
|
|
||||||
|
# Calculate bandwidth
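# Per-token size estimates: FP8 is `hidden` payload bytes plus a 4-byte scale per 128 channels plus 16 extra bytes (assumed metadata); BF16 is 2 bytes per channel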
|
||||||
|
num_fp8_bytes, num_bf16_bytes = (hidden + hidden / 128 * 4 + 16), hidden * 2
|
||||||
|
num_dispatch_comm_bytes, num_combine_comm_bytes = 0, 0
|
||||||
|
for i in range(num_tokens):
|
||||||
|
num_selections = (topk_idx[i] != -1).sum().item()
|
||||||
|
num_dispatch_comm_bytes += num_fp8_bytes * num_selections
|
||||||
|
num_combine_comm_bytes += num_bf16_bytes * num_selections
|
||||||
|
|
||||||
|
# Dispatch + combine testing
|
||||||
|
avg_t, min_t, max_t = bench(partial(test_func, zero_copy=False, return_recv_hook=False))
|
||||||
|
print(f'[rank {rank}] Dispatch + combine bandwidth: {(num_dispatch_comm_bytes + num_combine_comm_bytes) / 1e9 / avg_t:.2f} GB/s, '
|
||||||
|
f'avg_t={avg_t * 1e6:.2f} us, min_t={min_t * 1e6:.2f} us, max_t={max_t * 1e6:.2f} us', flush=True)
|
||||||
|
|
||||||
|
# Separate profiling
|
||||||
|
for return_recv_hook in (False, True):
|
||||||
|
group.barrier()
|
||||||
|
dispatch_t, combine_t = bench_kineto(partial(test_func, zero_copy=True, return_recv_hook=return_recv_hook),
|
||||||
|
kernel_names=('dispatch', 'combine'), barrier_comm_profiling=True,
|
||||||
|
suppress_kineto_output=True)
|
||||||
|
if not return_recv_hook:
|
||||||
|
print(f'[rank {rank}] Dispatch bandwidth: {num_dispatch_comm_bytes / 1e9 / dispatch_t:.2f} GB/s, avg_t={dispatch_t * 1e6:.2f} us | '
|
||||||
|
f'Combine bandwidth: {num_combine_comm_bytes / 1e9 / combine_t:.2f} GB/s, avg_t={combine_t * 1e6:.2f} us', flush=True)
|
||||||
|
else:
|
||||||
|
print(f'[rank {rank}] Dispatch send/recv time: {dispatch_t * 2 * 1e6:.2f} us | '
|
||||||
|
f'Combine send/recv time: {combine_t * 2 * 1e6:.2f} us', flush=True)
|
||||||
|
|
||||||
|
return hash_value
|
||||||
|
|
||||||
|
|
||||||
|
# noinspection PyUnboundLocalVariable
|
||||||
|
def test_loop(local_rank: int, num_local_ranks: int):
|
||||||
|
rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
|
||||||
|
num_tokens, hidden, num_topk, num_experts = 128, 7168, 8, 288
|
||||||
|
|
||||||
|
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(num_tokens, hidden, num_ranks, num_experts)
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'Allocating buffer size: {num_rdma_bytes / 1e6} MB ...', flush=True)
|
||||||
|
buffer = deep_ep.Buffer(group, num_rdma_bytes=num_rdma_bytes, low_latency_mode=True,
|
||||||
|
num_qps_per_rank=num_experts // num_ranks)
|
||||||
|
test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, seed=1)
|
||||||
|
|
||||||
|
do_pressure_test = False
|
||||||
|
for seed in range(int(1e9) if do_pressure_test else 0):
|
||||||
|
if local_rank == 0:
|
||||||
|
print(f'Testing with seed {seed} ...', flush=True)
|
||||||
|
ref_hash = test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, seed=seed)
|
||||||
|
for i in range(20):
|
||||||
|
assert test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, seed=seed) == ref_hash, f'Error: seed={seed}'
|
||||||
|
|
||||||
|
# Destroy the communication group
|
||||||
|
dist.barrier()
|
||||||
|
dist.destroy_process_group()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
# TODO: you may modify NUMA binding for less CPU overhead
|
||||||
|
num_processes = 8
|
||||||
|
torch.multiprocessing.spawn(test_loop, args=(num_processes,), nprocs=num_processes)
|
||||||
|
|
@ -0,0 +1,201 @@
|
||||||
|
import inspect
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
def init_dist(local_rank: int, num_local_ranks: int):
|
||||||
|
# NOTES: you may rewrite this function with your own cluster settings
|
||||||
|
ip = os.getenv('MASTER_ADDR', '127.0.0.1')
|
||||||
|
port = int(os.getenv('MASTER_PORT', '8361'))
|
||||||
|
num_nodes = int(os.getenv('WORLD_SIZE', 1))
|
||||||
|
node_rank = int(os.getenv('RANK', 0))
|
||||||
|
assert (num_local_ranks < 8 and num_nodes == 1) or num_local_ranks == 8
|
||||||
|
|
||||||
|
sig = inspect.signature(dist.init_process_group)
|
||||||
|
params = {
|
||||||
|
'backend': 'nccl',
|
||||||
|
'init_method': f'tcp://{ip}:{port}',
|
||||||
|
'world_size': num_nodes * num_local_ranks,
|
||||||
|
'rank': node_rank * num_local_ranks + local_rank,
|
||||||
|
}
|
||||||
|
if 'device_id' in sig.parameters:
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
params['device_id'] = torch.device(f'cuda:{local_rank}')
|
||||||
|
dist.init_process_group(**params)
|
||||||
|
torch.set_default_dtype(torch.bfloat16)
|
||||||
|
torch.set_default_device('cuda')
|
||||||
|
torch.cuda.set_device(local_rank)
|
||||||
|
|
||||||
|
return dist.get_rank(), dist.get_world_size(), dist.new_group(list(range(num_local_ranks * num_nodes)))
|
||||||
|
|
||||||
|
|
||||||
|
def calc_diff(x: torch.Tensor, y: torch.Tensor):
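# Symmetric relative-difference metric: 1 - 2 * sum(x * y) / sum(x^2 + y^2) on values shifted by +1; 0 means the tensors match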
|
||||||
|
x, y = x.double() + 1, y.double() + 1
|
||||||
|
denominator = (x * x + y * y).sum()
|
||||||
|
sim = 2 * (x * y).sum() / denominator
|
||||||
|
return (1 - sim).item()
|
||||||
|
|
||||||
|
|
||||||
|
def per_token_cast_to_fp8(x: torch.Tensor):
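# Quantize each 128-channel group to float8_e4m3fn; returns the FP8 data and the per-group scales (amax / 448)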
|
||||||
|
assert x.dim() == 2 and x.size(1) % 128 == 0
|
||||||
|
m, n = x.shape
|
||||||
|
x_view = x.view(m, -1, 128)
|
||||||
|
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
|
||||||
|
return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
|
||||||
|
|
||||||
|
|
||||||
|
def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
|
||||||
|
if x_scales.dtype == torch.int:
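# Assumed UE8M0 packing: each byte is an FP32 exponent, so shifting it into bit 23 and reinterpreting as float yields the power-of-two scale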
|
||||||
|
x_scales = x_scales.view(dtype=torch.int8).to(torch.int) << 23
|
||||||
|
x_scales = x_scales.view(dtype=torch.float)
|
||||||
|
x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
|
||||||
|
x_scales = x_scales.view(x_fp8.size(0), -1, 1)
|
||||||
|
return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
|
||||||
|
|
||||||
|
|
||||||
|
def inplace_unique(x: torch.Tensor, num_slots: int):
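# Deduplicate each row in place: keep the distinct non-negative values (in descending order) and fill the remaining slots with -1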
|
||||||
|
assert x.dim() == 2
|
||||||
|
mask = x < 0
|
||||||
|
x_padded = x.masked_fill(mask, num_slots)
|
||||||
|
bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
|
||||||
|
bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
|
||||||
|
bin_count = bin_count[:, :num_slots]
|
||||||
|
sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
|
||||||
|
sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
|
||||||
|
sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
|
||||||
|
x[:, :].fill_(-1)
|
||||||
|
valid_len = min(num_slots, x.size(1))
|
||||||
|
x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
|
||||||
|
|
||||||
|
|
||||||
|
def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int):
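# Zero out the scores of experts whose group is not among the token's selected groups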
|
||||||
|
num_tokens, num_experts = scores.shape
|
||||||
|
scores = scores.view(num_tokens, num_groups, -1)
|
||||||
|
mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
|
||||||
|
mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
|
||||||
|
return (scores * mask).view(num_tokens, num_experts)
|
||||||
|
|
||||||
|
|
||||||
|
def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None):
|
||||||
|
# Flush L2 cache with 256 MB data
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
|
||||||
|
|
||||||
|
# Warmup
|
||||||
|
for _ in range(num_warmups):
|
||||||
|
fn()
|
||||||
|
|
||||||
|
# Flush L2
|
||||||
|
cache.zero_()
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
|
||||||
|
end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
|
||||||
|
for i in range(num_tests):
|
||||||
|
# Record
|
||||||
|
start_events[i].record()
|
||||||
|
fn()
|
||||||
|
end_events[i].record()
|
||||||
|
if post_fn is not None:
|
||||||
|
post_fn()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
times = np.array([s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)])[1:]
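# The leading [1:] drops the first timed iteration (likely to exclude residual warm-up effects)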
|
||||||
|
return np.average(times), np.min(times), np.max(times)
|
||||||
|
|
||||||
|
|
||||||
|
class empty_suppress:
|
||||||
|
def __enter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, *_):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class suppress_stdout_stderr:
|
||||||
|
def __enter__(self):
|
||||||
|
self.outnull_file = open(os.devnull, 'w')
|
||||||
|
self.errnull_file = open(os.devnull, 'w')
|
||||||
|
|
||||||
|
self.old_stdout_fileno_undup = sys.stdout.fileno()
|
||||||
|
self.old_stderr_fileno_undup = sys.stderr.fileno()
|
||||||
|
|
||||||
|
self.old_stdout_fileno = os.dup(sys.stdout.fileno())
|
||||||
|
self.old_stderr_fileno = os.dup(sys.stderr.fileno())
|
||||||
|
|
||||||
|
self.old_stdout = sys.stdout
|
||||||
|
self.old_stderr = sys.stderr
|
||||||
|
|
||||||
|
os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
|
||||||
|
os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
|
||||||
|
|
||||||
|
sys.stdout = self.outnull_file
|
||||||
|
sys.stderr = self.errnull_file
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, *_):
|
||||||
|
sys.stdout = self.old_stdout
|
||||||
|
sys.stderr = self.old_stderr
|
||||||
|
|
||||||
|
os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
|
||||||
|
os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
|
||||||
|
|
||||||
|
os.close(self.old_stdout_fileno)
|
||||||
|
os.close(self.old_stderr_fileno)
|
||||||
|
|
||||||
|
self.outnull_file.close()
|
||||||
|
self.errnull_file.close()
|
||||||
|
|
||||||
|
|
||||||
|
def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output: bool = False,
|
||||||
|
trace_path: Optional[str] = None, barrier_comm_profiling: bool = False):
|
||||||
|
# Profile
|
||||||
|
suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
|
||||||
|
with suppress():
|
||||||
|
schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
|
||||||
|
with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) as prof:
|
||||||
|
for i in range(2):
|
||||||
|
# NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
|
||||||
|
if barrier_comm_profiling:
|
||||||
|
lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
|
||||||
|
rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
|
||||||
|
lhs @ rhs
|
||||||
|
dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
|
||||||
|
for _ in range(num_tests):
|
||||||
|
fn()
|
||||||
|
prof.step()
|
||||||
|
|
||||||
|
# Parse the profiling table
|
||||||
|
assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
|
||||||
|
is_tupled = isinstance(kernel_names, tuple)
|
||||||
|
prof_lines = prof.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
|
||||||
|
kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
|
||||||
|
assert all([isinstance(name, str) for name in kernel_names])
|
||||||
|
for name in kernel_names:
|
||||||
|
assert sum([name in line for line in prof_lines]) == 1, f'Kernel {name} not found or matched more than once in the profiling table'
|
||||||
|
|
||||||
|
# Save chrome traces
|
||||||
|
if trace_path is not None:
|
||||||
|
prof.export_chrome_trace(trace_path)
|
||||||
|
|
||||||
|
# Return average kernel times
|
||||||
|
units = {'ms': 1e3, 'us': 1e6}
|
||||||
|
kernel_times = []
|
||||||
|
for name in kernel_names:
|
||||||
|
for line in prof_lines:
|
||||||
|
if name in line:
|
||||||
|
time_str = line.split()[-2]
|
||||||
|
for unit, scale in units.items():
|
||||||
|
if unit in time_str:
|
||||||
|
kernel_times.append(float(time_str.replace(unit, '')) / scale)
|
||||||
|
break
|
||||||
|
break
|
||||||
|
return tuple(kernel_times) if is_tupled else kernel_times[0]
|
||||||
|
|
||||||
|
|
||||||
|
def hash_tensor(t: torch.Tensor):
|
||||||
|
return t.view(torch.int64).sum().item()
|
||||||
|
|
@ -0,0 +1,89 @@
|
||||||
|
# Install NVSHMEM
|
||||||
|
|
||||||
|
## Important notices
|
||||||
|
|
||||||
|
**This project is neither sponsored nor supported by NVIDIA.**
|
||||||
|
|
||||||
|
**Use of NVIDIA NVSHMEM is governed by the terms at [NVSHMEM Software License Agreement](https://docs.nvidia.com/nvshmem/api/sla.html).**
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Hardware requirements:
|
||||||
|
- GPUs inside one node need to be connected by NVLink
|
||||||
|
- GPUs across different nodes need to be connected by RDMA devices, see [GPUDirect RDMA Documentation](https://docs.nvidia.com/cuda/gpudirect-rdma/)
|
||||||
|
- InfiniBand GPUDirect Async (IBGDA) support, see [IBGDA Overview](https://developer.nvidia.com/blog/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async/)
|
||||||
|
- For more detailed requirements, see [NVSHMEM Hardware Specifications](https://docs.nvidia.com/nvshmem/release-notes-install-guide/install-guide/abstract.html#hardware-requirements)
|
||||||
|
|
||||||
|
## Installation procedure
|
||||||
|
|
||||||
|
### 1. Acquiring NVSHMEM source code
|
||||||
|
|
||||||
|
Download NVSHMEM v3.2.5 from the [NVIDIA NVSHMEM OPEN SOURCE PACKAGES](https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz).
|
||||||
|
|
||||||
|
### 2. Apply our custom patch
|
||||||
|
|
||||||
|
Navigate to your NVSHMEM source directory and apply our provided patch:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git apply /path/to/deep_ep/dir/third-party/nvshmem.patch
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Configure NVIDIA driver (required for inter-node communication)
|
||||||
|
|
||||||
|
Enable IBGDA by modifying `/etc/modprobe.d/nvidia.conf`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"
|
||||||
|
```
|
||||||
|
|
||||||
|
Update kernel configuration:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo update-initramfs -u
|
||||||
|
sudo reboot
|
||||||
|
```
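After rebooting, you can optionally confirm that the driver picked up these options. The check below is only a sketch: the parameter listing in `/proc/driver/nvidia/params` is an assumption and its exact contents vary across driver versions.

```bash
# Hypothetical sanity check after reboot (output format differs between driver versions)
grep -E 'EnableStreamMemOPs|RegistryDwords' /proc/driver/nvidia/params
```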
|
||||||
|
|
||||||
|
For more detailed configurations, please refer to the [NVSHMEM Installation Guide](https://docs.nvidia.com/nvshmem/release-notes-install-guide/install-guide/abstract.html).
|
||||||
|
|
||||||
|
### 4. Build and installation
|
||||||
|
|
||||||
|
DeepEP uses NVLink for intra-node communication and IBGDA for inter-node communication. All other features are disabled to reduce dependencies.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export CUDA_HOME=/path/to/cuda
|
||||||
|
# disable all features except IBGDA
|
||||||
|
export NVSHMEM_IBGDA_SUPPORT=1
|
||||||
|
|
||||||
|
export NVSHMEM_SHMEM_SUPPORT=0
|
||||||
|
export NVSHMEM_UCX_SUPPORT=0
|
||||||
|
export NVSHMEM_USE_NCCL=0
|
||||||
|
export NVSHMEM_PMIX_SUPPORT=0
|
||||||
|
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
|
||||||
|
export NVSHMEM_USE_GDRCOPY=0
|
||||||
|
export NVSHMEM_IBRC_SUPPORT=0
|
||||||
|
export NVSHMEM_BUILD_TESTS=0
|
||||||
|
export NVSHMEM_BUILD_EXAMPLES=0
|
||||||
|
export NVSHMEM_MPI_SUPPORT=0
|
||||||
|
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
|
||||||
|
export NVSHMEM_BUILD_TXZ_PACKAGE=0
|
||||||
|
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
|
||||||
|
|
||||||
|
cmake -G Ninja -S . -B build -DCMAKE_INSTALL_PREFIX=/path/to/your/dir/to/install
|
||||||
|
cmake --build build/ --target install
|
||||||
|
```
|
||||||
|
|
||||||
|
## Post-installation configuration
|
||||||
|
|
||||||
|
Set environment variables in your shell configuration:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export NVSHMEM_DIR=/path/to/your/dir/to/install # Use for DeepEP installation
|
||||||
|
export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"
|
||||||
|
export PATH="${NVSHMEM_DIR}/bin:$PATH"
|
||||||
|
```
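For reference, `NVSHMEM_DIR` above is what the DeepEP build later reads to locate this installation. A minimal sketch of that step (the exact command is an assumption; follow DeepEP's own README):

```bash
# Hypothetical follow-up: build DeepEP against the NVSHMEM installed above
cd /path/to/deep_ep
NVSHMEM_DIR=/path/to/your/dir/to/install python setup.py install
```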
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nvshmem-info -a # Should display details of nvshmem
|
||||||
|
```
|
||||||
|
|
@ -0,0 +1,474 @@
|
||||||
|
From 9e6cc27cceb3130784e4ea7b61ea3171156365fd Mon Sep 17 00:00:00 2001
|
||||||
|
From: Shangyan Zhou <sy.zhou@deepseek.com>
|
||||||
|
Date: Fri, 20 Dec 2024 10:57:12 +0800
|
||||||
|
Subject: [PATCH 1/4] Change QP creating order.
|
||||||
|
|
||||||
|
---
|
||||||
|
src/modules/transport/ibgda/ibgda.cpp | 13 ++++++++-----
|
||||||
|
1 file changed, 8 insertions(+), 5 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
index ef325cd..286132e 100644
|
||||||
|
--- a/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
+++ b/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
@@ -2936,17 +2936,20 @@ int nvshmemt_ibgda_connect_endpoints(nvshmem_transport_t t, int *selected_dev_id
|
||||||
|
INFO(ibgda_state->log_level, "Creating %d RC QPs", device->rc.num_eps_per_pe);
|
||||||
|
for (int i = 0; i < num_rc_eps; ++i) {
|
||||||
|
// Do not create loopback to self
|
||||||
|
- if (i / device->rc.num_eps_per_pe == mype) {
|
||||||
|
+ int dst_pe = (i + 1 + mype) % n_pes;
|
||||||
|
+ int offset = i / n_pes;
|
||||||
|
+ int mapped_i = dst_pe * device->rc.num_eps_per_pe + offset;
|
||||||
|
+ if (dst_pe == mype) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
- status = ibgda_create_qp(&device->rc.eps[i], device, portid, i,
|
||||||
|
+ status = ibgda_create_qp(&device->rc.eps[mapped_i], device, portid, mapped_i,
|
||||||
|
NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC);
|
||||||
|
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
|
||||||
|
- "ibgda_create_dci failed on RC #%d.", i);
|
||||||
|
+ "ibgda_create_dci failed on RC #%d.", mapped_i);
|
||||||
|
|
||||||
|
- status = ibgda_get_rc_handle(&local_rc_handles[i], device->rc.eps[i], device);
|
||||||
|
+ status = ibgda_get_rc_handle(&local_rc_handles[mapped_i], device->rc.eps[mapped_i], device);
|
||||||
|
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
|
||||||
|
- "ibgda_get_rc_handle failed on RC #%d.", i);
|
||||||
|
+ "ibgda_get_rc_handle failed on RC #%d.", mapped_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (num_rc_eps) {
|
||||||
|
--
|
||||||
|
2.25.1
|
||||||
|
|
||||||
|
|
||||||
|
From b11d41e4f3727f2f6ccc00a8c852e59e2ee33c8a Mon Sep 17 00:00:00 2001
|
||||||
|
From: Shangyan Zhou <sy.zhou@deepseek.com>
|
||||||
|
Date: Fri, 10 Jan 2025 11:53:38 +0800
|
||||||
|
Subject: [PATCH 2/4] Add recv queue and recv cq for rc qps.
|
||||||
|
|
||||||
|
Let the ibgda rc qps use regular recv queue.
|
||||||
|
|
||||||
|
Add recv queue to ibgda dev qp.
|
||||||
|
|
||||||
|
IBGDA create recv cq
|
||||||
|
|
||||||
|
Setup recv cq.
|
||||||
|
|
||||||
|
fix recv queue.
|
||||||
|
|
||||||
|
Remove some useless idx.
|
||||||
|
|
||||||
|
Longer recv queue.
|
||||||
|
---
|
||||||
|
.../nvshmem_common_ibgda.h | 19 +++++-
|
||||||
|
src/modules/transport/ibgda/ibgda.cpp | 65 ++++++++++++++++---
|
||||||
|
2 files changed, 71 insertions(+), 13 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
|
||||||
|
index 8b8a263..1be3dec 100644
|
||||||
|
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
|
||||||
|
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
|
||||||
|
@@ -168,14 +168,17 @@ typedef struct {
|
||||||
|
uint64_t get_head; // last wqe idx + 1 with a "fetch" operation (g, get, amo_fetch)
|
||||||
|
uint64_t get_tail; // last wqe idx + 1 polled with cst; get_tail > get_head is possible
|
||||||
|
} tx_wq;
|
||||||
|
+ struct {
|
||||||
|
+ uint64_t resv_head; // last reserved wqe idx + 1
|
||||||
|
+ } rx_wq;
|
||||||
|
struct {
|
||||||
|
uint64_t head;
|
||||||
|
uint64_t tail;
|
||||||
|
} ibuf;
|
||||||
|
char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
|
||||||
|
} __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
|
||||||
|
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 96,
|
||||||
|
- "ibgda_device_qp_management_v1 must be 96 bytes.");
|
||||||
|
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
|
||||||
|
+ "ibgda_device_qp_management_v1 must be 104 bytes.");
|
||||||
|
|
||||||
|
typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
|
||||||
|
|
||||||
|
@@ -199,9 +202,19 @@ typedef struct nvshmemi_ibgda_device_qp {
|
||||||
|
// May point to mvars.prod_idx or internal prod_idx
|
||||||
|
uint64_t *prod_idx;
|
||||||
|
} tx_wq;
|
||||||
|
+ struct {
|
||||||
|
+ uint16_t nwqes;
|
||||||
|
+ uint64_t tail;
|
||||||
|
+ void *wqe;
|
||||||
|
+ __be32 *dbrec;
|
||||||
|
+ void *bf;
|
||||||
|
+ nvshmemi_ibgda_device_cq_t *cq;
|
||||||
|
+ // May point to mvars.prod_idx or internal prod_idx
|
||||||
|
+ uint64_t *prod_idx;
|
||||||
|
+ } rx_wq;
|
||||||
|
nvshmemi_ibgda_device_qp_management_v1 mvars; // management variables
|
||||||
|
} nvshmemi_ibgda_device_qp_v1;
|
||||||
|
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 184, "ibgda_device_qp_v1 must be 184 bytes.");
|
||||||
|
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");
|
||||||
|
|
||||||
|
typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;
|
||||||
|
|
||||||
|
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
index 286132e..e0b2d5c 100644
|
||||||
|
--- a/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
+++ b/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
@@ -198,6 +198,7 @@ struct ibgda_ep {
|
||||||
|
off_t dbr_offset;
|
||||||
|
|
||||||
|
struct ibgda_cq *send_cq;
|
||||||
|
+ struct ibgda_cq *recv_cq;
|
||||||
|
struct ibv_ah *ah;
|
||||||
|
|
||||||
|
uint32_t user_index;
|
||||||
|
@@ -1538,7 +1539,8 @@ static int ibgda_create_cq_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
|
||||||
|
|
||||||
|
struct ibv_context *context = device->context;
|
||||||
|
|
||||||
|
- unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes;
|
||||||
|
+ // Each RC qp has one send CQ and one recv CQ.
|
||||||
|
+ unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes * 2;
|
||||||
|
|
||||||
|
assert(ibgda_qp_depth > 0);
|
||||||
|
size_t num_cqe = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
|
||||||
|
@@ -1701,7 +1703,8 @@ static int ibgda_create_qp_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate and map WQ buffer for all QPs.
|
||||||
|
- wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB; // num_wqebb is always a power of 2
|
||||||
|
+ // Todo: reduce the size of wq buffer.
|
||||||
|
+ wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB * 2; // num_wqebb is always a power of 2
|
||||||
|
wq_buf_size = wq_buf_size_per_qp * num_eps;
|
||||||
|
status = ibgda_nic_control_alloc(&wq_mobject, wq_buf_size, IBGDA_GPAGE_SIZE);
|
||||||
|
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "cannot allocate wq buf.\n");
|
||||||
|
@@ -1882,8 +1885,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
|
||||||
|
int cqe_version = 0;
|
||||||
|
|
||||||
|
struct ibgda_cq *send_cq = NULL;
|
||||||
|
+ struct ibgda_cq *recv_cq = NULL;
|
||||||
|
|
||||||
|
size_t num_wqebb = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
|
||||||
|
+ size_t num_recv_wqe = ibgda_qp_depth;
|
||||||
|
+ size_t recv_wqe_size = 16;
|
||||||
|
|
||||||
|
int status = 0;
|
||||||
|
|
||||||
|
@@ -1911,6 +1917,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
|
||||||
|
status = ibgda_create_cq(&send_cq, device);
|
||||||
|
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
|
||||||
|
|
||||||
|
+ if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
|
||||||
|
+ status = ibgda_create_cq(&recv_cq, device);
|
||||||
|
+ NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
ep = (struct ibgda_ep *)calloc(1, sizeof(struct ibgda_ep));
|
||||||
|
NVSHMEMI_NULL_ERROR_JMP(ep, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out,
|
||||||
|
"Unable to allocate mem for ep.\n");
|
||||||
|
@@ -1939,12 +1950,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
|
||||||
|
DEVX_SET(qpc, qp_context, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
|
||||||
|
DEVX_SET(qpc, qp_context, pd, device->qp_shared_object.pdn);
|
||||||
|
DEVX_SET(qpc, qp_context, uar_page, uar_mobject->uar->page_id); // BF register
|
||||||
|
- DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE); // Shared Receive Queue
|
||||||
|
- DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
|
||||||
|
DEVX_SET(qpc, qp_context, cqn_snd, send_cq->cqn);
|
||||||
|
- DEVX_SET(qpc, qp_context, cqn_rcv, device->qp_shared_object.rcqn);
|
||||||
|
+ DEVX_SET(qpc, qp_context, cqn_rcv, qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC ? recv_cq->cqn : device->qp_shared_object.rcqn);
|
||||||
|
DEVX_SET(qpc, qp_context, log_sq_size, IBGDA_ILOG2_OR0(num_wqebb));
|
||||||
|
- DEVX_SET(qpc, qp_context, log_rq_size, 0);
|
||||||
|
DEVX_SET(qpc, qp_context, cs_req, 0); // Disable CS Request
|
||||||
|
DEVX_SET(qpc, qp_context, cs_res, 0); // Disable CS Response
|
||||||
|
DEVX_SET(qpc, qp_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE); // Enable dbr_umem_id
|
||||||
|
@@ -1953,6 +1961,15 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
|
||||||
|
DEVX_SET(qpc, qp_context, dbr_umem_id, dbr_umem->umem_id); // DBR buffer
|
||||||
|
DEVX_SET(qpc, qp_context, user_index, qp_idx);
|
||||||
|
DEVX_SET(qpc, qp_context, page_offset, 0);
|
||||||
|
+ if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC){
|
||||||
|
+ DEVX_SET(qpc, qp_context, rq_type, 0); // Regular recv queue
|
||||||
|
+ DEVX_SET(qpc, qp_context, log_rq_size, IBGDA_ILOG2(num_recv_wqe)); // 4 wqe
|
||||||
|
+ DEVX_SET(qpc, qp_context, log_rq_stride, IBGDA_ILOG2(recv_wqe_size) - 4); // max recv wqe size = 16B
|
||||||
|
+ } else {
|
||||||
|
+ DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE); // Shared Receive Queue, DC must use this.
|
||||||
|
+ DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
|
||||||
|
+ DEVX_SET(qpc, qp_context, log_rq_size, 0);
|
||||||
|
+ }
|
||||||
|
|
||||||
|
ep->devx_qp = mlx5dv_devx_obj_create(context, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out));
|
||||||
|
NVSHMEMI_NULL_ERROR_JMP(ep->devx_qp, status, NVSHMEMX_ERROR_INTERNAL, out,
|
||||||
|
@@ -1962,9 +1979,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
|
||||||
|
ep->portid = portid;
|
||||||
|
|
||||||
|
ep->sq_cnt = num_wqebb;
|
||||||
|
- ep->sq_buf_offset = 0;
|
||||||
|
+ ep->sq_buf_offset = num_recv_wqe * recv_wqe_size;
|
||||||
|
|
||||||
|
- ep->rq_cnt = 0;
|
||||||
|
+ ep->rq_cnt = num_recv_wqe;
|
||||||
|
ep->rq_buf_offset = 0;
|
||||||
|
|
||||||
|
ep->wq_mobject = device->qp_shared_object.wq_mobject;
|
||||||
|
@@ -1978,6 +1995,7 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
|
||||||
|
ep->uar_mobject = uar_mobject;
|
||||||
|
|
||||||
|
ep->send_cq = send_cq;
|
||||||
|
+ ep->recv_cq = recv_cq;
|
||||||
|
|
||||||
|
ep->qp_type = qp_type;
|
||||||
|
|
||||||
|
@@ -1989,6 +2007,7 @@ out:
|
||||||
|
if (status) {
|
||||||
|
if (uar_mobject) ibgda_unmap_and_free_qp_uar(uar_mobject);
|
||||||
|
if (send_cq) ibgda_destroy_cq(send_cq);
|
||||||
|
+ if (recv_cq) ibgda_destroy_cq(recv_cq);
|
||||||
|
if (ep) free(ep);
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -2287,6 +2306,10 @@ static int ibgda_destroy_ep(struct ibgda_ep *ep) {
|
||||||
|
ibgda_destroy_cq(ep->send_cq);
|
||||||
|
}
|
||||||
|
|
||||||
|
+ if (ep->recv_cq) {
|
||||||
|
+ ibgda_destroy_cq(ep->recv_cq);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
if (ep->ah) {
|
||||||
|
ftable.destroy_ah(ep->ah);
|
||||||
|
}
|
||||||
|
@@ -2318,7 +2341,7 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
|
||||||
|
dev_qp->qpn = ep->qpn;
|
||||||
|
|
||||||
|
assert(ep->wq_mobject->has_gpu_mapping);
|
||||||
|
- dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset);
|
||||||
|
+ dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->sq_buf_offset);
|
||||||
|
|
||||||
|
if (ibgda_nic_handler == IBGDA_NIC_HANDLER_GPU) {
|
||||||
|
assert(ep->dbr_mobject->has_gpu_mapping);
|
||||||
|
@@ -2330,6 +2353,12 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
|
||||||
|
}
|
||||||
|
|
||||||
|
dev_qp->tx_wq.nwqes = ep->sq_cnt;
|
||||||
|
+ if (ep->qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
|
||||||
|
+ dev_qp->rx_wq.nwqes = ep->rq_cnt;
|
||||||
|
+ dev_qp->rx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->rq_buf_offset);
|
||||||
|
+ dev_qp->rx_wq.dbrec = (__be32 *)((uintptr_t)ep->dbr_mobject->aligned.gpu_ptr + ep->dbr_offset);
|
||||||
|
+ dev_qp->rx_wq.bf = (void *)ep->uar_mobject->aligned.gpu_ptr;
|
||||||
|
+ }
|
||||||
|
|
||||||
|
ibuf_dci_start = (uintptr_t)device->qp_shared_object.internal_buf.mem_object->aligned.gpu_ptr;
|
||||||
|
ibuf_rc_start = ibuf_dci_start + (size_per_dci * device->dci.num_eps);
|
||||||
|
@@ -2379,6 +2408,9 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
|
||||||
|
nvshmemi_ibgda_device_cq_t *cq_d = NULL;
|
||||||
|
nvshmemi_ibgda_device_cq_t *cq_h = NULL;
|
||||||
|
|
||||||
|
+ nvshmemi_ibgda_device_cq_t *recv_cq_d = NULL;
|
||||||
|
+ nvshmemi_ibgda_device_cq_t *recv_cq_h = NULL;
|
||||||
|
+
|
||||||
|
uint8_t *qp_group_switches_d = NULL;
|
||||||
|
|
||||||
|
const size_t mvars_offset = offsetof(nvshmemi_ibgda_device_qp_t, mvars);
|
||||||
|
@@ -2386,6 +2418,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
|
||||||
|
const size_t cons_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.cons_idx);
|
||||||
|
const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
|
||||||
|
const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
|
||||||
|
+ const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
|
||||||
|
|
||||||
|
nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
|
||||||
|
nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
|
||||||
|
@@ -2421,7 +2454,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
|
||||||
|
num_dct_handles += device->dct.num_eps * n_pes;
|
||||||
|
num_dci_handles += device->dci.num_eps;
|
||||||
|
num_rc_handles += device->rc.num_eps_per_pe * n_pes;
|
||||||
|
- num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1));
|
||||||
|
+ num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1) * 2);
|
||||||
|
num_shared_dci_handles += device->dci.num_shared_eps;
|
||||||
|
}
|
||||||
|
assert(num_dci_handles - num_shared_dci_handles >= 0);
|
||||||
|
@@ -2456,6 +2489,10 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
|
||||||
|
for (int i = 0; i < num_cq_handles; i++) {
|
||||||
|
nvshmemi_init_ibgda_device_cq(cq_h[i]);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+ recv_cq_h = (nvshmemi_ibgda_device_cq_t *)calloc(1, sizeof(*recv_cq_h));
|
||||||
|
+ NVSHMEMI_NULL_ERROR_JMP(recv_cq_h, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out, "recv_cq calloc err.");
|
||||||
|
+ nvshmemi_init_ibgda_device_cq(recv_cq_h[0]);
|
||||||
|
/* allocate host memory for dct, rc, cq, dci end */
|
||||||
|
|
||||||
|
/* allocate device memory for dct, rc, cq, dci start */
|
||||||
|
@@ -2559,6 +2596,14 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
|
||||||
|
}
|
||||||
|
|
||||||
|
++cq_idx;
|
||||||
|
+
|
||||||
|
+ rc_h[arr_idx].rx_wq.cq = &cq_d[cq_idx];
|
||||||
|
+
|
||||||
|
+ ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
|
||||||
|
+ cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
|
||||||
|
+ cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
|
||||||
|
+ cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
|
||||||
|
+ ++cq_idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
--
|
||||||
|
2.25.1
|
||||||
|
|
||||||
|
|
||||||
|
From af479f9f23103d4a1579fae38676d6b3022df887 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Shangyan Zhou <sy.zhou@deepseek.com>
|
||||||
|
Date: Sat, 8 Feb 2025 18:02:39 +0800
|
||||||
|
Subject: [PATCH 3/4] Maintain recv queue's cons_idx.
|
||||||
|
|
||||||
|
---
|
||||||
|
src/include/device_host_transport/nvshmem_common_ibgda.h | 5 +++--
|
||||||
|
src/modules/transport/ibgda/ibgda.cpp | 6 ++++--
|
||||||
|
2 files changed, 7 insertions(+), 4 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
|
||||||
|
index 1be3dec..ea1e284 100644
|
||||||
|
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
|
||||||
|
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
|
||||||
|
@@ -170,6 +170,7 @@ typedef struct {
|
||||||
|
} tx_wq;
|
||||||
|
struct {
|
||||||
|
uint64_t resv_head; // last reserved wqe idx + 1
|
||||||
|
+ uint64_t cons_idx; // polled wqe idx + 1 (consumer index + 1)
|
||||||
|
} rx_wq;
|
||||||
|
struct {
|
||||||
|
uint64_t head;
|
||||||
|
@@ -177,7 +178,7 @@ typedef struct {
|
||||||
|
} ibuf;
|
||||||
|
char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
|
||||||
|
} __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
|
||||||
|
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
|
||||||
|
- "ibgda_device_qp_management_v1 must be 104 bytes.");
|
||||||
|
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 112,
|
||||||
|
+ "ibgda_device_qp_management_v1 must be 112 bytes.");
|
||||||
|
|
||||||
|
typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
|
||||||
|
@@ -214,7 +215,7 @@ typedef struct nvshmemi_ibgda_device_qp {
|
||||||
|
} rx_wq;
|
||||||
|
nvshmemi_ibgda_device_qp_management_v1 mvars; // management variables
|
||||||
|
} nvshmemi_ibgda_device_qp_v1;
|
||||||
|
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");
|
||||||
|
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 256, "ibgda_device_qp_v1 must be 256 bytes.");
|
||||||
|
|
||||||
|
typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;
|
||||||
|
|
||||||
|
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
index e0b2d5c..bc339c5 100644
|
||||||
|
--- a/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
+++ b/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
@@ -1067,7 +1067,7 @@ static inline void ibgda_nic_control_free(struct ibgda_mem_object *mobject) {
|
||||||
|
ibgda_host_mem_free(mobject);
|
||||||
|
}
|
||||||
|
|
||||||
|
-static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device) {
|
||||||
|
+static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device, int cc = 1) {
|
||||||
|
int status = 0;
|
||||||
|
|
||||||
|
struct ibgda_cq *gcq = NULL;
|
||||||
|
@@ -1118,7 +1118,7 @@ static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device)
|
||||||
|
cq_context = DEVX_ADDR_OF(create_cq_in, cmd_in, cq_context);
|
||||||
|
DEVX_SET(cqc, cq_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE);
|
||||||
|
DEVX_SET(cqc, cq_context, cqe_sz, MLX5_CQE_SIZE_64B);
|
||||||
|
- DEVX_SET(cqc, cq_context, cc, 0x1); // Use collapsed CQ
|
||||||
|
+ DEVX_SET(cqc, cq_context, cc, cc); // Use collapsed CQ
|
||||||
|
DEVX_SET(cqc, cq_context, oi, 0x1); // Allow overrun
|
||||||
|
DEVX_SET(cqc, cq_context, dbr_umem_id, dbr_umem->umem_id);
|
||||||
|
DEVX_SET(cqc, cq_context, log_cq_size, IBGDA_ILOG2_OR0(num_cqe));
|
||||||
|
@@ -2419,6 +2419,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
|
||||||
|
const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
|
||||||
|
const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
|
||||||
|
const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
|
||||||
|
+ const size_t rx_cons_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.cons_idx);
|
||||||
|
|
||||||
|
nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
|
||||||
|
nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
|
||||||
|
@@ -2601,6 +2602,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
|
||||||
|
|
||||||
|
ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
|
||||||
|
cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
|
||||||
|
+ cq_h[cq_idx].cons_idx = (uint64_t *)(base_mvars_d_addr + rx_cons_offset);
|
||||||
|
cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
|
||||||
|
cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
|
||||||
|
++cq_idx;
|
||||||
|
--
|
||||||
|
2.25.1
|
||||||
|
|
||||||
|
|
||||||
|
From e0ba3fa21b4b633b481c6684c3ad04f2670c8df4 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Shangyan Zhou <sy.zhou@deepseek.com>
|
||||||
|
Date: Tue, 11 Feb 2025 11:00:57 +0800
|
||||||
|
Subject: [PATCH 4/4] Init rx_wq counters.
|
||||||
|
|
||||||
|
---
|
||||||
|
src/include/device_host_transport/nvshmem_common_ibgda.h | 2 ++
|
||||||
|
1 file changed, 2 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
|
||||||
|
index ea1e284..e6640d6 100644
|
||||||
|
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
|
||||||
|
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
|
||||||
|
@@ -46,6 +46,8 @@
|
||||||
|
qp_man.tx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
|
||||||
|
qp_man.tx_wq.get_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
|
||||||
|
qp_man.tx_wq.get_tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
|
||||||
|
+ qp_man.rx_wq.resv_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
|
||||||
|
+ qp_man.rx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
|
||||||
|
qp_man.ibuf.head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
|
||||||
|
qp_man.ibuf.tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
|
||||||
|
} while (0);
|
||||||
|
--
|
||||||
|
2.25.1
|
||||||
|
|
||||||
|
diff --git a/src/modules/transport/common/transport_ib_common.cpp b/src/modules/transport/common/transport_ib_common.cpp
|
||||||
|
index c89f408..f99018a 100644
|
||||||
|
--- a/src/modules/transport/common/transport_ib_common.cpp
|
||||||
|
+++ b/src/modules/transport/common/transport_ib_common.cpp
|
||||||
|
@@ -26,6 +26,9 @@ int nvshmemt_ib_common_nv_peer_mem_available() {
|
||||||
|
if (access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == 0) {
|
||||||
|
return NVSHMEMX_SUCCESS;
|
||||||
|
}
|
||||||
|
+ if (access("/sys/module/nvidia_peermem/version", F_OK) == 0) {
|
||||||
|
+ return NVSHMEMX_SUCCESS;
|
||||||
|
+ }
|
||||||
|
|
||||||
|
return NVSHMEMX_ERROR_INTERNAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
From 099f608fcd9a1d34c866ad75d0af5d02d2020374 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Kaichao You <youkaichao@gmail.com>
|
||||||
|
Date: Tue, 10 Jun 2025 00:35:03 -0700
|
||||||
|
Subject: [PATCH] remove gdrcopy dependency
|
||||||
|
|
||||||
|
---
|
||||||
|
src/modules/transport/ibgda/ibgda.cpp | 6 ++++++
|
||||||
|
1 file changed, 6 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
index ef325cd..16ee09c 100644
|
||||||
|
--- a/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
+++ b/src/modules/transport/ibgda/ibgda.cpp
|
||||||
|
@@ -406,6 +406,7 @@ static size_t ibgda_get_host_page_size() {
|
||||||
|
return host_page_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
+#ifdef NVSHMEM_USE_GDRCOPY
|
||||||
|
int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
|
||||||
|
nvshmemt_ibgda_state_t *ibgda_state = (nvshmemt_ibgda_state_t *)t->state;
|
||||||
|
int n_devs_selected = ibgda_state->n_devs_selected;
|
||||||
|
@@ -459,6 +460,11 @@ int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
+#else
|
||||||
|
+int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
|
||||||
|
+ return NVSHMEMX_ERROR_NOT_SUPPORTED;
|
||||||
|
+}
|
||||||
|
+#endif
|
||||||
|
|
||||||
|
int nvshmemt_ibgda_show_info(struct nvshmem_transport *transport, int style) {
|
||||||
|
NVSHMEMI_ERROR_PRINT("ibgda show info not implemented");
|
||||||
|
--
|
||||||
|
2.34.1
|
||||||
|
|
@ -0,0 +1,184 @@
|
||||||
|
###############################################################################
|
||||||
|
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
|
||||||
|
###############################################################################
|
||||||
|
ARG CUDA_VERSION=12.6.1
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
|
||||||
|
|
||||||
|
ENV USE_CUDA=1 \
|
||||||
|
USE_DISTRIBUTED=1 \
|
||||||
|
USE_MPI=1 \
|
||||||
|
USE_GLOO=1 \
|
||||||
|
USE_NCCL=1 \
|
||||||
|
USE_SYSTEM_NCCL=1 \
|
||||||
|
BUILD_TEST=0
|
||||||
|
|
||||||
|
ARG MAX_JOBS=90
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive \
|
||||||
|
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
|
||||||
|
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
|
||||||
|
libopenblas-dev libopenmpi-dev \
|
||||||
|
libnccl2=2.22.3-1+cuda12.6 \
|
||||||
|
libnccl-dev=2.22.3-1+cuda12.6 \
|
||||||
|
libjpeg-dev libpng-dev ca-certificates && \
|
||||||
|
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
|
||||||
|
|
||||||
|
WORKDIR /opt
|
||||||
|
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
|
||||||
|
|
||||||
|
WORKDIR /opt/pytorch
|
||||||
|
ENV MAX_JOBS=${MAX_JOBS}
|
||||||
|
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
|
||||||
|
python3 setup.py bdist_wheel
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Stage 1 ─ builder-extras: install torchvision / flashinfer / sglang on top of the self-built Torch and collect the wheels
|
||||||
|
###############################################################################
|
||||||
|
ARG CUDA_VERSION=12.6.1
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3 python3-pip python3-distutils python3.10-dev git build-essential \
|
||||||
|
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
|
||||||
|
libopenmpi-dev libopenblas-dev \
|
||||||
|
libnccl2=2.22.3-1+cuda12.6 \
|
||||||
|
libnccl-dev=2.22.3-1+cuda12.6 && \
|
||||||
|
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
|
||||||
|
|
||||||
|
# ── Install the self-built torch wheel ──────────────────────────────────────
|
||||||
|
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
|
||||||
|
RUN set -e && \
|
||||||
|
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
|
||||||
|
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ── Build torchvision 0.22.1 (depends on the local torch) ────────────────
|
||||||
|
WORKDIR /opt
|
||||||
|
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
|
||||||
|
WORKDIR /opt/vision
|
||||||
|
RUN python3 setup.py bdist_wheel
|
||||||
|
|
||||||
|
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ─────────────
|
||||||
|
WORKDIR /opt
|
||||||
|
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
|
||||||
|
WORKDIR /opt/flashinfer
|
||||||
|
|
||||||
|
RUN pip install . && \
|
||||||
|
python3 -m pip wheel . --no-deps -w dist/
|
||||||
|
|
||||||
|
# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ───────────
|
||||||
|
WORKDIR /opt
|
||||||
|
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
|
||||||
|
|
||||||
|
|
||||||
|
# ── Build the local sglang sources and package them as a wheel ───────────
|
||||||
|
COPY ./sglang /sgl/sglang
|
||||||
|
WORKDIR /sgl/sglang/python
|
||||||
|
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
|
||||||
|
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
|
||||||
|
|
||||||
|
|
||||||
|
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ───────────────────────
|
||||||
|
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
|
||||||
|
|
||||||
|
# ── Collect all wheels into /wheels ──────────────────────────────────────
|
||||||
|
RUN mkdir -p /wheels && \
|
||||||
|
cp /tmp/torch_dist/torch*.whl /wheels/ && \
|
||||||
|
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
|
||||||
|
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
|
||||||
|
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
|
||||||
|
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
|
||||||
|
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
|
||||||
|
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
|
||||||
|
|
||||||
|
# ── ✅ Also package the dependencies required by the runtime stage ────────────
|
||||||
|
RUN pip wheel \
|
||||||
|
pydantic orjson psutil pyzmq pynvml \
|
||||||
|
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
|
||||||
|
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
|
||||||
|
-w /wheels
|
||||||
|
|
||||||
|
# ── ✅ Package the dependencies needed by the gradio UI ────────────────────────
|
||||||
|
RUN pip wheel "gradio==5.38.2" requests -w /wheels
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Stage 2 ─ runtime: minimal runtime image; wheels are installed offline only
|
||||||
|
###############################################################################
|
||||||
|
ARG CUDA_VERSION=12.6.1
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
|
||||||
|
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
|
||||||
|
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
|
||||||
|
rm -rf /var/lib/apt/lists/* && \
|
||||||
|
python3 -m pip install --no-cache-dir --upgrade pip \
|
||||||
|
&& python3 -m pip install --no-cache-dir xgrammar
|
||||||
|
|
||||||
|
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
|
||||||
|
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
|
||||||
|
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
|
||||||
|
|
||||||
|
# 👇 Recommended follow-up: refresh the dynamic linker cache
|
||||||
|
RUN ldconfig
|
||||||
|
|
||||||
|
# ---- Copy the pre-tuned MoE Triton kernel configs ----------------------------
|
||||||
|
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
|
||||||
|
|
||||||
|
|
||||||
|
COPY --from=builder-extras /wheels /tmp/wheels
|
||||||
|
|
||||||
|
# ✅ Install the self-built torch first so it is not overwritten by the PyPI build
|
||||||
|
RUN ls -lh /tmp/wheels && \
|
||||||
|
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
|
||||||
|
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
|
||||||
|
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
|
||||||
|
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
|
||||||
|
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
|
||||||
|
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
|
||||||
|
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
|
||||||
|
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
|
||||||
|
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
|
||||||
|
rm -rf /tmp/wheels
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ✅ Install the Prometheus client
|
||||||
|
RUN python3 -m pip install --no-cache-dir prometheus_client
|
||||||
|
|
||||||
|
# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
|
||||||
|
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
|
||||||
|
|
||||||
|
# ✅ Make sure the directory exists
|
||||||
|
RUN mkdir -p /tmp/prometheus
|
||||||
|
|
||||||
|
# ✅ Add Tini (recommended)
|
||||||
|
ENV TINI_VERSION=v0.19.0
|
||||||
|
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
|
||||||
|
RUN chmod +x /tini
|
||||||
|
ENTRYPOINT ["/tini", "--"]
|
||||||
|
|
||||||
|
# ---- Copy the model (path can be changed) ----
|
||||||
|
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
|
||||||
|
|
||||||
|
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
|
||||||
|
|
||||||
|
# ---- Expose ports ----
|
||||||
|
EXPOSE 30000 30001
|
||||||
|
|
||||||
|
# Install supervisor
|
||||||
|
RUN apt-get update && apt-get install -y supervisor && \
|
||||||
|
mkdir -p /etc/supervisor/conf.d
|
||||||
|
|
||||||
|
# Copy the supervisord config file and the UI script
|
||||||
|
COPY ./meta_ui.py /app/meta_ui.py
|
||||||
|
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
|
||||||
|
|
||||||
|
# Run supervisor as the container's main process
|
||||||
|
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
# Compiled Object files
|
||||||
|
*.slo
|
||||||
|
*.lo
|
||||||
|
*.o
|
||||||
|
*.obj
|
||||||
|
|
||||||
|
# Compiled Dynamic libraries
|
||||||
|
*.so
|
||||||
|
*.dylib
|
||||||
|
*.dll
|
||||||
|
|
||||||
|
# Compiled Static libraries
|
||||||
|
*.lai
|
||||||
|
*.la
|
||||||
|
*.a
|
||||||
|
*.lib
|
||||||
|
|
||||||
|
# Executables
|
||||||
|
*.exe
|
||||||
|
*.out
|
||||||
|
*.app
|
||||||
|
|
||||||
|
# Editor files
|
||||||
|
*~
|
||||||
|
*.swp
|
||||||
|
|
@ -0,0 +1,108 @@
|
||||||
|
# Changelog
|
||||||
|
|
||||||
|
## [2.4.4] - 2024-12-16
|
||||||
|
- Fix the use-after-free bug of mr objects in gdrdv\_vma\_close.
|
||||||
|
- Fix the resource leakage bug in gdrdrv\_release.
|
||||||
|
|
||||||
|
## [2.4.3] - 2024-12-02
|
||||||
|
- Fix NVIDIA\_IS\_OPENSOURCE detection when compiling with NVIDIA driver version 545 or newer.
|
||||||
|
- Fix compile error in gdrdrv when compiling on RHEL9.5.
|
||||||
|
|
||||||
|
## [2.4.2] - 2024-10-31
|
||||||
|
- Fix the size alignment bug in gdrdrv.
|
||||||
|
- Fix memory leak in gdr\_pin\_buffer.
|
||||||
|
- Add support for another flavor of BF3.
|
||||||
|
|
||||||
|
## [2.4.1] - 2023-12-18
|
||||||
|
- Add support for persistent mapping.
|
||||||
|
- Fix bug in src/gdrdrv/Makefile.
|
||||||
|
- Fix compile-time bug when check.h is not found.
|
||||||
|
|
||||||
|
## [2.4] - 2023-09-19
|
||||||
|
- Various bug fixes in the test and benchmark applications.
|
||||||
|
- Prefix all applications with "gdrcopy\_".
|
||||||
|
- Introduce more unit tests in gdrcopy\_sanity.
|
||||||
|
- Introduce gdrcopy\_pplat benchmark application.
|
||||||
|
- Remove dependency on libcheck and libsubunit.
|
||||||
|
- Introduce gdr\_get\_info\_v2.
|
||||||
|
- Introduce new copy algorithm for device mappings.
|
||||||
|
- Add support for NVIDIA BLUEFIELD-3.
|
||||||
|
- Add support for Linux kernel >= 6.3.
|
||||||
|
- Add support for SLES and OpenSUSE.
|
||||||
|
- Add support for systemd service on RHEL9.
|
||||||
|
- Relicense gdrdrv to Dual MIT/GPL.
|
||||||
|
- Fix bugs in gdrdrv when pinning two small buffers back-to-back.
|
||||||
|
- Add support for coherent platforms such as Grace-Hopper.
|
||||||
|
- Add support for Confidential Computing (CC).
|
||||||
|
|
||||||
|
## [2.3.1] - 2023-05-12
|
||||||
|
- Add a workaround for the GPL-compatibility issue when compile with CONFIG\_ARCH\_HAS\_CC\_PLATFORM on Linux kernel 5.18+.
|
||||||
|
- Fix error in init.d/gdrcopy due to missing /etc/rc.d/init.d/functions.
|
||||||
|
|
||||||
|
## [2.3] - 2021-07-27
|
||||||
|
- Remove automatically-generated build id links in rpm packages.
|
||||||
|
- Remove gdrcopy-kmod from the Requires field of the gdrcopy rpm package.
|
||||||
|
- Remove gdrdrv-dkms dependency enforcement from the gdrcopy deb package.
|
||||||
|
- Add libsubunit0 to the dependency list of the gdrcopy deb package.
|
||||||
|
- Add apiperf test.
|
||||||
|
- Revamp gdrdrv to fix race-condition bugs.
|
||||||
|
- Add an option to build kmod package.
|
||||||
|
- Split the gdrcopy deb package into meta, libgdrapi, and tests packages.
|
||||||
|
- Update the package maintainer.
|
||||||
|
- Various updates in README.
|
||||||
|
|
||||||
|
## [2.2] - 2021-02-01
|
||||||
|
- Add support for ARM64.
|
||||||
|
- Update various information on README.
|
||||||
|
- Improve Makefile.
|
||||||
|
- Add multi-arch support.
|
||||||
|
- Handle removal of HAVE\_UNLOCKED\_IOCTL in Linux kernel v5.9 and later.
|
||||||
|
- Prevent dpkg package creation to unnecessarily compile gdrdrv.
|
||||||
|
- Improve gdr\_open error message.
|
||||||
|
- Fix bug that prevents sanity from correctly summarizing failure.
|
||||||
|
- Add dkms support in kmod package.
|
||||||
|
- Handle the removal of kzfree in Linux kernel v5.10 and later.
|
||||||
|
- Improve small-size copy-to-mapping.
|
||||||
|
|
||||||
|
## [2.1] - 2020-08-07
|
||||||
|
- fix build problem on RHEL8 kernels
|
||||||
|
- relax checks in gdrdrv to support multi-threading use cases
|
||||||
|
- fix fd leak in gdr\_open()
|
||||||
|
- introduce new copylat test
|
||||||
|
- remove CUDA RT dependency in tests
|
||||||
|
- assorted cleanups
|
||||||
|
|
||||||
|
## [2.0] - 2019-09-16
|
||||||
|
- Harden security in gdrdrv.
|
||||||
|
- Enable cached mappings in POWER9.
|
||||||
|
- Improve copy performance with unrolling in POWERPC.
|
||||||
|
- Create _sanity_ unit test for testing the functionality and security.
|
||||||
|
- Consolidate _basic_ and _validate_ into _sanity_ unit test.
|
||||||
|
- Introduce compile time and runtime version checking in _libgdrapi_.
|
||||||
|
- Improve rpm packaging.
|
||||||
|
- Introduce deb packaging for the userspace library and the applications.
|
||||||
|
- Introduce dkms packaging for the _gdrdrv_ driver.
|
||||||
|
- Rename gdr\_copy\_from/to\_bar to gdr\_copy\_from/to\_mapping.
|
||||||
|
- Update README
|
||||||
|
|
||||||
|
## [1.3] - 2018-07-26
|
||||||
|
- Add _gdrdrv_ driver for converting cudaMalloc'd addresses to the GPU's BAR1
|
||||||
|
addresses and exposing them as CPU-accessible virtual addresses.
|
||||||
|
- Add _libgdrapi_, a user-space library for communicating with the gdrdrv driver.
|
||||||
|
- Add _basic_ application as a minimal example of how to use gdrcopy.
|
||||||
|
- Add _copybw_ application as a complete example of how the CPU can read/write to
cudaMalloc'd memory via BAR1 mappings.
|
||||||
|
- Add _validate_ unit test to ensure that gdrcopy functions as expected.
|
||||||
|
- Add a script for packaging gdrcopy in the rpm format.
|
||||||
|
|
||||||
|
[2.4.4]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.4
[2.4.3]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.3
|
||||||
|
[2.4.2]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.2
|
||||||
|
[2.4.1]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.1
|
||||||
|
[2.4]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4
|
||||||
|
[2.3.1]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.3.1
|
||||||
|
[2.3]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.3
|
||||||
|
[2.2]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.2
|
||||||
|
[2.1]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.1
|
||||||
|
[2.0]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.0
|
||||||
|
[1.3]: https://github.com/NVIDIA/gdrcopy/releases/tag/v1.3
|
||||||
|
|
||||||
|
|
@ -0,0 +1,19 @@
|
||||||
|
Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
copy of this software and associated documentation files (the "Software"),
|
||||||
|
to deal in the Software without restriction, including without limitation
|
||||||
|
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
Software is furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
|
@ -0,0 +1,94 @@
|
||||||
|
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
# copy of this software and associated documentation files (the "Software"),
|
||||||
|
# to deal in the Software without restriction, including without limitation
|
||||||
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
# and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
# Software is furnished to do so, subject to the following conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be included in
|
||||||
|
# all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
# DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
prefix ?= /usr/local
|
||||||
|
exec_prefix ?= $(prefix)
|
||||||
|
libdir ?= $(exec_prefix)/lib
|
||||||
|
bindir ?= $(exec_prefix)/bin
|
||||||
|
includedir ?= $(prefix)/include
|
||||||
|
|
||||||
|
DESTDIR := $(abspath $(DESTDIR))
|
||||||
|
DESTLIB = $(DESTDIR)$(libdir)
|
||||||
|
DESTBIN = $(DESTDIR)$(bindir)
|
||||||
|
DESTINC = $(DESTDIR)$(includedir)
|
||||||
|
|
||||||
|
CUDA ?= /usr/local/cuda
|
||||||
|
|
||||||
|
LIB_MAJOR_VER ?= $(shell awk '/\#define GDR_API_MAJOR_VERSION/ { print $$3 }' include/gdrapi.h | tr -d '\n')
|
||||||
|
LIB_MINOR_VER ?= $(shell awk '/\#define GDR_API_MINOR_VERSION/ { print $$3 }' include/gdrapi.h | tr -d '\n')
|
||||||
|
|
||||||
|
GDRAPI_ARCH := $(shell ./config_arch)
|
||||||
|
GDRAPI_INC := ../include
|
||||||
|
|
||||||
|
LIB_VER:=$(LIB_MAJOR_VER).$(LIB_MINOR_VER)
|
||||||
|
LIB_BASENAME:=libgdrapi.so
|
||||||
|
LIB_DYNAMIC=$(LIB_BASENAME).$(LIB_VER)
|
||||||
|
LIB_SONAME=$(LIB_BASENAME).$(LIB_MAJOR_VER)
|
||||||
|
|
||||||
|
all: config driver lib exes
|
||||||
|
|
||||||
|
version:
|
||||||
|
@ echo "$(LIB_VER)"
|
||||||
|
|
||||||
|
config:
|
||||||
|
@ echo "GDRAPI_ARCH=$(GDRAPI_ARCH)"
|
||||||
|
|
||||||
|
driver:
|
||||||
|
cd src/gdrdrv && \
|
||||||
|
$(MAKE) $(MAKE_PARAMS)
|
||||||
|
|
||||||
|
lib:
|
||||||
|
cd src && \
|
||||||
|
$(MAKE) LIB_MAJOR_VER=$(LIB_MAJOR_VER) LIB_MINOR_VER=$(LIB_MINOR_VER)
|
||||||
|
|
||||||
|
exes: lib
|
||||||
|
cd tests && \
|
||||||
|
$(MAKE) CUDA=$(CUDA)
|
||||||
|
|
||||||
|
install: lib_install exes_install
|
||||||
|
|
||||||
|
lib_install: lib
|
||||||
|
@ echo "installing in $(DESTLIB) $(DESTINC)..." && \
|
||||||
|
mkdir -p $(DESTLIB) && \
|
||||||
|
install -D -v -m u=rwx,g=rx,o=rx src/$(LIB_DYNAMIC) -t $(DESTLIB) && \
|
||||||
|
mkdir -p $(DESTINC) && \
|
||||||
|
install -D -v -m u=rw,g=rw,o=r include/* -t $(DESTINC); \
|
||||||
|
cd $(DESTLIB); \
|
||||||
|
ln -sf $(LIB_DYNAMIC) $(LIB_SONAME); \
|
||||||
|
ln -sf $(LIB_SONAME) $(LIB_BASENAME);
|
||||||
|
|
||||||
|
exes_install: exes
|
||||||
|
cd tests && $(MAKE) install DESTBIN=$(DESTBIN)
|
||||||
|
|
||||||
|
|
||||||
|
drv_install: driver
|
||||||
|
cd src/gdrdrv && \
|
||||||
|
$(MAKE) install
|
||||||
|
|
||||||
|
clean:
|
||||||
|
cd tests && \
|
||||||
|
$(MAKE) clean
|
||||||
|
cd src && \
|
||||||
|
$(MAKE) clean
|
||||||
|
cd src/gdrdrv && \
|
||||||
|
$(MAKE) clean
|
||||||
|
|
||||||
|
.PHONY: driver clean all lib exes lib_install drv_install exes_install install
|
||||||
|
|
||||||
|
|
@ -0,0 +1,495 @@
|
||||||
|
# GDRCopy
|
||||||
|
|
||||||
|
A low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA
|
||||||
|
technology.
|
||||||
|
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
While GPUDirect RDMA is meant for direct access to GPU memory from
|
||||||
|
third-party devices, it is possible to use these same APIs to create
|
||||||
|
perfectly valid CPU mappings of the GPU memory.
|
||||||
|
|
||||||
|
The advantage of a CPU driven copy is the very small overhead
|
||||||
|
involved. That might be useful when low latencies are required.
|
||||||
|
|
||||||
|
|
||||||
|
## What is inside
|
||||||
|
|
||||||
|
GDRCopy offers the infrastructure to create user-space mappings of GPU memory,
|
||||||
|
which can then be manipulated as if it were plain host memory (caveats apply
|
||||||
|
here).
|
||||||
|
|
||||||
|
A simple by-product of it is a copy library with the following characteristics:
|
||||||
|
- very low overhead, as it is driven by the CPU. As a reference, currently a
cudaMemcpy can incur a 6-7us overhead.
|
||||||
|
|
||||||
|
- An initial memory *pinning* phase is required, which is potentially expensive,
|
||||||
|
10us-1ms depending on the buffer size.
|
||||||
|
|
||||||
|
- Fast H-D, because of write-combining. H-D bandwidth is 6-8GB/s on Ivy
|
||||||
|
Bridge Xeon but it is subject to NUMA effects.
|
||||||
|
|
||||||
|
- Slow D-H, because the GPU BAR, which backs the mappings, can't be
prefetched and so burst read transactions are not generated through
PCIe.
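As an orientation, here is a minimal sketch of the typical `libgdrapi` call
sequence (pin, map, copy, tear down). Error checking is omitted, and the CUDA
setup, buffer size, and variable names are illustrative assumptions rather than
part of the library:

```c
#include <cuda.h>
#include "gdrapi.h"

int main(void)
{
    const size_t size = 64 * 1024;               /* multiple of the 64 KiB GPU page */
    CUdevice dev; CUcontext ctx; CUdeviceptr d_ptr;

    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    cuMemAlloc(&d_ptr, size);                    /* regular CUDA device memory */

    gdr_t g = gdr_open();                        /* connect to the gdrdrv driver */
    gdr_mh_t mh;
    gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh);   /* create the peer-to-peer mapping */

    void *map_ptr;
    gdr_map(g, mh, &map_ptr, size);              /* expose it to user space */

    gdr_info_t info;
    gdr_get_info(g, mh, &info);                  /* account for page-alignment offset */
    char *buf = (char *)map_ptr + (d_ptr - info.va);

    char msg[] = "hello";
    gdr_copy_to_mapping(mh, buf, msg, sizeof(msg));   /* CPU-driven H->D copy */

    gdr_unmap(g, mh, map_ptr, size);             /* tear down in reverse order */
    gdr_unpin_buffer(g, mh);
    gdr_close(g);
    cuMemFree(d_ptr);
    return 0;
}
```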
|
||||||
|
|
||||||
|
The library comes with a few tests like:
|
||||||
|
- gdrcopy_sanity, which contains unit tests for the library and the driver.
|
||||||
|
- gdrcopy_copybw, a minimal application which calculates the R/W bandwidth for a specific buffer size.
|
||||||
|
- gdrcopy_copylat, a benchmark application which calculates the R/W copy latency for a range of buffer sizes.
|
||||||
|
- gdrcopy_apiperf, an application for benchmarking the latency of each GDRCopy API call.
|
||||||
|
- gdrcopy_pplat, a benchmark application which calculates the round-trip ping-pong latency between GPU and CPU.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
GPUDirect RDMA requires NVIDIA Tesla or Quadro class GPUs based on Kepler,
|
||||||
|
Pascal, Volta, or Turing, see [GPUDirect
|
||||||
|
RDMA](http://developer.nvidia.com/gpudirect). For more technical information,
|
||||||
|
please refer to the official GPUDirect RDMA [design
|
||||||
|
document](http://docs.nvidia.com/cuda/gpudirect-rdma).
|
||||||
|
|
||||||
|
The device driver requires GPU display driver >= 418.40 on ppc64le and >= 331.14 on other platforms. The library and tests
|
||||||
|
require CUDA >= 6.0.
|
||||||
|
|
||||||
|
DKMS is a prerequisite for installing the GDRCopy kernel module package. On RHEL
or SLE,
however, users have the option to build a kmod package and install it instead of the DKMS
package. See the [Build and installation](#build-and-installation) section for more details.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# On RHEL
|
||||||
|
# dkms can be installed from epel-release. See https://fedoraproject.org/wiki/EPEL.
|
||||||
|
$ sudo yum install dkms
|
||||||
|
|
||||||
|
# On Debian - No additional dependency
|
||||||
|
|
||||||
|
# On SLE / Leap
|
||||||
|
# On SLE dkms can be installed from PackageHub.
|
||||||
|
$ sudo zypper install dkms rpmbuild
|
||||||
|
```
|
||||||
|
|
||||||
|
CUDA and GPU display driver must be installed before building and/or installing GDRCopy.
|
||||||
|
The installation instructions can be found in https://developer.nvidia.com/cuda-downloads.
|
||||||
|
|
||||||
|
GPU display driver header files are also required. They are installed as a part
|
||||||
|
of the driver (or CUDA) installation with *runfile*. If you install the driver
|
||||||
|
via package management, we suggest
|
||||||
|
- On RHEL, `sudo dnf module install nvidia-driver:latest-dkms`.
|
||||||
|
- On Debian, `sudo apt install nvidia-dkms-<your-nvidia-driver-version>`.
|
||||||
|
- On SLE, `sudo zypper install nvidia-gfx<your-nvidia-driver-version>-kmp`.
|
||||||
|
|
||||||
|
The supported architectures are Linux x86\_64, ppc64le, and arm64. The supported
|
||||||
|
platforms are RHEL8, RHEL9, Ubuntu20\_04, Ubuntu22\_04,
|
||||||
|
SLE-15 (any SP) and Leap 15.x.
|
||||||
|
|
||||||
|
Root privileges are necessary to load/install the kernel-mode device
|
||||||
|
driver.
|
||||||
|
|
||||||
|
|
||||||
|
## Build and installation
|
||||||
|
|
||||||
|
We provide three ways for building and installing GDRCopy.
|
||||||
|
|
||||||
|
### rpm package
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# For RHEL:
|
||||||
|
$ sudo yum groupinstall 'Development Tools'
|
||||||
|
$ sudo yum install dkms rpm-build make
|
||||||
|
|
||||||
|
# For SLE:
|
||||||
|
$ sudo zypper in dkms rpmbuild
|
||||||
|
|
||||||
|
$ cd packages
|
||||||
|
$ CUDA=<cuda-install-top-dir> ./build-rpm-packages.sh
|
||||||
|
$ sudo rpm -Uvh gdrcopy-kmod-<version>dkms.noarch.<platform>.rpm
|
||||||
|
$ sudo rpm -Uvh gdrcopy-<version>.<arch>.<platform>.rpm
|
||||||
|
$ sudo rpm -Uvh gdrcopy-devel-<version>.noarch.<platform>.rpm
|
||||||
|
```
|
||||||
|
The DKMS package is the default kernel module package that `build-rpm-packages.sh`
generates. To create the kmod package, the `-m` option must be passed to the script.
Unlike the DKMS package, the kmod package contains a prebuilt GDRCopy kernel
module which is specific to the NVIDIA driver version and the Linux kernel
version used to build it.
|
||||||
|
|
||||||
|
|
||||||
|
### deb package
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ sudo apt install build-essential devscripts debhelper fakeroot pkg-config dkms
|
||||||
|
$ cd packages
|
||||||
|
$ CUDA=<cuda-install-top-dir> ./build-deb-packages.sh
|
||||||
|
$ sudo dpkg -i gdrdrv-dkms_<version>_<arch>.<platform>.deb
|
||||||
|
$ sudo dpkg -i libgdrapi_<version>_<arch>.<platform>.deb
|
||||||
|
$ sudo dpkg -i gdrcopy-tests_<version>_<arch>.<platform>.deb
|
||||||
|
$ sudo dpkg -i gdrcopy_<version>_<arch>.<platform>.deb
|
||||||
|
```
|
||||||
|
|
||||||
|
### from source
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ make prefix=<install-to-this-location> CUDA=<cuda-install-top-dir> all install
|
||||||
|
$ sudo ./insmod.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
|
||||||
|
Compiling the gdrdrv driver requires the NVIDIA driver source code, which is typically installed at
`/usr/src/nvidia-<version>`. Our makefile automatically detects and picks up that source code. In case there are multiple
versions installed, it is possible to pass the correct path by defining the NVIDIA_SRC_DIR variable, e.g. `export
NVIDIA_SRC_DIR=/usr/src/nvidia-520.61.05/nvidia` before building the gdrdrv module.
|
||||||
|
|
||||||
|
There are two major flavors of NVIDIA driver: 1) proprietary, and 2)
|
||||||
|
[opensource](https://developer.nvidia.com/blog/nvidia-releases-open-source-gpu-kernel-modules/). We detect the flavor
|
||||||
|
when compiling gdrdrv based on the source code of the NVIDIA driver. Different flavors come with different features and
|
||||||
|
restrictions:
|
||||||
|
- gdrdrv compiled with the opensource flavor will provide functionality and high performance on all platforms. However,
|
||||||
|
you will not be able to load this gdrdrv driver when the proprietary NVIDIA driver is loaded.
|
||||||
|
- gdrdrv compiled with the proprietary flavor can always be loaded regardless of the flavor of NVIDIA driver you have
|
||||||
|
loaded. However, it may have suboptimal performance on coherent platforms such as Grace-Hopper. Functionally, it will not
|
||||||
|
work correctly on Intel CPUs with Linux kernel built with confidential compute (CC) support, i.e.
|
||||||
|
`CONFIG_ARCH_HAS_CC_PLATFORM=y`, *WHEN* CC is enabled at runtime.
|
||||||
|
|
||||||
|
|
||||||
|
## Tests
|
||||||
|
|
||||||
|
Execute provided tests:
|
||||||
|
```shell
|
||||||
|
$ gdrcopy_sanity
|
||||||
|
Total: 28, Passed: 28, Failed: 0, Waived: 0
|
||||||
|
|
||||||
|
List of passed tests:
|
||||||
|
basic_child_thread_pins_buffer_cumemalloc
|
||||||
|
basic_child_thread_pins_buffer_vmmalloc
|
||||||
|
basic_cumemalloc
|
||||||
|
basic_small_buffers_mapping
|
||||||
|
basic_unaligned_mapping
|
||||||
|
basic_vmmalloc
|
||||||
|
basic_with_tokens
|
||||||
|
data_validation_cumemalloc
|
||||||
|
data_validation_vmmalloc
|
||||||
|
invalidation_access_after_free_cumemalloc
|
||||||
|
invalidation_access_after_free_vmmalloc
|
||||||
|
invalidation_access_after_gdr_close_cumemalloc
|
||||||
|
invalidation_access_after_gdr_close_vmmalloc
|
||||||
|
invalidation_fork_access_after_free_cumemalloc
|
||||||
|
invalidation_fork_access_after_free_vmmalloc
|
||||||
|
invalidation_fork_after_gdr_map_cumemalloc
|
||||||
|
invalidation_fork_after_gdr_map_vmmalloc
|
||||||
|
invalidation_fork_child_gdr_map_parent_cumemalloc
|
||||||
|
invalidation_fork_child_gdr_map_parent_vmmalloc
|
||||||
|
invalidation_fork_child_gdr_pin_parent_with_tokens
|
||||||
|
invalidation_fork_map_and_free_cumemalloc
|
||||||
|
invalidation_fork_map_and_free_vmmalloc
|
||||||
|
invalidation_two_mappings_cumemalloc
|
||||||
|
invalidation_two_mappings_vmmalloc
|
||||||
|
invalidation_unix_sock_shared_fd_gdr_map_cumemalloc
|
||||||
|
invalidation_unix_sock_shared_fd_gdr_map_vmmalloc
|
||||||
|
invalidation_unix_sock_shared_fd_gdr_pin_buffer_cumemalloc
|
||||||
|
invalidation_unix_sock_shared_fd_gdr_pin_buffer_vmmalloc
|
||||||
|
|
||||||
|
|
||||||
|
$ gdrcopy_copybw
|
||||||
|
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
|
||||||
|
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
|
||||||
|
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
|
||||||
|
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
|
||||||
|
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
|
||||||
|
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
|
||||||
|
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
|
||||||
|
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
|
||||||
|
selecting device 0
|
||||||
|
testing size: 131072
|
||||||
|
rounded size: 131072
|
||||||
|
gpu alloc fn: cuMemAlloc
|
||||||
|
device ptr: 7f1153a00000
|
||||||
|
map_d_ptr: 0x7f1172257000
|
||||||
|
info.va: 7f1153a00000
|
||||||
|
info.mapped_size: 131072
|
||||||
|
info.page_size: 65536
|
||||||
|
info.mapped: 1
|
||||||
|
info.wc_mapping: 1
|
||||||
|
page offset: 0
|
||||||
|
user-space pointer:0x7f1172257000
|
||||||
|
writing test, size=131072 offset=0 num_iters=10000
|
||||||
|
write BW: 9638.54MB/s
|
||||||
|
reading test, size=131072 offset=0 num_iters=100
|
||||||
|
read BW: 530.135MB/s
|
||||||
|
unmapping buffer
|
||||||
|
unpinning buffer
|
||||||
|
closing gdrdrv
|
||||||
|
|
||||||
|
|
||||||
|
$ gdrcopy_copylat
|
||||||
|
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
|
||||||
|
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
|
||||||
|
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
|
||||||
|
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
|
||||||
|
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
|
||||||
|
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
|
||||||
|
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
|
||||||
|
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
|
||||||
|
selecting device 0
|
||||||
|
device ptr: 0x7fa2c6000000
|
||||||
|
allocated size: 16777216
|
||||||
|
gpu alloc fn: cuMemAlloc
|
||||||
|
|
||||||
|
map_d_ptr: 0x7fa2f9af9000
|
||||||
|
info.va: 7fa2c6000000
|
||||||
|
info.mapped_size: 16777216
|
||||||
|
info.page_size: 65536
|
||||||
|
info.mapped: 1
|
||||||
|
info.wc_mapping: 1
|
||||||
|
page offset: 0
|
||||||
|
user-space pointer: 0x7fa2f9af9000
|
||||||
|
|
||||||
|
gdr_copy_to_mapping num iters for each size: 10000
|
||||||
|
WARNING: Measuring the API invocation overhead as observed by the CPU. Data
|
||||||
|
might not be ordered all the way to the GPU internal visibility.
|
||||||
|
Test Size(B) Avg.Time(us)
|
||||||
|
gdr_copy_to_mapping 1 0.0889
|
||||||
|
gdr_copy_to_mapping 2 0.0884
|
||||||
|
gdr_copy_to_mapping 4 0.0884
|
||||||
|
gdr_copy_to_mapping 8 0.0884
|
||||||
|
gdr_copy_to_mapping 16 0.0905
|
||||||
|
gdr_copy_to_mapping 32 0.0902
|
||||||
|
gdr_copy_to_mapping 64 0.0902
|
||||||
|
gdr_copy_to_mapping 128 0.0952
|
||||||
|
gdr_copy_to_mapping 256 0.0983
|
||||||
|
gdr_copy_to_mapping 512 0.1176
|
||||||
|
gdr_copy_to_mapping 1024 0.1825
|
||||||
|
gdr_copy_to_mapping 2048 0.2549
|
||||||
|
gdr_copy_to_mapping 4096 0.4366
|
||||||
|
gdr_copy_to_mapping 8192 0.8141
|
||||||
|
gdr_copy_to_mapping 16384 1.6155
|
||||||
|
gdr_copy_to_mapping 32768 3.2284
|
||||||
|
gdr_copy_to_mapping 65536 6.4906
|
||||||
|
gdr_copy_to_mapping 131072 12.9761
|
||||||
|
gdr_copy_to_mapping 262144 25.9459
|
||||||
|
gdr_copy_to_mapping 524288 51.9100
|
||||||
|
gdr_copy_to_mapping 1048576 103.8028
|
||||||
|
gdr_copy_to_mapping 2097152 207.5990
|
||||||
|
gdr_copy_to_mapping 4194304 415.2856
|
||||||
|
gdr_copy_to_mapping 8388608 830.6355
|
||||||
|
gdr_copy_to_mapping 16777216 1661.3285
|
||||||
|
|
||||||
|
gdr_copy_from_mapping num iters for each size: 100
|
||||||
|
Test Size(B) Avg.Time(us)
|
||||||
|
gdr_copy_from_mapping 1 0.9069
|
||||||
|
gdr_copy_from_mapping 2 1.7170
|
||||||
|
gdr_copy_from_mapping 4 1.7169
|
||||||
|
gdr_copy_from_mapping 8 1.7164
|
||||||
|
gdr_copy_from_mapping 16 0.8601
|
||||||
|
gdr_copy_from_mapping 32 1.7024
|
||||||
|
gdr_copy_from_mapping 64 3.1016
|
||||||
|
gdr_copy_from_mapping 128 3.4944
|
||||||
|
gdr_copy_from_mapping 256 3.6400
|
||||||
|
gdr_copy_from_mapping 512 2.4394
|
||||||
|
gdr_copy_from_mapping 1024 2.8022
|
||||||
|
gdr_copy_from_mapping 2048 4.6615
|
||||||
|
gdr_copy_from_mapping 4096 7.9783
|
||||||
|
gdr_copy_from_mapping 8192 14.9209
|
||||||
|
gdr_copy_from_mapping 16384 28.9571
|
||||||
|
gdr_copy_from_mapping 32768 56.9373
|
||||||
|
gdr_copy_from_mapping 65536 114.1008
|
||||||
|
gdr_copy_from_mapping 131072 234.9382
|
||||||
|
gdr_copy_from_mapping 262144 496.4011
|
||||||
|
gdr_copy_from_mapping 524288 985.5196
|
||||||
|
gdr_copy_from_mapping 1048576 1970.7057
|
||||||
|
gdr_copy_from_mapping 2097152 3942.5611
|
||||||
|
gdr_copy_from_mapping 4194304 7888.9468
|
||||||
|
gdr_copy_from_mapping 8388608 18361.5673
|
||||||
|
gdr_copy_from_mapping 16777216 36758.8342
|
||||||
|
unmapping buffer
|
||||||
|
unpinning buffer
|
||||||
|
closing gdrdrv
|
||||||
|
|
||||||
|
|
||||||
|
$ gdrcopy_apiperf -s 8
|
||||||
|
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
|
||||||
|
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
|
||||||
|
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
|
||||||
|
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
|
||||||
|
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
|
||||||
|
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
|
||||||
|
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
|
||||||
|
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
|
||||||
|
selecting device 0
|
||||||
|
device ptr: 0x7f1563a00000
|
||||||
|
allocated size: 65536
|
||||||
|
Size(B) pin.Time(us) map.Time(us) get_info.Time(us) unmap.Time(us)
|
||||||
|
unpin.Time(us)
|
||||||
|
65536 1346.034060 3.603800 0.340270 4.700930 676.612800
|
||||||
|
Histogram of gdr_pin_buffer latency for 65536 bytes
|
||||||
|
[1303.852000 - 2607.704000] 93
|
||||||
|
[2607.704000 - 3911.556000] 0
|
||||||
|
[3911.556000 - 5215.408000] 0
|
||||||
|
[5215.408000 - 6519.260000] 0
|
||||||
|
[6519.260000 - 7823.112000] 0
|
||||||
|
[7823.112000 - 9126.964000] 0
|
||||||
|
[9126.964000 - 10430.816000] 0
|
||||||
|
[10430.816000 - 11734.668000] 0
|
||||||
|
[11734.668000 - 13038.520000] 0
|
||||||
|
[13038.520000 - 14342.372000] 2
|
||||||
|
|
||||||
|
closing gdrdrv
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
$ numactl -N 1 -l gdrcopy_pplat
|
||||||
|
GPU id:0; name: NVIDIA A40; Bus id: 0000:09:00
|
||||||
|
selecting device 0
|
||||||
|
device ptr: 0x7f99d2600000
|
||||||
|
gpu alloc fn: cuMemAlloc
|
||||||
|
map_d_ptr: 0x7f9a054fb000
|
||||||
|
info.va: 7f99d2600000
|
||||||
|
info.mapped_size: 4
|
||||||
|
info.page_size: 65536
|
||||||
|
info.mapped: 1
|
||||||
|
info.wc_mapping: 1
|
||||||
|
page offset: 0
|
||||||
|
user-space pointer: 0x7f9a054fb000
|
||||||
|
CPU does gdr_copy_to_mapping and GPU writes back via cuMemHostAlloc'd buffer.
|
||||||
|
Running 1000 iterations with data size 4 bytes.
|
||||||
|
Round-trip latency per iteration is 1.08762 us
|
||||||
|
unmapping buffer
|
||||||
|
unpinning buffer
|
||||||
|
closing gdrdrv
|
||||||
|
```
|
||||||
|
|
||||||
|
## NUMA effects
|
||||||
|
|
||||||
|
Depending on the platform architecture, such as where the GPUs are placed in
the PCIe topology, performance may suffer if the processor which is driving
the copy is not the one hosting the GPU, for example in a
multi-socket server.
|
||||||
|
|
||||||
|
In the example below, GPU ID 0 is hosted by
|
||||||
|
CPU socket 0. By explicitly playing with the OS process and memory
|
||||||
|
affinity, it is possible to run the test on the optimal processor:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ numactl -N 0 -l gdrcopy_copybw -d 0 -s $((64 * 1024)) -o $((0 * 1024)) -c $((64 * 1024))
|
||||||
|
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
|
||||||
|
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
|
||||||
|
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
|
||||||
|
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
|
||||||
|
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
|
||||||
|
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
|
||||||
|
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
|
||||||
|
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
|
||||||
|
selecting device 0
|
||||||
|
testing size: 65536
|
||||||
|
rounded size: 65536
|
||||||
|
gpu alloc fn: cuMemAlloc
|
||||||
|
device ptr: 7f5817a00000
|
||||||
|
map_d_ptr: 0x7f583b186000
|
||||||
|
info.va: 7f5817a00000
|
||||||
|
info.mapped_size: 65536
|
||||||
|
info.page_size: 65536
|
||||||
|
info.mapped: 1
|
||||||
|
info.wc_mapping: 1
|
||||||
|
page offset: 0
|
||||||
|
user-space pointer:0x7f583b186000
|
||||||
|
writing test, size=65536 offset=0 num_iters=1000
|
||||||
|
write BW: 9768.3MB/s
|
||||||
|
reading test, size=65536 offset=0 num_iters=1000
|
||||||
|
read BW: 548.423MB/s
|
||||||
|
unmapping buffer
|
||||||
|
unpinning buffer
|
||||||
|
closing gdrdrv
|
||||||
|
```
|
||||||
|
|
||||||
|
or on the other socket:
|
||||||
|
```shell
|
||||||
|
$ numactl -N 1 -l gdrcopy_copybw -d 0 -s $((64 * 1024)) -o $((0 * 1024)) -c $((64 * 1024))
|
||||||
|
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
|
||||||
|
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
|
||||||
|
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
|
||||||
|
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
|
||||||
|
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
|
||||||
|
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
|
||||||
|
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
|
||||||
|
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
|
||||||
|
selecting device 0
|
||||||
|
testing size: 65536
|
||||||
|
rounded size: 65536
|
||||||
|
gpu alloc fn: cuMemAlloc
|
||||||
|
device ptr: 7fbb63a00000
|
||||||
|
map_d_ptr: 0x7fbb82ab0000
|
||||||
|
info.va: 7fbb63a00000
|
||||||
|
info.mapped_size: 65536
|
||||||
|
info.page_size: 65536
|
||||||
|
info.mapped: 1
|
||||||
|
info.wc_mapping: 1
|
||||||
|
page offset: 0
|
||||||
|
user-space pointer:0x7fbb82ab0000
|
||||||
|
writing test, size=65536 offset=0 num_iters=1000
|
||||||
|
write BW: 9224.36MB/s
|
||||||
|
reading test, size=65536 offset=0 num_iters=1000
|
||||||
|
read BW: 521.262MB/s
|
||||||
|
unmapping buffer
|
||||||
|
unpinning buffer
|
||||||
|
closing gdrdrv
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Restrictions and known issues
|
||||||
|
|
||||||
|
GDRCopy works with regular CUDA device memory only, as returned by cudaMalloc.
|
||||||
|
In particular, it does not work with CUDA managed memory.
|
||||||
|
|
||||||
|
`gdr_pin_buffer()` accepts any address returned by cudaMalloc and its family.
In contrast, `gdr_map()` requires that the pinned address is aligned to the GPU page.
Neither the CUDA Runtime nor the Driver API guarantees that GPU memory allocation
functions return aligned addresses. Users are responsible for proper alignment
of addresses passed to the library, as sketched below.
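For illustration, one way to satisfy this requirement is to round the address
down and the size up to the GPU page boundary before pinning, using the
`GPU_PAGE_*` macros from `gdrapi.h` (the helper name and signature below are
illustrative):

```c
#include "gdrapi.h"

/* Round an arbitrary device range [addr, addr + size) outwards to
 * GPU-page boundaries so that gdr_pin_buffer()/gdr_map() accept it. */
static void gpu_page_align(unsigned long addr, size_t size,
                           unsigned long *aligned_addr, size_t *aligned_size)
{
    *aligned_addr = addr & GPU_PAGE_MASK;                   /* round down */
    *aligned_size = (addr + size - *aligned_addr
                     + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;  /* round up  */
}
```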
|
||||||
|
|
||||||
|
Two cudaMalloc'd memory regions may be contiguous. Users may call
`gdr_pin_buffer` and `gdr_map` with an address and size that extend across these
two regions. This use case is not well-supported in GDRCopy. On rare occasions,
users may experience 1.) an error in `gdr_map`, or 2.) low copy performance
because `gdr_map` cannot provide a write-combined mapping.
|
||||||
|
|
||||||
|
In some GPU driver versions, pinning the same GPU address multiple times
|
||||||
|
consumes additional BAR1 space. This is because the space is not properly
|
||||||
|
reused. If you encounter this issue, we suggest that you try the latest version
|
||||||
|
of NVIDIA GPU driver.
|
||||||
|
|
||||||
|
On POWER9, where the CPU and GPU are connected via NVLink, CUDA 9.2 and GPU Driver
v396.37 are the minimum requirements in order to achieve the full performance.
GDRCopy works with earlier CUDA and GPU driver versions but the achievable
bandwidth is substantially lower.
|
||||||
|
|
||||||
|
If gdrdrv is compiled with the proprietary flavor of the NVIDIA driver, GDRCopy does not fully support Linux with the
confidential computing (CC) configuration on Intel CPUs. In particular, it does not function if
`CONFIG_ARCH_HAS_CC_PLATFORM=y` and CC is enabled at runtime. However, it works if CC is disabled or
`CONFIG_ARCH_HAS_CC_PLATFORM=n`. This issue does not apply to AMD CPUs. To avoid this issue, please compile and load
gdrdrv with the opensource flavor of the NVIDIA driver.
|
||||||
|
|
||||||
|
To allow the loading of unsupported 3rd party modules in SLE, set `allow_unsupported_modules 1` in
/etc/modprobe.d/unsupported-modules. After making this change, modules missing the "supported" flag will be allowed to
load.
|
||||||
|
|
||||||
|
|
||||||
|
## Bug filing
|
||||||
|
|
||||||
|
To report issues you may be having with any NVIDIA software, or to report
suspected bugs, we recommend that you use the bug filing system
which is available to registered NVIDIA developers on the developer site.
|
||||||
|
|
||||||
|
If you are not a member you can [sign
|
||||||
|
up](https://developer.nvidia.com/accelerated-computing-developer).
|
||||||
|
|
||||||
|
Once a member, you can submit issues using [this
|
||||||
|
form](https://developer.nvidia.com/nvbugs/cuda/add). Be sure to select
|
||||||
|
GPUDirect in the "Relevant Area" field.
|
||||||
|
|
||||||
|
You can later track their progress using the __My Bugs__ link on the left of
|
||||||
|
this [view](https://developer.nvidia.com/user).
|
||||||
|
|
||||||
|
## Acknowledgment
|
||||||
|
|
||||||
|
If you find this software useful in your work, please cite:
|
||||||
|
R. Shi et al., "Designing efficient small message transfer mechanism for inter-node MPI communication on InfiniBand GPU clusters," 2014 21st International Conference on High Performance Computing (HiPC), Dona Paula, 2014, pp. 1-10, doi: 10.1109/HiPC.2014.7116873.
|
||||||
|
|
@ -0,0 +1,46 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
# copy of this software and associated documentation files (the "Software"),
|
||||||
|
# to deal in the Software without restriction, including without limitation
|
||||||
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
# and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
# Software is furnished to do so, subject to the following conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be included in
|
||||||
|
# all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
# DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
topdir="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
|
||||||
|
dir=$(mktemp -d)
|
||||||
|
src=$dir/arch.c
|
||||||
|
exe=$dir/arch
|
||||||
|
cat <<EOF >$src
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "gdrconfig.h"
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
#ifdef GDRAPI_X86
|
||||||
|
printf("X86\n");
|
||||||
|
#elif defined(GDRAPI_POWER)
|
||||||
|
printf("POWER\n");
|
||||||
|
#elif defined(GDRAPI_ARM64)
|
||||||
|
printf("ARM64\n");
|
||||||
|
#else
|
||||||
|
printf("ERROR\n");
|
||||||
|
#endif
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
EOF
|
||||||
|
|
||||||
|
gcc -I ${topdir}/include -I ${topdir}/src $src -o $exe
|
||||||
|
$exe
|
||||||
|
rm -rf $dir
|
||||||
|
|
@ -0,0 +1,154 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __GDRAPI_H__
|
||||||
|
#define __GDRAPI_H__
|
||||||
|
|
||||||
|
#include <stdint.h> // for standard [u]intX_t types
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
#define MAJOR_VERSION_SHIFT 16
|
||||||
|
#define MINOR_VERSION_MASK (((uint32_t)1 << MAJOR_VERSION_SHIFT) - 1)
|
||||||
|
|
||||||
|
#define GDR_API_MAJOR_VERSION 2
|
||||||
|
#define GDR_API_MINOR_VERSION 4
|
||||||
|
#define GDR_API_VERSION ((GDR_API_MAJOR_VERSION << MAJOR_VERSION_SHIFT) | GDR_API_MINOR_VERSION)
|
||||||
|
|
||||||
|
#define MINIMUM_GDRDRV_MAJOR_VERSION 2
|
||||||
|
#define MINIMUM_GDRDRV_MINOR_VERSION 0
|
||||||
|
#define MINIMUM_GDRDRV_VERSION ((MINIMUM_GDRDRV_MAJOR_VERSION << MAJOR_VERSION_SHIFT) | MINIMUM_GDRDRV_MINOR_VERSION)
|
||||||
|
|
||||||
|
|
||||||
|
#define GPU_PAGE_SHIFT 16
|
||||||
|
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
|
||||||
|
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
|
||||||
|
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* GDRCopy, a low-latency GPU memory copy library (and a kernel-mode
|
||||||
|
* driver) based on NVIDIA GPUDirect RDMA technology.
|
||||||
|
*
|
||||||
|
* supported environment variables:
|
||||||
|
*
|
||||||
|
* - GDRCOPY_ENABLE_LOGGING, if defined logging is enabled, default is
|
||||||
|
* disabled.
|
||||||
|
*
|
||||||
|
* - GDRCOPY_LOG_LEVEL, overrides log threshold, default is to print errors
|
||||||
|
* only.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct gdr;
|
||||||
|
typedef struct gdr *gdr_t;
|
||||||
|
|
||||||
|
// Initialize the library, e.g. by opening a connection to the kernel-mode
|
||||||
|
// driver. Returns a handle to the library state object.
|
||||||
|
gdr_t gdr_open(void);
|
||||||
|
|
||||||
|
// Destroy library state object, e.g. it closes the connection to kernel-mode
|
||||||
|
// driver.
|
||||||
|
int gdr_close(gdr_t g);
|
||||||
|
|
||||||
|
// The handle to a user-space GPU memory mapping
|
||||||
|
typedef struct gdr_mh_s {
|
||||||
|
unsigned long h;
|
||||||
|
} gdr_mh_t;
|
||||||
|
|
||||||
|
// Create a peer-to-peer mapping of the device memory buffer, returning an opaque handle.
|
||||||
|
// Note that at this point the mapping is still not accessible to user-space.
|
||||||
|
int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
|
||||||
|
|
||||||
|
// Destroys the peer-to-peer mapping and frees the handle.
|
||||||
|
//
|
||||||
|
// If there exists a corresponding user-space mapping, gdr_unmap should be
|
||||||
|
// called before this one.
|
||||||
|
int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
|
||||||
|
|
||||||
|
// flag is set when the kernel callback (registered via
// nvidia_p2p_get_pages) has been invoked, e.g. because cuMemFree() was called
// before gdr_unpin_buffer.
|
||||||
|
int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag);
|
||||||
|
|
||||||
|
typedef enum gdr_mapping_type {
|
||||||
|
GDR_MAPPING_TYPE_NONE = 0,
|
||||||
|
GDR_MAPPING_TYPE_WC = 1,
|
||||||
|
GDR_MAPPING_TYPE_CACHING = 2,
|
||||||
|
GDR_MAPPING_TYPE_DEVICE = 3
|
||||||
|
} gdr_mapping_type_t;
|
||||||
|
|
||||||
|
// After pinning, info struct contains details of the mapped area.
|
||||||
|
//
|
||||||
|
// Note that both info->va and info->mapped_size might be different from
|
||||||
|
// the original address passed to gdr_pin_buffer due to alignment performed
// in the kernel-mode driver.
|
||||||
|
typedef struct gdr_info_v2 {
|
||||||
|
uint64_t va;
|
||||||
|
uint64_t mapped_size;
|
||||||
|
uint32_t page_size;
|
||||||
|
// tm_cycles and cycles_per_ms are deprecated and will be removed in future.
|
||||||
|
uint64_t tm_cycles;
|
||||||
|
uint32_t cycles_per_ms;
|
||||||
|
unsigned mapped:1;
|
||||||
|
unsigned wc_mapping:1;
|
||||||
|
gdr_mapping_type_t mapping_type;
|
||||||
|
} gdr_info_v2_t;
|
||||||
|
typedef gdr_info_v2_t gdr_info_t;
|
||||||
|
int gdr_get_info_v2(gdr_t g, gdr_mh_t handle, gdr_info_v2_t *info);
|
||||||
|
|
||||||
|
#define gdr_get_info gdr_get_info_v2
|
||||||
|
|
||||||
|
// Create a user-space mapping of the memory handle.
|
||||||
|
//
|
||||||
|
// WARNING: the address may be aligned to the boundary of the page size
// before being mapped in user-space, so the pointer returned might be
// affected by an offset. gdr_get_info can be used to calculate that
// offset.
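// For example (illustrative), with addr being the originally pinned address
// and va the pointer returned by gdr_map:
//   gdr_info_t info;
//   gdr_get_info(g, mh, &info);
//   char *ptr = (char *)va + (addr - info.va);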
|
||||||
|
int gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
|
||||||
|
|
||||||
|
// Get rid of a user-space mapping.
|
||||||
|
// First invoke gdr_unmap() then gdr_unpin_buffer().
|
||||||
|
int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
|
||||||
|
|
||||||
|
// map_d_ptr is the user-space virtual address belonging to a mapping of a device memory buffer,
|
||||||
|
// i.e. one returned by gdr_map()
|
||||||
|
//
|
||||||
|
// WARNING: Both integrity and ordering of data as observed by pre-launched GPU
|
||||||
|
// work is not guaranteed by this API. For more information, see
|
||||||
|
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#sync-behavior
|
||||||
|
int gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
|
||||||
|
|
||||||
|
int gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
|
||||||
|
|
||||||
|
// Query the version of libgdrapi
|
||||||
|
void gdr_runtime_get_version(int *major, int *minor);
|
||||||
|
|
||||||
|
// Query the version of gdrdrv driver
|
||||||
|
int gdr_driver_get_version(gdr_t g, int *major, int *minor);
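// For example (illustrative), checking that the loaded gdrdrv driver meets the
// minimum supported version:
//   int major, minor;
//   gdr_driver_get_version(g, &major, &minor);
//   if (((major << MAJOR_VERSION_SHIFT) | minor) < MINIMUM_GDRDRV_VERSION) {
//       /* gdrdrv driver is too old for this library */
//   }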
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // __GDRAPI_H__
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#if defined __GNUC__
|
||||||
|
#if defined(__powerpc__)
|
||||||
|
#define GDRAPI_POWER
|
||||||
|
#elif defined(__aarch64__)
|
||||||
|
#define GDRAPI_ARM64
|
||||||
|
#elif defined(__i386__) || defined(__x86_64__) || defined(__X86__)
|
||||||
|
#define GDRAPI_X86
|
||||||
|
#else
|
||||||
|
#error "architecture is not supported"
|
||||||
|
#endif // arch
|
||||||
|
#else
|
||||||
|
#error "compiler not supported"
|
||||||
|
#endif // __GNUC__
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
# copy of this software and associated documentation files (the "Software"),
|
||||||
|
# to deal in the Software without restriction, including without limitation
|
||||||
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
# and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
# Software is furnished to do so, subject to the following conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be included in
|
||||||
|
# all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
# DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
THIS_DIR=$(dirname $0)
|
||||||
|
|
||||||
|
# remove driver
|
||||||
|
grep gdrdrv /proc/devices >/dev/null && sudo /sbin/rmmod gdrdrv
|
||||||
|
|
||||||
|
# insert driver
|
||||||
|
sudo /sbin/insmod src/gdrdrv/gdrdrv.ko dbg_enabled=0 info_enabled=0 use_persistent_mapping=0
|
||||||
|
|
||||||
|
# create device inodes
|
||||||
|
major=`fgrep gdrdrv /proc/devices | cut -b 1-4`
|
||||||
|
echo "INFO: driver major is $major"
|
||||||
|
|
||||||
|
# remove old inodes just in case
|
||||||
|
if [ -e /dev/gdrdrv ]; then
|
||||||
|
sudo rm /dev/gdrdrv
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "INFO: creating /dev/gdrdrv inode"
|
||||||
|
sudo mknod /dev/gdrdrv c $major 0
|
||||||
|
sudo chmod a+w+r /dev/gdrdrv
|
||||||
|
|
@ -0,0 +1,247 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
# copy of this software and associated documentation files (the "Software"),
|
||||||
|
# to deal in the Software without restriction, including without limitation
|
||||||
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
# and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
# Software is furnished to do so, subject to the following conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be included in
|
||||||
|
# all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
# DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
|
||||||
|
# Restart this number at 1 if MAJOR_VERSION or MINOR_VERSION changes
|
||||||
|
# See https://www.debian.org/doc/debian-policy/ch-controlfields.html#version
|
||||||
|
DEBIAN_VERSION=1
|
||||||
|
|
||||||
|
SCRIPT_DIR_PATH="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
|
||||||
|
TOP_DIR_PATH="${SCRIPT_DIR_PATH}/.."
|
||||||
|
|
||||||
|
CWD=$(pwd)
|
||||||
|
|
||||||
|
skip_dep_check=0
|
||||||
|
build_test_package=1
|
||||||
|
build_driver_package=1
|
||||||
|
|
||||||
|
ex()
|
||||||
|
{
|
||||||
|
local rc
|
||||||
|
echo "+ $@"
|
||||||
|
$@
|
||||||
|
rc=$?
|
||||||
|
|
||||||
|
if [[ $rc -ne 0 ]]; then
|
||||||
|
echo "Failed with error $rc to execute: $@" >&2
|
||||||
|
exit $rc
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
function show_help
|
||||||
|
{
|
||||||
|
echo "Usage: [CUDA=<path>] $0 [-d] [-t] [-k] [-h]"
|
||||||
|
echo ""
|
||||||
|
echo " CUDA=<path> Set your installed CUDA path (ex. /usr/local/cuda)."
|
||||||
|
echo " -d Don't check build dependencies. Use my environment variables such as C_INCLUDE_PATH instead."
|
||||||
|
echo " -t Skip building gdrcopy-tests package."
|
||||||
|
echo " -k Skip building gdrdrv-dkms package."
|
||||||
|
echo " -h Show this help text."
|
||||||
|
echo ""
|
||||||
|
}
|
||||||
|
|
||||||
|
OPTIND=1 # Reset in case getopts has been used previously in the shell.
|
||||||
|
|
||||||
|
while getopts "hdtk" opt; do
|
||||||
|
case "${opt}" in
|
||||||
|
h)
|
||||||
|
show_help
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
|
d) skip_dep_check=1
|
||||||
|
;;
|
||||||
|
t) build_test_package=0
|
||||||
|
;;
|
||||||
|
k) build_driver_package=0
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
shift $((OPTIND-1))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if [[ ${build_test_package} == 1 ]] && [ "X$CUDA" == "X" ]; then
|
||||||
|
echo "CUDA environment variable is not defined"; exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
NVCC=${CUDA}/bin/nvcc
|
||||||
|
CUDA_VERSION=`$NVCC --version | grep release | sed 's/^.*release \([0-9]\+\.[0-9]\+\).*/\1/'`
|
||||||
|
CUDA_MAJOR=`echo ${CUDA_VERSION} | cut -d "." -f 1`
|
||||||
|
CUDA_MINOR=`echo ${CUDA_VERSION} | cut -d "." -f 2`
|
||||||
|
|
||||||
|
echo "Building debian package for the gdrcopy library ..."
|
||||||
|
|
||||||
|
ex cd ${SCRIPT_DIR_PATH}
|
||||||
|
|
||||||
|
MODULE_SUBDIR=$(awk '/MODULE_SUBDIR \?=/ { print $3 }' ${TOP_DIR_PATH}/src/gdrdrv/Makefile | tr -d '\n')
|
||||||
|
|
||||||
|
MAJOR_VERSION=$(awk '/#define GDR_API_MAJOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
|
||||||
|
MINOR_VERSION=$(awk '/#define GDR_API_MINOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
|
||||||
|
VERSION="${MAJOR_VERSION}.${MINOR_VERSION}.4"
|
||||||
|
if [ "X$VERSION" == "X" ]; then
|
||||||
|
echo "Failed to get version numbers!" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
#FULL_VERSION="${VERSION}-${DEBIAN_VERSION}"
|
||||||
|
FULL_VERSION="${VERSION}"
|
||||||
|
|
||||||
|
tmpdir=`mktemp -d /tmp/gdr.XXXXXX`
|
||||||
|
if [ ! -d "${tmpdir}" ]; then
|
||||||
|
echo "Failed to create a temp directory!" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Building gdrcopy debian packages version ${FULL_VERSION} ..."
|
||||||
|
|
||||||
|
echo "Working in ${tmpdir} ..."
|
||||||
|
|
||||||
|
ex cd ${TOP_DIR_PATH}
|
||||||
|
|
||||||
|
ex mkdir -p ${tmpdir}/gdrcopy
|
||||||
|
ex rm -rf ${tmpdir}/gdrcopy/*
|
||||||
|
ex cp -r Makefile README.md include src tests LICENSE config_arch ${tmpdir}/gdrcopy/
|
||||||
|
ex cp -r packages/debian-lib ${tmpdir}/gdrcopy/
|
||||||
|
ex cp -r packages/debian-tests ${tmpdir}/gdrcopy/
|
||||||
|
ex cp README.md ${tmpdir}/gdrcopy/debian-lib/README.Debian
|
||||||
|
ex cp README.md ${tmpdir}/gdrcopy/debian-lib/README.source
|
||||||
|
ex cp README.md ${tmpdir}/gdrcopy/debian-tests/README.Debian
|
||||||
|
ex cp README.md ${tmpdir}/gdrcopy/debian-tests/README.source
|
||||||
|
|
||||||
|
ex cd ${tmpdir}/gdrcopy
|
||||||
|
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
|
||||||
|
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
|
||||||
|
|
||||||
|
ex rm -f ${tmpdir}/libgdrapi_${VERSION}.orig.tar.gz
|
||||||
|
ex rm -f ${tmpdir}/gdrcopy-tests_${VERSION}.orig.tar.gz
|
||||||
|
|
||||||
|
ex cd ${tmpdir}
|
||||||
|
ex cp -r gdrcopy libgdrapi-${VERSION}
|
||||||
|
ex cd ${tmpdir}/libgdrapi-${VERSION}
|
||||||
|
ex mv debian-lib debian
|
||||||
|
ex rm -rf debian-*
|
||||||
|
|
||||||
|
ex cd ${tmpdir}
|
||||||
|
ex cp -r gdrcopy gdrcopy-tests-${VERSION}
|
||||||
|
ex cd ${tmpdir}/gdrcopy-tests-${VERSION}
|
||||||
|
ex mv debian-tests debian
|
||||||
|
ex rm -rf debian-*
|
||||||
|
|
||||||
|
ex cd ${tmpdir}
|
||||||
|
ex tar czvf libgdrapi_${VERSION}.orig.tar.gz libgdrapi-${VERSION}
|
||||||
|
ex tar czvf gdrcopy-tests_${VERSION}.orig.tar.gz gdrcopy-tests-${VERSION}
|
||||||
|
|
||||||
|
echo "Building libgdrapi package ..."
|
||||||
|
ex cd ${tmpdir}/libgdrapi-${VERSION}
|
||||||
|
debuild_params="--set-envvar=PKG_CONFIG_PATH=${PKG_CONFIG_PATH}"
|
||||||
|
if [ "${skip_dep_check}" -eq 1 ]; then
|
||||||
|
debuild_params+=" --preserve-env -d"
|
||||||
|
echo "Skip build dependency check. Use the environment variables instead ..."
|
||||||
|
fi
|
||||||
|
# --set-envvar needs to be placed before -us -uc
|
||||||
|
debuild_params+=" -us -uc"
|
||||||
|
ex debuild ${debuild_params}
|
||||||
|
|
||||||
|
if [[ ${build_test_package} == 1 ]]; then
|
||||||
|
echo
|
||||||
|
echo "Building gdrcopy-tests package ..."
|
||||||
|
ex cd ${tmpdir}/gdrcopy-tests-${VERSION}
|
||||||
|
debuild_params="--set-envvar=CUDA=${CUDA} --set-envvar=PKG_CONFIG_PATH=${PKG_CONFIG_PATH}"
|
||||||
|
if [ "${skip_dep_check}" -eq 1 ]; then
|
||||||
|
debuild_params+=" --preserve-env -d"
|
||||||
|
echo "Skip build dependency check. Use the environment variables instead ..."
|
||||||
|
fi
|
||||||
|
# --set-envvar needs to be placed before -us -uc
|
||||||
|
debuild_params+=" -us -uc"
|
||||||
|
ex debuild ${debuild_params}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ${build_driver_package} == 1 ]]; then
|
||||||
|
echo
|
||||||
|
echo "Building gdrdrv-dkms package ..."
|
||||||
|
ex cd ${tmpdir}/gdrcopy/src/gdrdrv
|
||||||
|
ex make clean
|
||||||
|
|
||||||
|
dkmsdir="${tmpdir}/gdrdrv-dkms-${VERSION}"
|
||||||
|
ex mkdir -p ${dkmsdir}
|
||||||
|
ex cp -r ${tmpdir}/gdrcopy/src/gdrdrv ${dkmsdir}/gdrdrv-${VERSION}
|
||||||
|
ex rm -rf ${dkmsdir}/gdrdrv-${VERSION}/debian-*
|
||||||
|
ex cp ${SCRIPT_DIR_PATH}/dkms.conf ${dkmsdir}/gdrdrv-${VERSION}/
|
||||||
|
ex cp -r ${TOP_DIR_PATH}/scripts ${dkmsdir}/gdrdrv-${VERSION}
|
||||||
|
ex cd ${dkmsdir}
|
||||||
|
ex cp -r ${SCRIPT_DIR_PATH}/dkms/* .
|
||||||
|
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
|
||||||
|
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
|
||||||
|
ex find . -type f -exec sed -i "s/@MODULE_LOCATION@/${MODULE_SUBDIR//\//\\/}/g" {} +
|
||||||
|
|
||||||
|
ex cd ${tmpdir}
|
||||||
|
ex tar czvf gdrdrv-dkms_${VERSION}.orig.tar.gz gdrdrv-dkms-${VERSION}
|
||||||
|
|
||||||
|
ex cd ${dkmsdir}
|
||||||
|
ex dpkg-buildpackage -rfakeroot -d -F -us -uc
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Building gdrcopy package ..."
|
||||||
|
metadir=${tmpdir}/gdrcopy-${VERSION}
|
||||||
|
ex mkdir -p ${metadir}
|
||||||
|
ex cd ${TOP_DIR_PATH}
|
||||||
|
ex cp -r packages/debian-meta ${metadir}/debian
|
||||||
|
ex cp README.md ${metadir}/debian/README.Debian
|
||||||
|
ex cp README.md ${metadir}/debian/README.source
|
||||||
|
ex cd ${metadir}
|
||||||
|
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
|
||||||
|
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
|
||||||
|
ex find . -type f -exec sed -i "s/@MODULE_LOCATION@/${MODULE_SUBDIR//\//\\/}/g" {} +
|
||||||
|
ex cd ${tmpdir}
|
||||||
|
ex tar czvf gdrcopy_${VERSION}.orig.tar.gz gdrcopy-${VERSION}
|
||||||
|
cd ${metadir}
|
||||||
|
ex debuild -us -uc
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Copying *.deb and supplementary files to the current working directory ..."
|
||||||
|
if $(hash lsb_release 2>/dev/null); then
|
||||||
|
release=`lsb_release -rs | sed -e "s/\./_/g"`
|
||||||
|
id=`lsb_release -is | sed -e "s/ /_/g"`
|
||||||
|
release=".${id}${release}"
|
||||||
|
else
|
||||||
|
release=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
ex cd ${CWD}
|
||||||
|
|
||||||
|
for item in `ls ${tmpdir}/*.deb`; do
|
||||||
|
item_name=`basename $item`
|
||||||
|
item_name=`echo $item_name | sed -e "s/\.deb//g"`
|
||||||
|
if echo "$item_name" | grep -q "tests"; then
|
||||||
|
item_name="${item_name}${release}+cuda${CUDA_MAJOR}.${CUDA_MINOR}.deb"
|
||||||
|
else
|
||||||
|
item_name="${item_name}${release}.deb"
|
||||||
|
fi
|
||||||
|
ex cp $item ./${item_name}
|
||||||
|
done
|
||||||
|
ex cp ${tmpdir}/*.tar.* .
|
||||||
|
ex cp ${tmpdir}/*.dsc .
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Cleaning up ..."
|
||||||
|
|
||||||
|
ex rm -rf ${tmpdir}
|
||||||
|
|
@ -0,0 +1,185 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
# copy of this software and associated documentation files (the "Software"),
|
||||||
|
# to deal in the Software without restriction, including without limitation
|
||||||
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
# and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
# Software is furnished to do so, subject to the following conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be included in
|
||||||
|
# all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
# DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
|
||||||
|
# Restart this number at 1 if MAJOR_VERSION or MINOR_VERSION changes
|
||||||
|
# See https://rpm-packaging-guide.github.io/#preamble-items
|
||||||
|
RPM_VERSION=1
|
||||||
|
|
||||||
|
SCRIPT_DIR_PATH="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
|
||||||
|
TOP_DIR_PATH="${SCRIPT_DIR_PATH}/.."
|
||||||
|
|
||||||
|
CWD=$(pwd)
|
||||||
|
|
||||||
|
ex()
|
||||||
|
{
|
||||||
|
local rc
|
||||||
|
echo "+ $@"
|
||||||
|
$@
|
||||||
|
rc=$?
|
||||||
|
|
||||||
|
if [[ $rc -ne 0 ]]; then
|
||||||
|
echo "Failed with error $rc to execute: $@" >&2
|
||||||
|
exit $rc
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
function show_help
|
||||||
|
{
|
||||||
|
echo "This script is for generating GDRCopy RPM packages."
|
||||||
|
echo
|
||||||
|
echo "Usage: CUDA=<path> $0 [-m]"
|
||||||
|
echo
|
||||||
|
echo "Optional arguments:"
|
||||||
|
echo " -m Generate kmod package (default: no)."
|
||||||
|
echo
|
||||||
|
echo "Environment variables:"
|
||||||
|
echo " CUDA=<path> [Required] CUDA installation path (usually /usr/local/cuda)."
|
||||||
|
echo " NVIDIA_SRC_DIR=<path> [Optional] NVIDIA driver source directory (usually /usr/src/nvidia-<version>/nvidia)."
|
||||||
|
}
|
||||||
|
|
||||||
|
OPTIND=1 # Reset in case getopts has been used previously in the shell.
|
||||||
|
|
||||||
|
generate_kmod=0
|
||||||
|
|
||||||
|
while getopts "h?m" opt; do
|
||||||
|
case "$opt" in
|
||||||
|
h|\?)
|
||||||
|
show_help
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
|
m) generate_kmod=1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
shift $((OPTIND-1))
|
||||||
|
|
||||||
|
NVCC=${CUDA}/bin/nvcc
|
||||||
|
CUDA_VERSION=`$NVCC --version | grep release | sed 's/^.*release \([0-9]\+\.[0-9]\+\).*/\1/'`
|
||||||
|
CUDA_MAJOR=`echo ${CUDA_VERSION} | cut -d "." -f 1`
|
||||||
|
CUDA_MINOR=`echo ${CUDA_VERSION} | cut -d "." -f 2`
|
||||||
|
|
||||||
|
if [ "X$CUDA" == "X" ]; then
|
||||||
|
echo "CUDA environment variable is not defined"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Building rpm package ..."
|
||||||
|
|
||||||
|
ex cd ${SCRIPT_DIR_PATH}
|
||||||
|
|
||||||
|
MODULE_SUBDIR=$(awk '/MODULE_SUBDIR \?=/ { print $3 }' ${TOP_DIR_PATH}/src/gdrdrv/Makefile | tr -d '\n')
|
||||||
|
|
||||||
|
MAJOR_VERSION=$(awk '/#define GDR_API_MAJOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
|
||||||
|
MINOR_VERSION=$(awk '/#define GDR_API_MINOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
|
||||||
|
VERSION="${MAJOR_VERSION}.${MINOR_VERSION}.4"
|
||||||
|
if [ "X$VERSION" == "X" ]; then
|
||||||
|
echo "Failed to get version numbers!" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
FULL_VERSION="${VERSION}"
|
||||||
|
|
||||||
|
if [[ ${generate_kmod} == 1 ]]; then
|
||||||
|
if [ -z "${NVIDIA_SRC_DIR}" ]; then
|
||||||
|
NVIDIA_SRC_DIR=$(find /usr/src/kernel-modules/nvidia-* /usr/src/nvidia-* -name "nv-p2p.c" -print -quit 2>/dev/null)
|
||||||
|
if [ ${#NVIDIA_SRC_DIR} -gt 0 ]; then
|
||||||
|
NVIDIA_SRC_DIR=$(dirname ${NVIDIA_SRC_DIR})
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -d ${NVIDIA_SRC_DIR} ]; then
|
||||||
|
NVIDIA_DRIVER_VERSION=$(basename $(dirname ${NVIDIA_SRC_DIR}))
|
||||||
|
else
|
||||||
|
echo "NVIDIA_SRC_DIR=${NVIDIA_SRC_DIR}" >&2
|
||||||
|
echo "Failed to find NVIDIA driver!" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
tmpdir=`mktemp -d /tmp/gdr.XXXXXX`
|
||||||
|
if [ ! -d "$tmpdir" ]; then
|
||||||
|
echo "Failed to create a temp directory!" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Building gdrcopy rpm packages version ${VERSION} ..."
|
||||||
|
|
||||||
|
echo "Working in $tmpdir ..."
|
||||||
|
|
||||||
|
ex cd ${TOP_DIR_PATH}
|
||||||
|
|
||||||
|
ex mkdir -p $tmpdir/gdrcopy
|
||||||
|
ex rm -rf $tmpdir/gdrcopy/*
|
||||||
|
ex cp -r packages/dkms.conf packages/rhel/init.d packages/rhel/gdrcopy.service scripts/ insmod.sh Makefile README.md include src tests config_arch LICENSE packages/gdrcopy.spec $tmpdir/gdrcopy/
|
||||||
|
ex rm -f $tmpdir/gdrcopy-$VERSION.tar.gz
|
||||||
|
|
||||||
|
ex cd $tmpdir/gdrcopy
|
||||||
|
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
|
||||||
|
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
|
||||||
|
ex find . -type f -exec sed -i "s/@MODULE_LOCATION@/${MODULE_SUBDIR//\//\\/}/g" {} +
|
||||||
|
|
||||||
|
ex cd $tmpdir
|
||||||
|
ex mv gdrcopy gdrcopy-$VERSION
|
||||||
|
ex tar czvf gdrcopy-$VERSION.tar.gz gdrcopy-$VERSION
|
||||||
|
|
||||||
|
ex mkdir -p $tmpdir/topdir/{SRPMS,RPMS,SPECS,BUILD,SOURCES}
|
||||||
|
ex cp gdrcopy-$VERSION/gdrcopy.spec $tmpdir/topdir/SPECS/
|
||||||
|
ex cp gdrcopy-$VERSION.tar.gz $tmpdir/topdir/SOURCES/
|
||||||
|
|
||||||
|
rpmbuild_params="-ba --nodeps --define '_build_id_links none' --define \"_topdir $tmpdir/topdir\" --define \"_release ${RPM_VERSION}\" --define 'dist %{nil}' --define \"CUDA $CUDA\" --define \"GDR_VERSION ${VERSION}\" --define \"KVERSION $(uname -r)\" --define \"MODULE_LOCATION ${MODULE_SUBDIR}\""
|
||||||
|
if [[ ${generate_kmod} == 1 ]]; then
|
||||||
|
rpmbuild_params="${rpmbuild_params} --define \"NVIDIA_DRIVER_VERSION ${NVIDIA_DRIVER_VERSION}\" --define \"NVIDIA_SRC_DIR ${NVIDIA_SRC_DIR}\" --define \"BUILD_KMOD 1\""
|
||||||
|
fi
|
||||||
|
rpmbuild_params="${rpmbuild_params} $tmpdir/topdir/SPECS/gdrcopy.spec"
|
||||||
|
eval "rpmbuild ${rpmbuild_params}"
|
||||||
|
|
||||||
|
rpms=`ls -1 $tmpdir/topdir/RPMS/*/*.rpm`
|
||||||
|
srpm=`ls -1 $tmpdir/topdir/SRPMS/`
|
||||||
|
if [ -f "/etc/redhat-release" ]; then
|
||||||
|
release_version=".el$(cat /etc/redhat-release | grep -o -E '[0-9]+' | head -1)"
|
||||||
|
elif [ -f "/etc/centos-release" ]; then
|
||||||
|
release_version=".el$(cat /etc/centos-release | grep -o -E '[0-9]+' | head -1)"
|
||||||
|
elif [ -f "/etc/os-release" ]; then
|
||||||
|
release_version=$(source /etc/os-release && echo ".$ID-$VERSION_ID")
|
||||||
|
else
|
||||||
|
release_version="unknown_distro"
|
||||||
|
fi
|
||||||
|
echo $srpm $rpms
|
||||||
|
ex cd ${CWD}
|
||||||
|
for item in `ls $tmpdir/topdir/SRPMS/*.rpm $tmpdir/topdir/RPMS/*/*.rpm`; do
|
||||||
|
item_name=`basename $item .rpm`
|
||||||
|
arch=$(sed -ne 's/.*\(\.[^\.]\+\)$/\1/p' <<< $item_name)
|
||||||
|
item_name=`basename $item_name $arch`
|
||||||
|
if [ "$item_name" == "gdrcopy-${FULL_VERSION}-${RPM_VERSION}.`uname -m`" ]; then
|
||||||
|
item_name="${item_name}${release_version}+cuda${CUDA_MAJOR}.${CUDA_MINOR}.${arch}.rpm"
|
||||||
|
else
|
||||||
|
item_name="${item_name}${release_version}${arch}.rpm"
|
||||||
|
fi
|
||||||
|
ex cp $item ./${item_name}
|
||||||
|
done
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Cleaning up ..."
|
||||||
|
|
||||||
|
ex rm -rf ${tmpdir}
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
libgdrapi (2.4.4) stable; urgency=low
|
||||||
|
|
||||||
|
* No change.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
libgdrapi (2.4.3) stable; urgency=low
|
||||||
|
|
||||||
|
* No change.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
libgdrapi (2.4.2) stable; urgency=low
|
||||||
|
|
||||||
|
* Fix memory leak in gdr_pin_buffer.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
libgdrapi (2.4.1) stable; urgency=low
|
||||||
|
|
||||||
|
* No change
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
libgdrapi (2.4) stable; urgency=low
|
||||||
|
|
||||||
|
* Introduce gdr_get_info_v2.
|
||||||
|
* Introduce new copy algorithm for device mappings.
|
||||||
|
* Add support for NVIDIA BLUEFIELD-3.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
libgdrapi (2.3.1) stable; urgency=low
|
||||||
|
|
||||||
|
* No change
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
libgdrapi (2.3) stable; urgency=low
|
||||||
|
|
||||||
|
* Initial version of libgdrapi package -- was a part of gdrcopy package.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700
|
||||||
|
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
9
|
||||||
|
|
@ -0,0 +1,19 @@
|
||||||
|
Source: libgdrapi
|
||||||
|
Priority: optional
|
||||||
|
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
|
||||||
|
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
|
||||||
|
Build-Depends: debhelper (>= 9)
|
||||||
|
Standards-Version: @FULL_VERSION@
|
||||||
|
Section: libs
|
||||||
|
Homepage: https://github.com/NVIDIA/gdrcopy
|
||||||
|
#Vcs-Git: https://anonscm.debian.org/git/collab-maint/gdrcopy.git
|
||||||
|
#Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/gdrcopy.git
|
||||||
|
|
||||||
|
Package: libgdrapi
|
||||||
|
Architecture: any
|
||||||
|
Multi-Arch: same
|
||||||
|
Depends: ${shlibs:Depends}, ${misc:Depends}
|
||||||
|
Replaces: gdrcopy (<= 2.2-1)
|
||||||
|
Conflicts: gdrcopy (<= 2.2-1)
|
||||||
|
Description: A low-latency GPU memory copy library
|
||||||
|
A low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA technology.
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||||
|
Upstream-Name: gdrcopy
|
||||||
|
Source: https://github.com/NVIDIA/gdrcopy
|
||||||
|
|
||||||
|
Files: *
|
||||||
|
Copyright: 2019-2021 NVIDIA CORPORATION. All rights reserved.
|
||||||
|
License: MIT
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
copy of this software and associated documentation files (the "Software"),
|
||||||
|
to deal in the Software without restriction, including without limitation
|
||||||
|
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
Software is furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
README.Debian
|
||||||
|
README.source
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
#!/usr/bin/make -f
|
||||||
|
# See debhelper(7) (uncomment to enable)
|
||||||
|
# output every command that modifies files on the build system.
|
||||||
|
#export DH_VERBOSE = 1
|
||||||
|
|
||||||
|
|
||||||
|
# see FEATURE AREAS in dpkg-buildflags(1)
|
||||||
|
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
|
||||||
|
|
||||||
|
# see ENVIRONMENT in dpkg-buildflags(1)
|
||||||
|
# package maintainers to append CFLAGS
|
||||||
|
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
|
||||||
|
# package maintainers to append LDFLAGS
|
||||||
|
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
|
||||||
|
|
||||||
|
|
||||||
|
%:
|
||||||
|
dh $@
|
||||||
|
|
||||||
|
|
||||||
|
# dh_make generated override targets
|
||||||
|
# This is example for Cmake (See https://bugs.debian.org/641051 )
|
||||||
|
override_dh_auto_build:
|
||||||
|
dh_auto_build -- lib
|
||||||
|
|
||||||
|
override_dh_auto_install:
|
||||||
|
$(MAKE) DESTDIR=$(CURDIR)/debian/libgdrapi prefix=/usr libdir=/usr/lib/$(DEB_HOST_MULTIARCH) lib_install
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
3.0 (quilt)
|
||||||
|
|
@ -0,0 +1,105 @@
|
||||||
|
gdrcopy (2.4.4) stable; urgency=low
|
||||||
|
|
||||||
|
* Fix the use-after-free bug of mr objects in gdrdv_vma_close.
|
||||||
|
* Fix the resource leakage bug in gdrdrv_release.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy (2.4.3) stable; urgency=low
|
||||||
|
|
||||||
|
* Fix NVIDIA_IS_OPENSOURCE detection when compile with NVIDIA driver version 545 or newer.
|
||||||
|
* Fix compile error in gdrdrv when compile on RHEL9.5.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy (2.4.2) stable; urgency=low
|
||||||
|
|
||||||
|
* Fix the size alignment bug in gdrdrv.
|
||||||
|
* Fix memory leak in gdr_pin_buffer.
|
||||||
|
* Add support for another flavor of BF3.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy (2.4.1) stable; urgency=low
|
||||||
|
|
||||||
|
* Add support for persistent mapping.
|
||||||
|
* Fix bug in src/gdrdrv/Makefile.
|
||||||
|
* Fix compile-time bug when check.h is not found.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy (2.4) stable; urgency=low
|
||||||
|
|
||||||
|
* Various bug fixes in the test and benchmark applications.
|
||||||
|
* Prefix all applications with "gdrcopy_".
|
||||||
|
* Introduce more unit tests in gdrcopy_sanity.
|
||||||
|
* Introduce gdrcopy_pplat benchmark application.
|
||||||
|
* Remove dependency on libcheck and libsubunit
|
||||||
|
* Introduce gdr_get_info_v2.
|
||||||
|
* Introduce new copy algorithm for device mappings.
|
||||||
|
* Add support for NVIDIA BLUEFIELD-3.
|
||||||
|
* Add support for Linux kernel >= 6.3.
|
||||||
|
* Relicense gdrdrv to Dual MIT/GPL.
|
||||||
|
* Fix bugs in gdrdrv when pinning two small buffers back-to-back.
|
||||||
|
* Add support for coherent platforms such as Grace-Hopper.
|
||||||
|
* Add support for Confidential Computing (CC).
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy (2.3.1) stable; urgency=low
|
||||||
|
|
||||||
|
* Add a workaround for the GPL-compatibility issue when compile with CONFIG_ARCH_HAS_CC_PLATFORM on Linux kernel 5.18+.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy (2.3) stable; urgency=low
|
||||||
|
|
||||||
|
* Convert to meta package.
|
||||||
|
* Declare dependency with gdrdrv-dkms, libgdrapi, and gdrcopy-tests.
|
||||||
|
* Update the package maintainer.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy (2.2) stable; urgency=low
|
||||||
|
|
||||||
|
* Add support for ARM64.
|
||||||
|
* Update various information on README.
|
||||||
|
* Improve Makefile.
|
||||||
|
* Add multi-arch support.
|
||||||
|
* Handle removal of HAVE_UNLOCKED_IOCTL in Linux kernel v5.9 and later.
|
||||||
|
* Prevent dpkg package creation to unnecessarily compile gdrdrv.
|
||||||
|
* Improve gdr_open error message.
|
||||||
|
* Fix bug that prevents sanity from correctly summarizing failure.
|
||||||
|
* Add dkms support in kmod package.
|
||||||
|
* Handle the removal of kzfree in Linux kernel v5.10 and later.
|
||||||
|
* Improve small-size copy-to-mapping.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 01 Feb 2021 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy (2.1) stable; urgency=low
|
||||||
|
|
||||||
|
* fix build problem on RHL8 kernels
|
||||||
|
* relax checks in gdrdrv to support multi-threading use cases
|
||||||
|
* fix fd leak in gdr_open()
|
||||||
|
* Introduce copylat test application.
|
||||||
|
* Introduce basic_with_tokens and invalidation_fork_child_gdr_pin_parent_with_tokens sub-tests in sanity.
|
||||||
|
* Remove the dependency with libcudart.so.
|
||||||
|
* Clean up the code in the tests folder.
|
||||||
|
* Change the package maintainer to Davide Rossetti.
|
||||||
|
|
||||||
|
-- Davide Rossetti <drossetti@nvidia.com> Mon, 02 Mar 2020 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy (2.0) stable; urgency=low
|
||||||
|
|
||||||
|
* Improve copy performance with unrolling in POWERPC.
|
||||||
|
* Create sanity unit test for testing the functionality and security.
|
||||||
|
* Consolidate basic and validate into sanity unit test.
|
||||||
|
* Introduce compile time and runtime version checking in libgdrapi.
|
||||||
|
* Improve rpm packaging.
|
||||||
|
* Introduce deb packaging for the userspace library and the applications.
|
||||||
|
* Introduce dkms packaging for the gdrdrv driver.
|
||||||
|
* Rename gdr_copy_from/to_bar to gdr_copy_from/to_mapping.
|
||||||
|
* Update README
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Sep 2019 11:59:59 -0700
|
||||||
|
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
9
|
||||||
|
|
@ -0,0 +1,17 @@
|
||||||
|
Source: gdrcopy
|
||||||
|
Priority: optional
|
||||||
|
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
|
||||||
|
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
|
||||||
|
Build-Depends: debhelper (>= 9)
|
||||||
|
Standards-Version: @FULL_VERSION@
|
||||||
|
Section: misc
|
||||||
|
Homepage: https://github.com/NVIDIA/gdrcopy
|
||||||
|
|
||||||
|
Package: gdrcopy
|
||||||
|
Architecture: any
|
||||||
|
Multi-Arch: same
|
||||||
|
Depends: gdrdrv-dkms (= @FULL_VERSION@), libgdrapi (= @FULL_VERSION@), gdrcopy-tests (= @FULL_VERSION@)
|
||||||
|
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
|
||||||
|
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
|
||||||
|
Description: GDRCopy meta-package
|
||||||
|
Meta-package for GDRCopy, a low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA technology.
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||||
|
Upstream-Name: gdrcopy
|
||||||
|
Source: https://github.com/NVIDIA/gdrcopy
|
||||||
|
|
||||||
|
Files: *
|
||||||
|
Copyright: 2019-2021 NVIDIA CORPORATION. All rights reserved.
|
||||||
|
License: MIT
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
copy of this software and associated documentation files (the "Software"),
|
||||||
|
to deal in the Software without restriction, including without limitation
|
||||||
|
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
Software is furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
#!/usr/bin/make -f
|
||||||
|
# See debhelper(7) (uncomment to enable)
|
||||||
|
# output every command that modifies files on the build system.
|
||||||
|
#export DH_VERBOSE = 1
|
||||||
|
|
||||||
|
|
||||||
|
# see FEATURE AREAS in dpkg-buildflags(1)
|
||||||
|
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
|
||||||
|
|
||||||
|
# see ENVIRONMENT in dpkg-buildflags(1)
|
||||||
|
# package maintainers to append CFLAGS
|
||||||
|
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
|
||||||
|
# package maintainers to append LDFLAGS
|
||||||
|
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
|
||||||
|
|
||||||
|
build build-arch build-indep:
|
||||||
|
|
||||||
|
clean:
|
||||||
|
dh_testdir
|
||||||
|
dh_clean
|
||||||
|
|
||||||
|
install: build
|
||||||
|
dh_testdir
|
||||||
|
dh_testroot
|
||||||
|
dh_prep
|
||||||
|
|
||||||
|
binary-arch: install
|
||||||
|
|
||||||
|
binary-indep: install
|
||||||
|
dh_testdir
|
||||||
|
dh_testroot
|
||||||
|
dh_install
|
||||||
|
dh_installdocs
|
||||||
|
dh_installchangelogs
|
||||||
|
dh_compress
|
||||||
|
dh_fixperms
|
||||||
|
dh_installdeb
|
||||||
|
dh_gencontrol
|
||||||
|
dh_md5sums
|
||||||
|
dh_builddeb
|
||||||
|
|
||||||
|
binary: binary-indep binary-arch
|
||||||
|
.PHONY: build clean binary-indep binary-arch binary install
|
||||||
|
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
3.0 (quilt)
|
||||||
|
|
@ -0,0 +1,47 @@
|
||||||
|
gdrcopy-tests (2.4.4) stable; urgency=low
|
||||||
|
|
||||||
|
* No change.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy-tests (2.4.3) stable; urgency=low
|
||||||
|
|
||||||
|
* No change.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy-tests (2.4.2) stable; urgency=low
|
||||||
|
|
||||||
|
* No change.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy-tests (2.4.1) stable; urgency=low
|
||||||
|
|
||||||
|
* Fix compile-time bug when check.h is not found.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy-tests (2.4) stable; urgency=low
|
||||||
|
|
||||||
|
* Various bug fixes in the test and benchmark applications.
|
||||||
|
* Prefix all applications with "gdrcopy_".
|
||||||
|
* Introduce more unit tests in gdrcopy_sanity.
|
||||||
|
* Introduce gdrcopy_pplat benchmark application.
|
||||||
|
* Remove dependency on libcheck and libsubunit
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy-tests (2.3.1) stable; urgency=low
|
||||||
|
|
||||||
|
* No change
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrcopy-tests (2.3) stable; urgency=low
|
||||||
|
|
||||||
|
* Initial version of gdrcopy-tests package -- was a part of gdrcopy package.
|
||||||
|
* Add apiperf test.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700
|
||||||
|
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
9
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
Source: gdrcopy-tests
|
||||||
|
Priority: optional
|
||||||
|
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
|
||||||
|
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
|
||||||
|
Build-Depends: debhelper (>= 9)
|
||||||
|
Standards-Version: @FULL_VERSION@
|
||||||
|
Section: utils
|
||||||
|
Homepage: https://github.com/NVIDIA/gdrcopy
|
||||||
|
#Vcs-Git: https://anonscm.debian.org/git/collab-maint/gdrcopy.git
|
||||||
|
#Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/gdrcopy.git
|
||||||
|
|
||||||
|
Package: gdrcopy-tests
|
||||||
|
Architecture: any
|
||||||
|
Multi-Arch: same
|
||||||
|
Depends: libgdrapi (>= @FULL_VERSION@), ${shlibs:Depends}, ${misc:Depends}
|
||||||
|
Replaces: gdrcopy (<= 2.2-1)
|
||||||
|
Conflicts: gdrcopy (<= 2.2-1)
|
||||||
|
Description: Test utilities for GDRCopy
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||||
|
Upstream-Name: gdrcopy
|
||||||
|
Source: https://github.com/NVIDIA/gdrcopy
|
||||||
|
|
||||||
|
Files: *
|
||||||
|
Copyright: 2019-2021 NVIDIA CORPORATION. All rights reserved.
|
||||||
|
License: MIT
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
copy of this software and associated documentation files (the "Software"),
|
||||||
|
to deal in the Software without restriction, including without limitation
|
||||||
|
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
Software is furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
README.Debian
|
||||||
|
README.source
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
|
||||||
|
|
@ -0,0 +1,30 @@
|
||||||
|
#!/usr/bin/make -f
|
||||||
|
# See debhelper(7) (uncomment to enable)
|
||||||
|
# output every command that modifies files on the build system.
|
||||||
|
#export DH_VERBOSE = 1
|
||||||
|
|
||||||
|
|
||||||
|
# see FEATURE AREAS in dpkg-buildflags(1)
|
||||||
|
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
|
||||||
|
|
||||||
|
# see ENVIRONMENT in dpkg-buildflags(1)
|
||||||
|
# package maintainers to append CFLAGS
|
||||||
|
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
|
||||||
|
# package maintainers to append LDFLAGS
|
||||||
|
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
|
||||||
|
|
||||||
|
|
||||||
|
%:
|
||||||
|
dh $@
|
||||||
|
|
||||||
|
|
||||||
|
# dh_make generated override targets
|
||||||
|
# This is example for Cmake (See https://bugs.debian.org/641051 )
|
||||||
|
override_dh_auto_build:
|
||||||
|
dh_auto_build -- CUDA=$(CUDA) lib exes
|
||||||
|
|
||||||
|
override_dh_shlibdeps:
|
||||||
|
dh_shlibdeps -Xgdrcopy_apiperf -Xgdrcopy_copybw -Xgdrcopy_copylat -Xgdrcopy_sanity -Xgdrcopy_pplat
|
||||||
|
|
||||||
|
override_dh_auto_install:
|
||||||
|
$(MAKE) DESTDIR=$(CURDIR)/debian/gdrcopy-tests prefix=/usr exes_install
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
3.0 (quilt)
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
PACKAGE_NAME="gdrdrv"
|
||||||
|
PACKAGE_VERSION="@FULL_VERSION@"
|
||||||
|
BUILT_MODULE_NAME[0]="gdrdrv"
|
||||||
|
DEST_MODULE_LOCATION[0]="@MODULE_LOCATION@"
|
||||||
|
AUTOINSTALL="yes"
|
||||||
|
MAKE[0]="cd $dkms_tree/gdrdrv/@FULL_VERSION@/build && make CONF_SCRIPT_DIR=scripts KVER=$kernelver"
|
||||||
|
|
@ -0,0 +1,33 @@
|
||||||
|
#/usr/bin/make
|
||||||
|
SRC = $(DESTDIR)/usr/src
|
||||||
|
SHARE = $(DESTDIR)/usr/share/$(NAME)-dkms
|
||||||
|
|
||||||
|
all:
|
||||||
|
|
||||||
|
clean:
|
||||||
|
|
||||||
|
install:
|
||||||
|
|
||||||
|
#source tree
|
||||||
|
ifeq ("$(wildcard $(NAME)-$(VERSION))", "$(NAME)-$(VERSION)")
|
||||||
|
install -d "$(SRC)"
|
||||||
|
cp -a $(NAME)-$(VERSION) $(SRC)
|
||||||
|
|
||||||
|
# sets 0755 for dirs, 0644 for files
|
||||||
|
chmod a-wx+rX,u+w -R "$(SRC)/$(NAME)-$(VERSION)"
|
||||||
|
|
||||||
|
# set u+x for all files under the scripts folder
|
||||||
|
chmod u+x -R "$(SRC)/$(NAME)-$(VERSION)/scripts"
|
||||||
|
endif
|
||||||
|
|
||||||
|
#tarball, possibly with binaries
|
||||||
|
ifeq ("$(wildcard $(NAME)-$(VERSION).dkms.tar.gz)", "$(NAME)-$(VERSION).dkms.tar.gz")
|
||||||
|
install -d "$(SHARE)"
|
||||||
|
install -m 644 $(NAME)-$(VERSION).dkms.tar.gz "$(SHARE)"
|
||||||
|
endif
|
||||||
|
|
||||||
|
#postinst, only if we are supporting legacy mode
|
||||||
|
ifeq ("$(wildcard common.postinst)", "common.postinst")
|
||||||
|
install -d "$(SHARE)"
|
||||||
|
install -m 755 $(PREFIX)/usr/lib/dkms/common.postinst $(SHARE)/postinst
|
||||||
|
endif
|
||||||
|
|
@ -0,0 +1,293 @@
|
||||||
|
#!/bin/sh
|
||||||
|
# Copyright (C) 2002-2005 Flavio Stanchina
|
||||||
|
# Copyright (C) 2005-2006 Aric Cyr
|
||||||
|
# Copyright (C) 2007 Mario Limonciello
|
||||||
|
# Copyright (C) 2009 Alberto Milone
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
. /usr/share/debconf/confmodule
|
||||||
|
|
||||||
|
uname_s=$(uname -s)
|
||||||
|
|
||||||
|
_get_kernel_dir() {
|
||||||
|
KVER=$1
|
||||||
|
case ${uname_s} in
|
||||||
|
Linux) DIR="/lib/modules/$KVER/build" ;;
|
||||||
|
GNU/kFreeBSD) DIR="/usr/src/kfreebsd-headers-$KVER/sys" ;;
|
||||||
|
esac
|
||||||
|
echo $DIR
|
||||||
|
}
|
||||||
|
|
||||||
|
_check_kernel_dir() {
|
||||||
|
DIR=$(_get_kernel_dir $1)
|
||||||
|
case ${uname_s} in
|
||||||
|
Linux) test -e $DIR/include ;;
|
||||||
|
GNU/kFreeBSD) test -e $DIR/kern && test -e $DIR/conf/kmod.mk ;;
|
||||||
|
*) return 1 ;;
|
||||||
|
esac
|
||||||
|
return $?
|
||||||
|
}
|
||||||
|
|
||||||
|
# Check the existence of a kernel named as $1
|
||||||
|
_is_kernel_name_correct() {
|
||||||
|
CORRECT="no"
|
||||||
|
KERNEL_NAME=$1
|
||||||
|
|
||||||
|
for kernel in /boot/config-*; do
|
||||||
|
KERNEL=${kernel#*-}
|
||||||
|
if [ "${KERNEL}" = "${KERNEL_NAME}" ]; then
|
||||||
|
CORRECT="yes"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo $CORRECT
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Get the most recent kernel on Debian based systems. This keeps
|
||||||
|
# into account both the version and the ABI. If the current kernel
|
||||||
|
# is the most recent kernel then the function will print a null string.
|
||||||
|
_get_newest_kernel_debian() {
|
||||||
|
NEWEST_KERNEL=
|
||||||
|
NEWEST_VERSION=
|
||||||
|
NEWEST_ABI=
|
||||||
|
|
||||||
|
for kernel in /boot/config-*; do
|
||||||
|
[ -f "$kernel" ] || continue
|
||||||
|
KERNEL=${kernel#*-}
|
||||||
|
KERNEL_VERSION=${KERNEL%%-*}
|
||||||
|
ABI=${KERNEL#*-}
|
||||||
|
ABI=${ABI%%-*}
|
||||||
|
|
||||||
|
if [ -z "$NEWEST_KERNEL" ]; then
|
||||||
|
# The 1st time get a version which is bigger than $1
|
||||||
|
COMPARE_TO=$1
|
||||||
|
else
|
||||||
|
# Get the biggest version
|
||||||
|
COMPARE_TO="$NEWEST_VERSION-$NEWEST_ABI"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# if $kernel is greater than $COMPARE_TO
|
||||||
|
if [ `dpkg --compare-versions "$KERNEL_VERSION-$ABI" gt "$COMPARE_TO" && echo "yes" || \
|
||||||
|
echo "no"` = "yes" ]; then
|
||||||
|
NEWEST_KERNEL=$KERNEL
|
||||||
|
NEWEST_VERSION=$KERNEL_VERSION
|
||||||
|
NEWEST_ABI=$ABI
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "$NEWEST_KERNEL"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Get the most recent kernel in Rhel based systems. If the current kernel
|
||||||
|
# is the most recent kernel then the function will print a null string.
|
||||||
|
_get_newest_kernel_rhel() {
|
||||||
|
NEWEST_KERNEL=
|
||||||
|
|
||||||
|
LAST_INSTALLED_KERNEL=$(rpm -q --whatprovides kernel --last | grep kernel -m1 | cut -f1 -d' ')
|
||||||
|
|
||||||
|
LIK_FORMATTED_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{VERSION}-%{RELEASE}.%{ARCH}\n")
|
||||||
|
|
||||||
|
if [ `echo $LIK_FORMATTED_NAME | grep 2.6 >/dev/null` ]; then
|
||||||
|
# Fedora and Suse
|
||||||
|
NEWEST_KERNEL=$LIK_FORMATTED_NAME
|
||||||
|
else
|
||||||
|
# Hack for Mandriva where $LIK_FORMATTED_NAME is broken
|
||||||
|
LIK_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{NAME}\n")
|
||||||
|
LIK_TYPE=${LIK_NAME#kernel-}
|
||||||
|
LIK_TYPE=${LIK_TYPE%%-*}
|
||||||
|
LIK_STRIPPED=${LIK_NAME#kernel-}
|
||||||
|
LIK_STRIPPED=${LIK_STRIPPED#$LIK_TYPE-}
|
||||||
|
LIK_STRIPPED_BASE=${LIK_STRIPPED%%-*}
|
||||||
|
LIK_STRIPPED_END=${LIK_STRIPPED#$LIK_STRIPPED_BASE-}
|
||||||
|
LIK_FINAL=$LIK_STRIPPED_BASE-$LIK_TYPE-$LIK_STRIPPED_END
|
||||||
|
|
||||||
|
NEWEST_KERNEL=$LIK_FINAL
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo $NEWEST_KERNEL
|
||||||
|
}
|
||||||
|
|
||||||
|
# Get the newest kernel on Debian and Rhel based systems.
|
||||||
|
get_newest_kernel() {
|
||||||
|
NEWEST_KERNEL=
|
||||||
|
# Try Debian first as rpm can be installed in Debian based distros
|
||||||
|
if [ -e /usr/bin/dpkg ]; then
|
||||||
|
# If DEB based
|
||||||
|
CURRENT_VERSION=${CURRENT_KERNEL%%-*}
|
||||||
|
CURRENT_ABI=${CURRENT_KERNEL#*-}
|
||||||
|
CURRENT_FLAVOUR=${CURRENT_ABI#*-}
|
||||||
|
CURRENT_ABI=${CURRENT_ABI%%-*}
|
||||||
|
NEWEST_KERNEL=$(_get_newest_kernel_debian "$CURRENT_VERSION-$CURRENT_ABI")
|
||||||
|
|
||||||
|
elif [ `which rpm >/dev/null` ]; then
|
||||||
|
# If RPM based
|
||||||
|
NEWEST_KERNEL=$(_get_newest_kernel_rhel)
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Make sure that kernel name that we extracted corresponds to an installed
|
||||||
|
# kernel
|
||||||
|
if [ -n "$NEWEST_KERNEL" ] && [ `_is_kernel_name_correct $NEWEST_KERNEL` = "no" ]; then
|
||||||
|
NEWEST_KERNEL=
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo $NEWEST_KERNEL
|
||||||
|
}
|
||||||
|
|
||||||
|
NAME=$1
|
||||||
|
VERSION=$2
|
||||||
|
TARBALL_ROOT=$3
|
||||||
|
ARCH=$4
|
||||||
|
UPGRADE=$5
|
||||||
|
|
||||||
|
if [ -z "$NAME" ] || [ -z "$VERSION" ]; then
|
||||||
|
echo "Need NAME, and VERSION defined"
|
||||||
|
echo "ARCH is optional"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# read framework configuration options
|
||||||
|
if [ -r /etc/dkms/framework.conf ]; then
|
||||||
|
. /etc/dkms/framework.conf
|
||||||
|
fi
|
||||||
|
|
||||||
|
KERNELS=$(ls /lib/modules/ 2>/dev/null || true)
|
||||||
|
CURRENT_KERNEL=$(uname -r)
|
||||||
|
|
||||||
|
#We never want to keep an older version side by side to prevent conflicts
|
||||||
|
if [ -e "/var/lib/dkms/$NAME/$VERSION" ]; then
|
||||||
|
echo "Removing old $NAME-$VERSION DKMS files..."
|
||||||
|
dkms remove -m $NAME -v $VERSION --all
|
||||||
|
fi
|
||||||
|
|
||||||
|
#Load new files, by source package and by tarball
|
||||||
|
if [ -f "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz" ]; then
|
||||||
|
if ! dkms ldtarball --archive "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz"; then
|
||||||
|
echo ""
|
||||||
|
echo ""
|
||||||
|
echo "Unable to load DKMS tarball $TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz."
|
||||||
|
echo "Common causes include: "
|
||||||
|
echo " - You must be using DKMS 2.1.0.0 or later to support binaries only"
|
||||||
|
echo " distribution specific archives."
|
||||||
|
echo " - Corrupt distribution specific archive"
|
||||||
|
echo ""
|
||||||
|
echo ""
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
elif [ -d "/usr/src/$NAME-$VERSION" ]; then
|
||||||
|
echo "Loading new $NAME-$VERSION DKMS files..."
|
||||||
|
dkms add -m $NAME -v $VERSION > /dev/null
|
||||||
|
fi
|
||||||
|
|
||||||
|
# On 1st installation, let us look for a directory
|
||||||
|
# in /lib/modules which matches `uname -r`. If none
|
||||||
|
# is found it is possible that buildd is being used
|
||||||
|
# and that uname -r is giving us the name of the
|
||||||
|
# kernel used by the buildd machine.
|
||||||
|
#
|
||||||
|
# If this is the case we try to build the kernel
|
||||||
|
# module for each kernel which has a directory in
|
||||||
|
# /lib/modules. Furthermore we will have to tell
|
||||||
|
# DKMS which architecture it should build the module
|
||||||
|
# for (e.g. if the buildd machine is using a
|
||||||
|
# 2.6.24-23-xen 64bit kernel).
|
||||||
|
#
|
||||||
|
# NOTE: if the headers are not installed then the
|
||||||
|
# module won't be built, as usual
|
||||||
|
|
||||||
|
# Here we look for the most recent kernel so that we can
|
||||||
|
# build the module for it (in addition to doing it for the
|
||||||
|
# current kernel.
|
||||||
|
NEWEST_KERNEL=$(get_newest_kernel)
|
||||||
|
|
||||||
|
if [ -z "$autoinstall_all_kernels" ]; then
|
||||||
|
# If the current kernel is installed on the system or chroot
|
||||||
|
if [ `_is_kernel_name_correct $CURRENT_KERNEL` = "yes" ]; then
|
||||||
|
if [ -n "$NEWEST_KERNEL" ] && [ ${CURRENT_KERNEL} != ${NEWEST_KERNEL} ]; then
|
||||||
|
KERNELS="$CURRENT_KERNEL $NEWEST_KERNEL"
|
||||||
|
else
|
||||||
|
KERNELS=$CURRENT_KERNEL
|
||||||
|
fi
|
||||||
|
# The current kernel is not useful as it's not installed
|
||||||
|
else
|
||||||
|
echo "It is likely that $CURRENT_KERNEL belongs to a chroot's host"
|
||||||
|
|
||||||
|
# Let's use only the newest kernel if this is not a first installation
|
||||||
|
# otherwise build for all kernels
|
||||||
|
if [ -n "$NEWEST_KERNEL" -a -n "$UPGRADE" ]; then
|
||||||
|
KERNELS="$NEWEST_KERNEL"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Take care of displaying newline separated list
|
||||||
|
echo "Building for $KERNELS" | tr '\n' ',' \
|
||||||
|
| sed -e 's/,/, /g; s/, $/\n/; s/, \([^,]\+\)$/ and \1/'
|
||||||
|
|
||||||
|
if [ -n "$ARCH" ]; then
|
||||||
|
if which lsb_release >/dev/null && [ $(lsb_release -s -i) = "Ubuntu" ]; then
|
||||||
|
case $ARCH in
|
||||||
|
amd64)
|
||||||
|
ARCH="x86_64"
|
||||||
|
;;
|
||||||
|
lpia|i?86)
|
||||||
|
ARCH="i686"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
echo "Building for architecture $ARCH"
|
||||||
|
ARCH="-a $ARCH"
|
||||||
|
fi
|
||||||
|
|
||||||
|
for KERNEL in $KERNELS; do
|
||||||
|
dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH`
|
||||||
|
if [ `echo $KERNEL | grep -c "BOOT"` -gt 0 ]; then
|
||||||
|
echo ""
|
||||||
|
echo "Module build and install for $KERNEL was skipped as "
|
||||||
|
echo "it is a BOOT variant"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
#if the module isn't yet built, try to build it
|
||||||
|
if [ `echo $dkms_status | grep -c ": built"` -eq 0 ]; then
|
||||||
|
if [ ! -L /var/lib/dkms/$NAME/$VERSION/source ]; then
|
||||||
|
echo "This package appears to be a binaries-only package"
|
||||||
|
echo " you will not be able to build against kernel $KERNEL"
|
||||||
|
echo " since the package source was not provided"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
if _check_kernel_dir $KERNEL; then
|
||||||
|
echo "Building initial module for $KERNEL"
|
||||||
|
set +e
|
||||||
|
dkms build -m $NAME -v $VERSION -k $KERNEL $ARCH > /dev/null
|
||||||
|
case $? in
|
||||||
|
9)
|
||||||
|
set -e
|
||||||
|
echo "Skipped."
|
||||||
|
continue
|
||||||
|
;;
|
||||||
|
0)
|
||||||
|
set -e
|
||||||
|
echo "Done."
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
exit $?
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH`
|
||||||
|
else
|
||||||
|
echo "Module build for kernel $KERNEL was skipped since the"
|
||||||
|
echo "kernel headers for this kernel does not seem to be installed."
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
#if the module is built (either pre-built or just now), install it
|
||||||
|
if [ `echo $dkms_status | grep -c ": built"` -eq 1 ] &&
|
||||||
|
[ `echo $dkms_status | grep -c ": installed"` -eq 0 ]; then
|
||||||
|
dkms install -m $NAME -v $VERSION -k $KERNEL $ARCH
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
gdrdrv DKMS module for Debian
|
||||||
|
|
||||||
|
This package was automatically generated by the DKMS system,
|
||||||
|
for distribution on Debian based operating systems.
|
||||||
|
|
||||||
|
|
@ -0,0 +1,72 @@
|
||||||
|
gdrdrv-dkms (2.4.4) stable; urgency=low
|
||||||
|
|
||||||
|
* Fix the use-after-free bug of mr objects in gdrdv_vma_close.
|
||||||
|
* Fix the resource leakage bug in gdrdrv_release.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrdrv-dkms (2.4.3) stable; urgency=low
|
||||||
|
|
||||||
|
* Fix NVIDIA_IS_OPENSOURCE detection when compile with NVIDIA driver version 545 or newer.
|
||||||
|
* Fix compile error in gdrdrv when compile on RHEL9.5.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrdrv-dkms (2.4.2) stable; urgency=low
|
||||||
|
|
||||||
|
* Fix the size alignment bug in gdrdrv.
|
||||||
|
* Add support for another flavor of BF3.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrdrv-dkms (2.4.1) stable; urgency=low
|
||||||
|
|
||||||
|
* Add support for persistent mapping.
|
||||||
|
* Fix bug in src/gdrdrv/Makefile.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrdrv-dkms (2.4) stable; urgency=low
|
||||||
|
|
||||||
|
* Add support for NVIDIA BLUEFIELD-3.
|
||||||
|
* Add support for Linux kernel >= 6.3.
|
||||||
|
* Relicense gdrdrv to Dual MIT/GPL.
|
||||||
|
* Fix bugs in gdrdrv when pinning two small buffers back-to-back.
|
||||||
|
* Add support for coherent platforms such as Grace-Hopper.
|
||||||
|
* Add support for Confidential Computing (CC).
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrdrv-dkms (2.3.1) stable; urgency=low
|
||||||
|
|
||||||
|
* Add a workaround for the GPL-compatibility issue when compile with CONFIG_ARCH_HAS_CC_PLATFORM on Linux kernel 5.18+.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrdrv-dkms (2.3) stable; urgency=low
|
||||||
|
|
||||||
|
* Change the package maintainer to GPUDirect Team.
|
||||||
|
* Add Davide Rossetti and Pak Makthub as Uploaders.
|
||||||
|
* Revamp gdrdrv to fix race-condition bugs.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrdrv-dkms (2.2) stable; urgency=low
|
||||||
|
|
||||||
|
* No change.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 01 Feb 2021 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrdrv-dkms (2.1) stable; urgency=low
|
||||||
|
|
||||||
|
* Change the package maintainer to Davide Rossetti.
|
||||||
|
|
||||||
|
-- Davide Rossetti <drossetti@nvidia.com> Mon, 02 Mar 2020 11:59:59 -0700
|
||||||
|
|
||||||
|
gdrdrv-dkms (2.0) stable; urgency=low
|
||||||
|
|
||||||
|
* Harden security in gdrdrv.
|
||||||
|
* Enable cached mappings in POWER9.
|
||||||
|
|
||||||
|
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Sep 2019 11:59:59 -0700
|
||||||
|
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
9
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
Source: gdrdrv-dkms
|
||||||
|
Section: misc
|
||||||
|
Priority: optional
|
||||||
|
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
|
||||||
|
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
|
||||||
|
Build-Depends: debhelper (>= 9), dkms
|
||||||
|
Standards-Version: @FULL_VERSION@
|
||||||
|
|
||||||
|
Package: gdrdrv-dkms
|
||||||
|
Architecture: any
|
||||||
|
Multi-Arch: same
|
||||||
|
Depends: dkms (>= 1.95), ${misc:Depends}
|
||||||
|
Description: gdrdrv driver in DKMS format.
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
|
||||||
|
This copyright has not been completed by the author of this package.
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
usr/src
|
||||||
|
|
@@ -0,0 +1,147 @@
#!/bin/bash
#
# Startup/shutdown script for GDRcopy driver
# chkconfig: 2345 20 80
# description: Startup/shutdown script for GDRcopy kernel-mode driver

### BEGIN INIT INFO
# Provides: gdrcopy
# Required-Start:
# Required-Stop:
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Description: GDRcopy kernel-mode driver init script
### END INIT INFO

# Source function library.
. /lib/lsb/init-functions


DRIVER=gdrdrv
RETVAL=0

is_module()
{
    local RC

    /sbin/lsmod | grep -w "$1" > /dev/null 2>&1
    RC=$?

    return $RC
}

log_msg()
{
    logger -i "$modname: $@"
}

function req_modules_loaded() {
    local RC
    local reqmods="nvidia"
    for mod in $reqmods; do
        if ! is_module $mod; then
            echo "module $mod is not loaded"
            RC=1
            break
        fi
    done
    return $RC
}

# Create /dev nodes for device
function createnodes() {
    local module=$1
    local RC
    local inode=/dev/$module

    major=`fgrep $module /proc/devices | cut -b 1-4`
    log_msg "$module: driver major is $major"

    [ -e $inode ] && rm -f $inode
    mknod -m 666 $inode c $major 0
    RC=$?

    return $RC
}

# Remove /dev nodes for device
function removenodes() {
    rm -f /dev/gdrdrv*
}

load_module()
{
    local RC
    local module=$1
    filename=`modinfo $module | grep filename | awk '{print $NF}'`

    if [ ! -n "$filename" ]; then
        echo "Module $module does not exist"
        log_msg "Error: Module $module does not exist"
        return 1
    fi

    echo -n $"Loading $DRIVER kernel module: "
    /sbin/modprobe $module && log_success_msg || log_failure_msg
    RC=$?

    return $RC
}

# Start daemon
function start() {

    echo -n $"Checking required modules: "
    req_modules_loaded && log_success_msg || log_failure_msg
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL

    if is_module $DRIVER ; then
        echo "module already loaded"
    else
        load_module $DRIVER
        RETVAL=$?
        echo
        [ "$RETVAL" = 0 ] || exit $RETVAL
    fi

    echo -n $"Initializing GDRcopy /dev entries: "
    createnodes $DRIVER && log_success_msg || log_failure_msg
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
}

# Stop daemon
function stop() {
    echo -n $"Unloading $DRIVER kernel module: "
    /sbin/rmmod $DRIVER && log_success_msg || log_failure_msg
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL

    echo -n $"Removing GDRcopy /dev entries: "
    removenodes $DRIVER && log_success_msg || log_failure_msg
    RETVAL=$?
    echo
    [ "$RETVAL" = 0 ] || exit $RETVAL
}

# See how we were called
case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        stop
        start
        ;;
    *)
        echo $"Usage: $0 {start|stop|restart}"
        RETVAL=1
esac
exit $RETVAL
@@ -0,0 +1,49 @@
#!/bin/sh
# Copyright (C) 2002-2005 Flavio Stanchina
# Copyright (C) 2005-2006 Aric Cyr
# Copyright (C) 2007 Mario Limonciello
# Copyright (C) 2009 Alberto Milone

set -e

NAME=gdrdrv
PACKAGE_NAME=$NAME-dkms
DEB_NAME=$(echo $PACKAGE_NAME | sed 's,_,-,')
CVERSION=`dpkg-query -W -f='${Version}' $DEB_NAME | awk -F "-" '{print $1}' | cut -d\: -f2`
ARCH=`dpkg --print-architecture`

dkms_configure () {
    for POSTINST in /usr/lib/dkms/common.postinst "/usr/share/$PACKAGE_NAME/postinst"; do
        if [ -f "$POSTINST" ]; then
            "$POSTINST" "$NAME" "$CVERSION" "/usr/share/$PACKAGE_NAME" "$ARCH" "$2"
            return $?
        fi
        echo "WARNING: $POSTINST does not exist." >&2
    done
    echo "ERROR: DKMS version is too old and $PACKAGE_NAME was not" >&2
    echo "built with legacy DKMS support." >&2
    echo "You must either rebuild $PACKAGE_NAME with legacy postinst" >&2
    echo "support or upgrade DKMS to a more current version." >&2
    return 1
}

case "$1" in
    configure)
        dkms_configure
        ;;

    abort-upgrade|abort-remove|abort-deconfigure)
        ;;

    *)
        echo "postinst called with unknown argument \`$1'" >&2
        exit 1
        ;;
esac

# dh_installdeb will replace this with shell code automatically
# generated by other debhelper scripts.

#DEBHELPER#

exit 0
@@ -0,0 +1,28 @@
#!/bin/sh

NAME=gdrdrv
VERSION=@VERSION@

set -e

case "$1" in
    remove|upgrade|deconfigure)
        if [ "`dkms status -m $NAME`" ]; then
            dkms remove -m $NAME -v $VERSION --all
        fi
        ;;

    failed-upgrade)
        ;;

    *)
        echo "prerm called with unknown argument \`$1'" >&2
        exit 1
        ;;
esac

#DEBHELPER#

exit 0
@@ -0,0 +1,55 @@
#!/usr/bin/make -f
# -*- makefile -*-

# Uncomment this to turn on verbose mode.
#export DH_VERBOSE=1

DEB_NAME=gdrdrv
NAME=gdrdrv
VERSION=@VERSION@

configure: configure-stamp
configure-stamp:
	dh_testdir
	touch configure-stamp


build: build-stamp

build-stamp: configure-stamp
	dh_testdir
	$(MAKE)
	touch $@

clean:
	dh_testdir
	dh_testroot
	rm -f build-stamp configure-stamp
	-$(MAKE) clean
	dh_clean

install: build
	dh_testdir
	dh_testroot
	dh_prep
	dh_installdirs
	$(MAKE) DESTDIR=$(CURDIR)/debian/$(DEB_NAME)-dkms NAME=$(NAME) VERSION=$(VERSION) install
	dh_installinit --name $(DEB_NAME)

binary-arch: build install

binary-indep: build install
	dh_testdir
	dh_testroot
	dh_link
	dh_strip
	dh_compress
	dh_fixperms
	dh_installdeb
	dh_shlibdeps
	dh_gencontrol
	dh_md5sums
	dh_builddeb

binary: binary-indep binary-arch
.PHONY: build clean binary-indep binary-arch binary install configure

@@ -0,0 +1 @@
3.0 (quilt)
@@ -0,0 +1,20 @@
### Commented entries have reasonable defaults.
### Uncomment to edit them.
# Source: <source package name; defaults to package name>
Section: misc
Priority: optional
Homepage: https://github.com/NVIDIA/gdrcopy
Standards-Version: @FULL_VERSION@

Package: gdrcopy
Version: @FULL_VERSION@
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Depends: gdrdrv-dkms (= @FULL_VERSION@), libgdrapi (= @FULL_VERSION@), gdrcopy-tests (= @FULL_VERSION@)
Architecture: any
Multi-Arch: same
Copyright: MIT
Changelog: changelog
Readme: README.md
Description: GDRCopy meta-package
 Meta-package for GDRCopy, a low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA technology.
@@ -0,0 +1,14 @@
[Unit]
Description=GDRCopy service
After=multi-user.target

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/bash /usr/libexec/gdrcopy/gdrcopy start
ExecReload=/bin/bash /usr/libexec/gdrcopy/gdrcopy restart
ExecStop=/bin/bash /usr/libexec/gdrcopy/gdrcopy stop

[Install]
WantedBy=multi-user.target
@@ -0,0 +1,81 @@
#!/bin/sh

show_help()
{
    echo "Usage: ${0} [-hk]"
    echo
    echo " -h Show this help text."
    echo " -k <kver> Specify the kernel version."
    echo
}

set_kver=0
kver=""

OPTIND=1 # Reset in case getopts has been used previously in the shell.
while getopts "hk:" opt ; do
    case "${opt}" in
        h)
            show_help
            exit 0
            ;;
        k)
            set_kver=1
            kver="${OPTARG}"
            ;;
        ?)
            show_help
            exit 0
            ;;
    esac
done

if [ ${set_kver} -eq 0 ]; then
    kver="$(uname -r)"
fi

kdir="/lib/modules/${kver}/build"

tmpfolder=$(mktemp --tmpdir -d gdrcopy.XXXXXXXXX)

testfile="${tmpfolder}/test-dummy.c"
makefile="${tmpfolder}/Makefile"

cat >${testfile} <<EOF
#include <linux/module.h>
#include <linux/mm.h>
static int __init test_dummy_init(void)
{
    struct vm_area_struct vma;
    vm_flags_set(&vma, 0);
    return 0;
}

static void __exit test_dummy_fini(void)
{
}

MODULE_AUTHOR("gpudirect@nvidia.com");
MODULE_LICENSE("MIT");
MODULE_VERSION("1.0");

module_init(test_dummy_init);
module_exit(test_dummy_fini);
EOF

cat >${makefile} <<EOF
obj-m := test-dummy.o
EOF

cd ${tmpfolder}
make -C ${kdir} M=${tmpfolder} modules > /dev/null 2>&1
ret=$?

rm -rf ${tmpfolder}

if [ "${ret}" -eq 0 ]; then
    echo "y"
else
    echo "n"
fi
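The script above only probes whether the running kernel builds a module that calls vm_flags_set(); its y/n answer is later exported as HAVE_VM_FLAGS_SET and turned into -DGDRDRV_HAVE_VM_FLAGS_SET by the gdrdrv Makefile further below. The driver source itself (gdrdrv.c) is suppressed from this diff, so the following is only a hedged sketch of the kind of compatibility shim such a define typically guards; the helper name gdrdrv_set_vm_flags and the chosen flags are illustrative assumptions, not the driver's actual code.

/* Hypothetical compatibility shim, NOT taken from gdrdrv.c (not visible in
 * this diff). It only illustrates what -DGDRDRV_HAVE_VM_FLAGS_SET is for:
 * recent kernels make vma->vm_flags effectively read-only and provide the
 * vm_flags_set() accessor instead of direct assignment. */
#include <linux/mm.h>

static inline void gdrdrv_set_vm_flags(struct vm_area_struct *vma, vm_flags_t flags)
{
#ifdef GDRDRV_HAVE_VM_FLAGS_SET
    vm_flags_set(vma, flags);   /* newer kernels: accessor API detected by the script above */
#else
    vma->vm_flags |= flags;     /* older kernels: direct field access */
#endif
}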
@@ -0,0 +1,79 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

LIB_MAJOR_VER ?= $(shell awk '/\#define GDR_API_MAJOR_VERSION/ { print $$3 }' ../include/gdrapi.h | tr -d '\n')
LIB_MINOR_VER ?= $(shell awk '/\#define GDR_API_MINOR_VERSION/ { print $$3 }' ../include/gdrapi.h | tr -d '\n')

GDRAPI_ARCH ?= $(shell ../config_arch)
GDRAPI_INC := ../include

CPPFLAGS := -I $(GDRAPI_INC) -I gdrdrv/ -D GDRAPI_ARCH=$(GDRAPI_ARCH)
LDFLAGS :=
COMMONCFLAGS := -O2
CFLAGS += $(COMMONCFLAGS)
CXXFLAGS += $(COMMONCFLAGS)
LIBS := -lpthread -ldl

LIB_VER:=$(LIB_MAJOR_VER).$(LIB_MINOR_VER)
LIB_BASENAME:=libgdrapi.so
LIB_DYNAMIC=$(LIB_BASENAME).$(LIB_VER)
LIB_SONAME=$(LIB_BASENAME).$(LIB_MAJOR_VER)
LIB:=$(LIB_DYNAMIC)

LIBSRCS := gdrapi.c
ifeq ($(GDRAPI_ARCH),X86)
LIBSRCS += memcpy_avx.c memcpy_sse.c memcpy_sse41.c
endif

LIBOBJS := $(LIBSRCS:.c=.o)

all: config lib

config:
	@ echo "GDRAPI_ARCH=$(GDRAPI_ARCH)"

lib: $(LIB)

#static
#$(LIB): $(LIB)($(LIBOBJS))
#dynamic
$(LIBOBJS): CFLAGS+=-fPIC
$(LIB): $(LIBOBJS)
	$(CC) -shared -Wl,-soname,$(LIB_SONAME) -o $@ $^
	PATH=/sbin:/usr/sbin:$$PATH; ldconfig -n $(PWD)
	ln -sf $(LIB_DYNAMIC) $(LIB_SONAME)
	ln -sf $(LIB_SONAME) $(LIB_BASENAME)

# special-cased to finely tune the arch option
memcpy_avx.o: memcpy_avx.c
	$(COMPILE.c) -mavx -o $@ $^

memcpy_sse.o: memcpy_sse.c
	$(COMPILE.c) -msse -o $@ $^

memcpy_sse41.o: memcpy_sse41.c
	$(COMPILE.c) -msse4.1 -o $@ $^

gdrapi.o: gdrapi.c $(GDRAPI_INC)/gdrapi.h gdrapi_internal.h gdrdrv/gdrdrv.h

clean:
	rm -f *.o $(EXES) lib*.so* *~ core.*

.PHONY: clean all lib
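The "special-cased" rules above build the per-ISA copy units (memcpy_avx.c, memcpy_sse.c, memcpy_sse41.c, none of which are shown in this diff) with the matching -mavx/-msse flags. The snippet below is only an illustrative sketch of the kind of write-combining-friendly streaming copy such a unit contains, compiled with -mavx; it is not the project's actual memcpy_avx.c, and it assumes 32-byte-aligned pointers and a size that is a multiple of 32.

/* Illustrative AVX streaming copy into a write-combined BAR mapping.
 * Assumes dest/src are 32-byte aligned and n_bytes % 32 == 0. */
#include <immintrin.h>
#include <stddef.h>

static void stream_copy_avx_sketch(void *dest, const void *src, size_t n_bytes)
{
    __m256i *d = (__m256i *)dest;
    const __m256i *s = (const __m256i *)src;
    size_t n = n_bytes / sizeof(__m256i);

    for (size_t i = 0; i < n; ++i) {
        __m256i v = _mm256_load_si256(&s[i]);  /* aligned 32B load from host memory */
        _mm256_stream_si256(&d[i], v);         /* non-temporal store into the WC mapping */
    }
    _mm_sfence();  /* drain write-combining buffers, as gdr_copy_to_mapping also fences */
}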
@@ -0,0 +1,877 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/socket.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <netdb.h>
|
||||||
|
#include <malloc.h>
|
||||||
|
#include <getopt.h>
|
||||||
|
#include <arpa/inet.h>
|
||||||
|
#include <sys/ioctl.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <asm/types.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <sys/queue.h>
|
||||||
|
|
||||||
|
#include "gdrconfig.h"
|
||||||
|
#include "gdrapi.h"
|
||||||
|
#include "gdrdrv.h"
|
||||||
|
#include "gdrapi_internal.h"
|
||||||
|
|
||||||
|
// logging/tracing
|
||||||
|
|
||||||
|
enum gdrcopy_msg_level {
|
||||||
|
GDRCOPY_MSG_DEBUG = 1,
|
||||||
|
GDRCOPY_MSG_INFO,
|
||||||
|
GDRCOPY_MSG_WARN,
|
||||||
|
GDRCOPY_MSG_ERROR
|
||||||
|
};
|
||||||
|
|
||||||
|
static int gdr_msg_level = GDRCOPY_MSG_ERROR;
|
||||||
|
static int gdr_enable_logging = -1;
|
||||||
|
|
||||||
|
static void gdr_msg(enum gdrcopy_msg_level lvl, const char* fmt, ...)
|
||||||
|
{
|
||||||
|
if (-1 == gdr_enable_logging) {
|
||||||
|
const char *env = getenv("GDRCOPY_ENABLE_LOGGING");
|
||||||
|
if (env)
|
||||||
|
gdr_enable_logging = 1;
|
||||||
|
else
|
||||||
|
gdr_enable_logging = 0;
|
||||||
|
|
||||||
|
env = getenv("GDRCOPY_LOG_LEVEL");
|
||||||
|
if (env)
|
||||||
|
gdr_msg_level = atoi(env);
|
||||||
|
}
|
||||||
|
if (gdr_enable_logging) {
|
||||||
|
if (lvl >= gdr_msg_level) {
|
||||||
|
va_list ap;
|
||||||
|
va_start(ap, fmt);
|
||||||
|
vfprintf(stderr, fmt, ap);
|
||||||
|
va_end(ap);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define gdr_dbg(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_DEBUG, "DBG: " FMT, ## ARGS)
|
||||||
|
#define gdr_dbgc(C, FMT, ARGS...) do { static int gdr_dbg_cnt=(C); if (gdr_dbg_cnt) { gdr_dbg(FMT, ## ARGS); --gdr_dbg_cnt; }} while (0)
|
||||||
|
#define gdr_info(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_INFO, "INFO: " FMT, ## ARGS)
|
||||||
|
#define gdr_warn(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_WARN, "WARN: " FMT, ## ARGS)
|
||||||
|
#define gdr_err(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_ERROR, "ERR: " FMT, ## ARGS)
|
||||||
|
|
||||||
|
static gdr_memh_t *to_memh(gdr_mh_t mh) {
|
||||||
|
return (gdr_memh_t *)mh.h;
|
||||||
|
}
|
||||||
|
|
||||||
|
static gdr_mh_t from_memh(gdr_memh_t *memh) {
|
||||||
|
gdr_mh_t mh;
|
||||||
|
mh.h = (unsigned long)memh;
|
||||||
|
return mh;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gdr_init_cpu_flags(void);
|
||||||
|
|
||||||
|
static inline int gdr_is_mapped(const gdr_mapping_type_t mapping_type)
|
||||||
|
{
|
||||||
|
return mapping_type != GDR_MAPPING_TYPE_NONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
gdr_t gdr_open(void)
|
||||||
|
{
|
||||||
|
gdr_t g = NULL;
|
||||||
|
const char *gdrinode = "/dev/gdrdrv";
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
g = calloc(1, sizeof(*g));
|
||||||
|
if (!g) {
|
||||||
|
gdr_err("error while allocating memory\n");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int fd = open(gdrinode, O_RDWR | O_CLOEXEC);
|
||||||
|
if (-1 == fd ) {
|
||||||
|
ret = errno;
|
||||||
|
gdr_err("error opening driver (errno=%d/%s)\n", ret, strerror(ret));
|
||||||
|
goto err_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct GDRDRV_IOC_GET_VERSION_PARAMS params;
|
||||||
|
int retcode = ioctl(fd, GDRDRV_IOC_GET_VERSION, ¶ms);
|
||||||
|
if (0 != retcode) {
|
||||||
|
ret = errno;
|
||||||
|
gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret);
|
||||||
|
goto err_fd;
|
||||||
|
}
|
||||||
|
if (params.gdrdrv_version < MINIMUM_GDRDRV_VERSION) {
|
||||||
|
gdr_err(
|
||||||
|
"The minimum required gdrdrv driver version is %d.%d but the current gdrdrv version is %d.%d\n",
|
||||||
|
MINIMUM_GDRDRV_MAJOR_VERSION,
|
||||||
|
MINIMUM_GDRDRV_MINOR_VERSION,
|
||||||
|
params.gdrdrv_version >> MAJOR_VERSION_SHIFT,
|
||||||
|
params.gdrdrv_version & MINOR_VERSION_MASK
|
||||||
|
);
|
||||||
|
goto err_fd;
|
||||||
|
}
|
||||||
|
if (params.minimum_gdr_api_version > GDR_API_VERSION) {
|
||||||
|
gdr_err(
|
||||||
|
"gdrdrv driver requires libgdrapi version %d.%d or above but the current libgdrapi version is %d.%d\n",
|
||||||
|
params.minimum_gdr_api_version >> MAJOR_VERSION_SHIFT,
|
||||||
|
params.minimum_gdr_api_version & MINOR_VERSION_MASK,
|
||||||
|
GDR_API_MAJOR_VERSION,
|
||||||
|
GDR_API_MINOR_VERSION
|
||||||
|
);
|
||||||
|
goto err_fd;
|
||||||
|
}
|
||||||
|
|
||||||
|
g->fd = fd;
|
||||||
|
LIST_INIT(&g->memhs);
|
||||||
|
|
||||||
|
gdr_init_cpu_flags();
|
||||||
|
|
||||||
|
// Initialize page_shift, page_size, and page_mask.
|
||||||
|
g->page_size = sysconf(_SC_PAGESIZE);
|
||||||
|
g->page_mask = ~(g->page_size - 1);
|
||||||
|
|
||||||
|
size_t ps_tmp = g->page_size;
|
||||||
|
g->page_shift = -1;
|
||||||
|
while (ps_tmp > 0) {
|
||||||
|
++g->page_shift;
|
||||||
|
if ((ps_tmp & 0x1) == 1)
|
||||||
|
break;
|
||||||
|
ps_tmp >>= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
g->gdrdrv_version = params.gdrdrv_version;
|
||||||
|
|
||||||
|
return g;
|
||||||
|
|
||||||
|
err_fd:
|
||||||
|
close(fd);
|
||||||
|
|
||||||
|
err_mem:
|
||||||
|
free(g);
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_close(gdr_t g)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
int retcode;
|
||||||
|
gdr_memh_t *mh, *next_mh;
|
||||||
|
|
||||||
|
mh = g->memhs.lh_first;
|
||||||
|
while (mh != NULL) {
|
||||||
|
// gdr_unpin_buffer frees mh, so we need to get the next one
|
||||||
|
// beforehand.
|
||||||
|
next_mh = mh->entries.le_next;
|
||||||
|
ret = gdr_unpin_buffer(g, from_memh(mh));
|
||||||
|
if (ret) {
|
||||||
|
gdr_err("error unpinning buffer inside gdr_close (errno=%d/%s)\n", ret, strerror(ret));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
mh = next_mh;
|
||||||
|
}
|
||||||
|
|
||||||
|
retcode = close(g->fd);
|
||||||
|
if (-1 == retcode) {
|
||||||
|
ret = errno;
|
||||||
|
gdr_err("error closing driver (errno=%d/%s)\n", ret, strerror(ret));
|
||||||
|
}
|
||||||
|
g->fd = 0;
|
||||||
|
free(g);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
int retcode;
|
||||||
|
|
||||||
|
if (!handle) {
|
||||||
|
return EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
gdr_memh_t *mh = calloc(1, sizeof(gdr_memh_t));
|
||||||
|
if (!mh) {
|
||||||
|
return ENOMEM;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct GDRDRV_IOC_PIN_BUFFER_PARAMS params;
|
||||||
|
params.addr = addr;
|
||||||
|
params.size = size;
|
||||||
|
params.p2p_token = p2p_token;
|
||||||
|
params.va_space = va_space;
|
||||||
|
params.handle = 0;
|
||||||
|
|
||||||
|
retcode = ioctl(g->fd, GDRDRV_IOC_PIN_BUFFER, ¶ms);
|
||||||
|
if (0 != retcode) {
|
||||||
|
ret = errno;
|
||||||
|
gdr_err("ioctl error (errno=%d)\n", ret);
|
||||||
|
free(mh);
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
mh->handle = params.handle;
|
||||||
|
LIST_INSERT_HEAD(&g->memhs, mh, entries);
|
||||||
|
*handle = from_memh(mh);
|
||||||
|
err:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
int retcode;
|
||||||
|
gdr_memh_t *mh = to_memh(handle);
|
||||||
|
|
||||||
|
struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS params;
|
||||||
|
params.handle = mh->handle;
|
||||||
|
retcode = ioctl(g->fd, GDRDRV_IOC_UNPIN_BUFFER, ¶ms);
|
||||||
|
if (0 != retcode) {
|
||||||
|
ret = errno;
|
||||||
|
gdr_err("ioctl error (errno=%d)\n", ret);
|
||||||
|
}
|
||||||
|
LIST_REMOVE(mh, entries);
|
||||||
|
free(mh);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
int retcode;
|
||||||
|
gdr_memh_t *mh = to_memh(handle);
|
||||||
|
|
||||||
|
struct GDRDRV_IOC_GET_CB_FLAG_PARAMS params;
|
||||||
|
params.handle = mh->handle;
|
||||||
|
retcode = ioctl(g->fd, GDRDRV_IOC_GET_CB_FLAG, ¶ms);
|
||||||
|
if (0 != retcode) {
|
||||||
|
ret = errno;
|
||||||
|
gdr_err("ioctl error (errno=%d)\n", ret);
|
||||||
|
} else {
|
||||||
|
*flag = params.flag;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_get_info_v2(gdr_t g, gdr_mh_t handle, gdr_info_v2_t *info)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
int retcode;
|
||||||
|
gdr_memh_t *mh = to_memh(handle);
|
||||||
|
|
||||||
|
if (g->gdrdrv_version >= GDRDRV_MINIMUM_VERSION_WITH_GET_INFO_V2) {
|
||||||
|
struct GDRDRV_IOC_GET_INFO_V2_PARAMS params;
|
||||||
|
params.handle = mh->handle;
|
||||||
|
|
||||||
|
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO_V2, ¶ms);
|
||||||
|
if (0 != retcode) {
|
||||||
|
ret = errno;
|
||||||
|
gdr_err("ioctl error (errno=%d)\n", ret);
|
||||||
|
goto out;
|
||||||
|
} else {
|
||||||
|
info->va = params.va;
|
||||||
|
info->mapped_size = params.mapped_size;
|
||||||
|
info->page_size = params.page_size;
|
||||||
|
info->tm_cycles = params.tm_cycles;
|
||||||
|
info->cycles_per_ms = params.tsc_khz;
|
||||||
|
info->mapped = gdr_is_mapped(params.mapping_type);
|
||||||
|
info->wc_mapping = (params.mapping_type == GDR_MAPPING_TYPE_WC);
|
||||||
|
info->mapping_type = params.mapping_type;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
struct GDRDRV_IOC_GET_INFO_PARAMS params;
|
||||||
|
params.handle = mh->handle;
|
||||||
|
|
||||||
|
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, ¶ms);
|
||||||
|
if (0 != retcode) {
|
||||||
|
ret = errno;
|
||||||
|
gdr_err("ioctl error (errno=%d)\n", ret);
|
||||||
|
goto out;
|
||||||
|
} else {
|
||||||
|
info->va = params.va;
|
||||||
|
info->mapped_size = params.mapped_size;
|
||||||
|
info->page_size = params.page_size;
|
||||||
|
info->tm_cycles = params.tm_cycles;
|
||||||
|
info->cycles_per_ms = params.tsc_khz;
|
||||||
|
info->mapped = params.mapped;
|
||||||
|
info->wc_mapping = params.wc_mapping;
|
||||||
|
info->mapping_type = params.mapped ? (params.wc_mapping ? GDR_MAPPING_TYPE_WC : GDR_MAPPING_TYPE_CACHING) : GDR_MAPPING_TYPE_NONE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_map(gdr_t g, gdr_mh_t handle, void **ptr_va, size_t size)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
gdr_info_v2_t info = {0,};
|
||||||
|
gdr_memh_t *mh = to_memh(handle);
|
||||||
|
|
||||||
|
if (gdr_is_mapped(mh->mapping_type)) {
|
||||||
|
gdr_err("mh is mapped already\n");
|
||||||
|
return EAGAIN;
|
||||||
|
}
|
||||||
|
size_t rounded_size = (size + g->page_size - 1) & g->page_mask;
|
||||||
|
off_t magic_off = (off_t)mh->handle << g->page_shift;
|
||||||
|
void *mmio = mmap(NULL, rounded_size, PROT_READ|PROT_WRITE, MAP_SHARED, g->fd, magic_off);
|
||||||
|
if (mmio == MAP_FAILED) {
|
||||||
|
int __errno = errno;
|
||||||
|
mmio = NULL;
|
||||||
|
gdr_err("error %s(%d) while mapping handle %x, rounded_size=%zu offset=%llx\n",
|
||||||
|
strerror(__errno), __errno, handle, rounded_size, (long long unsigned)magic_off);
|
||||||
|
ret = __errno;
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
*ptr_va = mmio;
|
||||||
|
ret = gdr_get_info_v2(g, handle, &info);
|
||||||
|
if (ret) {
|
||||||
|
gdr_err("error %d from get_info, munmapping before exiting\n", ret);
|
||||||
|
munmap(mmio, rounded_size);
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
if (!gdr_is_mapped(info.mapping_type)) {
|
||||||
|
// Race could cause this issue.
|
||||||
|
// E.g., gdr_map and cuMemFree are triggered concurrently.
|
||||||
|
// The above mmap is successful but cuMemFree causes unmapping immediately.
|
||||||
|
gdr_err("mh is not mapped\n");
|
||||||
|
ret = EAGAIN;
|
||||||
|
}
|
||||||
|
mh->mapping_type = info.mapping_type;
|
||||||
|
gdr_dbg("mapping_type=%d\n", mh->mapping_type);
|
||||||
|
err:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
int retcode = 0;
|
||||||
|
size_t rounded_size;
|
||||||
|
gdr_memh_t *mh = to_memh(handle);
|
||||||
|
|
||||||
|
rounded_size = (size + g->page_size - 1) & g->page_mask;
|
||||||
|
|
||||||
|
if (!gdr_is_mapped(mh->mapping_type)) {
|
||||||
|
gdr_err("mh is not mapped yet\n");
|
||||||
|
return EINVAL;
|
||||||
|
}
|
||||||
|
retcode = munmap(va, rounded_size);
|
||||||
|
if (-1 == retcode) {
|
||||||
|
int __errno = errno;
|
||||||
|
gdr_err("error %s(%d) while unmapping handle %x, rounded_size=%zu\n",
|
||||||
|
strerror(__errno), __errno, handle, rounded_size);
|
||||||
|
ret = __errno;
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
mh->mapping_type = GDR_MAPPING_TYPE_NONE;
|
||||||
|
err:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef GDRAPI_X86
|
||||||
|
#include <cpuid.h>
|
||||||
|
|
||||||
|
// prepare for AVX2 implementation
|
||||||
|
#ifndef bit_AVX2
|
||||||
|
/* Extended Features (%eax == 7) */
|
||||||
|
/* %ebx */
|
||||||
|
#define bit_AVX2 (1 << 5)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
extern int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes);
|
||||||
|
extern int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes);
|
||||||
|
extern int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes);
|
||||||
|
extern int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes);
|
||||||
|
extern int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes);
|
||||||
|
static inline void wc_store_fence(void) { _mm_sfence(); }
|
||||||
|
#define PREFERS_STORE_UNROLL4 0
|
||||||
|
#define PREFERS_STORE_UNROLL8 0
|
||||||
|
#define PREFERS_LOAD_UNROLL4 0
|
||||||
|
#define PREFERS_LOAD_UNROLL8 0
|
||||||
|
// GDRAPI_X86
|
||||||
|
|
||||||
|
#elif defined(GDRAPI_POWER)
|
||||||
|
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static inline void wc_store_fence(void) { asm volatile("sync") ; }
|
||||||
|
#define PREFERS_STORE_UNROLL4 1
|
||||||
|
#define PREFERS_STORE_UNROLL8 0
|
||||||
|
#define PREFERS_LOAD_UNROLL4 0
|
||||||
|
#define PREFERS_LOAD_UNROLL8 1
|
||||||
|
// GDRAPI_POWER
|
||||||
|
|
||||||
|
#elif defined(GDRAPI_ARM64)
|
||||||
|
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
|
||||||
|
static inline void wc_store_fence(void) { asm volatile("DMB ishld") ; }
|
||||||
|
#define PREFERS_STORE_UNROLL4 0
|
||||||
|
#define PREFERS_STORE_UNROLL8 0
|
||||||
|
#define PREFERS_LOAD_UNROLL4 0
|
||||||
|
#define PREFERS_LOAD_UNROLL8 0
|
||||||
|
// GDRAPI_ARM64
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static int has_sse = 0;
|
||||||
|
static int has_sse2 = 0;
|
||||||
|
static int has_sse4_1 = 0;
|
||||||
|
static int has_avx = 0;
|
||||||
|
static int has_avx2 = 0;
|
||||||
|
|
||||||
|
static void gdr_init_cpu_flags(void)
|
||||||
|
{
|
||||||
|
#ifdef GDRAPI_X86
|
||||||
|
unsigned int info_type = 0x00000001;
|
||||||
|
unsigned int ax, bx, cx, dx;
|
||||||
|
if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
|
||||||
|
has_sse4_1 = ((cx & bit_SSE4_1) != 0);
|
||||||
|
has_avx = ((cx & bit_AVX) != 0);
|
||||||
|
has_sse = ((dx & bit_SSE) != 0);
|
||||||
|
has_sse2 = ((dx & bit_SSE2) != 0);
|
||||||
|
gdr_dbg("sse4_1=%d avx=%d sse=%d sse2=%d\n", has_sse4_1, has_avx, has_sse, has_sse2);
|
||||||
|
}
|
||||||
|
#ifdef bit_AVX2
|
||||||
|
info_type = 0x7;
|
||||||
|
if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
|
||||||
|
has_avx2 = bx & bit_AVX2;
|
||||||
|
}
|
||||||
|
#endif // bit_AVX2
|
||||||
|
#endif // GDRAPI_X86
|
||||||
|
|
||||||
|
#ifdef GDRAPI_POWER
|
||||||
|
// detect and enable Altivec/SMX support
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// note: more than one implementation may be compiled in
|
||||||
|
|
||||||
|
static void unroll8_memcpy(void *dst, const void *src, size_t size)
|
||||||
|
{
|
||||||
|
const uint64_t *r = (const uint64_t *)src;
|
||||||
|
uint64_t *w = (uint64_t *)dst;
|
||||||
|
size_t nw = size / sizeof(*r);
|
||||||
|
assert(size % sizeof(*r) == 0);
|
||||||
|
|
||||||
|
while (nw) {
|
||||||
|
if (0 == (nw & 3)) {
|
||||||
|
uint64_t r0 = r[0];
|
||||||
|
uint64_t r1 = r[1];
|
||||||
|
uint64_t r2 = r[2];
|
||||||
|
uint64_t r3 = r[3];
|
||||||
|
w[0] = r0;
|
||||||
|
w[1] = r1;
|
||||||
|
w[2] = r2;
|
||||||
|
w[3] = r3;
|
||||||
|
r += 4;
|
||||||
|
w += 4;
|
||||||
|
nw -= 4;
|
||||||
|
} else if (0 == (nw & 1)) {
|
||||||
|
uint64_t r0 = r[0];
|
||||||
|
uint64_t r1 = r[1];
|
||||||
|
w[0] = r0;
|
||||||
|
w[1] = r1;
|
||||||
|
r += 2;
|
||||||
|
w += 2;
|
||||||
|
nw -= 2;
|
||||||
|
} else {
|
||||||
|
w[0] = r[0];
|
||||||
|
++w;
|
||||||
|
++r;
|
||||||
|
--nw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void unroll4_memcpy(void *dst, const void *src, size_t size)
|
||||||
|
{
|
||||||
|
const uint32_t *r = (const uint32_t *)src;
|
||||||
|
uint32_t *w = (uint32_t *)dst;
|
||||||
|
size_t nw = size / sizeof(*r);
|
||||||
|
assert(size % sizeof(*r) == 0);
|
||||||
|
|
||||||
|
while (nw) {
|
||||||
|
if (0 == (nw & 3)) {
|
||||||
|
uint32_t r0 = r[0];
|
||||||
|
uint32_t r1 = r[1];
|
||||||
|
uint32_t r2 = r[2];
|
||||||
|
uint32_t r3 = r[3];
|
||||||
|
w[0] = r0;
|
||||||
|
w[1] = r1;
|
||||||
|
w[2] = r2;
|
||||||
|
w[3] = r3;
|
||||||
|
r += 4;
|
||||||
|
w += 4;
|
||||||
|
nw -= 4;
|
||||||
|
} else if (0 == (nw & 1)) {
|
||||||
|
uint32_t r0 = r[0];
|
||||||
|
uint32_t r1 = r[1];
|
||||||
|
w[0] = r0;
|
||||||
|
w[1] = r1;
|
||||||
|
r += 2;
|
||||||
|
w += 2;
|
||||||
|
nw -= 2;
|
||||||
|
} else {
|
||||||
|
w[0] = r[0];
|
||||||
|
++w;
|
||||||
|
++r;
|
||||||
|
--nw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int is_aligned(unsigned long value, unsigned powof2)
|
||||||
|
{
|
||||||
|
return ((value & (powof2-1)) == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int ptr_is_aligned(const void *ptr, unsigned powof2)
|
||||||
|
{
|
||||||
|
unsigned long addr = (unsigned long)ptr;
|
||||||
|
return is_aligned(addr, powof2);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void memcpy_to_device_mapping(void *dst, const void *src, size_t size)
|
||||||
|
{
|
||||||
|
size_t remaining_size = size;
|
||||||
|
void *curr_map_d_ptr = dst;
|
||||||
|
const void *curr_h_ptr = src;
|
||||||
|
size_t copy_size = 0;
|
||||||
|
while (remaining_size > 0) {
|
||||||
|
if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) {
|
||||||
|
// We have proper alignment. memcpy can be used here. Although
|
||||||
|
// unlikely, this might break in the future if the implementation
|
||||||
|
// of memcpy changes to generate unaligned access. Still, we choose
|
||||||
|
// memcpy because it provides better performance than our simple
|
||||||
|
// aligned-access workaround.
|
||||||
|
memcpy(curr_map_d_ptr, curr_h_ptr, remaining_size);
|
||||||
|
copy_size = remaining_size;
|
||||||
|
}
|
||||||
|
else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) {
|
||||||
|
// memcpy cannot be used here because its internal
|
||||||
|
// implementation may end up in an unaligned access.
|
||||||
|
WRITE_ONCE(*(uint64_t *)curr_map_d_ptr, *(uint64_t *)curr_h_ptr);
|
||||||
|
copy_size = sizeof(uint64_t);
|
||||||
|
}
|
||||||
|
else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) {
|
||||||
|
WRITE_ONCE(*(uint32_t *)curr_map_d_ptr, *(uint32_t *)curr_h_ptr);
|
||||||
|
copy_size = sizeof(uint32_t);
|
||||||
|
}
|
||||||
|
else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) {
|
||||||
|
WRITE_ONCE(*(uint16_t *)curr_map_d_ptr, *(uint16_t *)curr_h_ptr);
|
||||||
|
copy_size = sizeof(uint16_t);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
WRITE_ONCE(*(uint8_t *)curr_map_d_ptr, *(uint8_t *)curr_h_ptr);
|
||||||
|
copy_size = sizeof(uint8_t);
|
||||||
|
}
|
||||||
|
remaining_size -= copy_size;
|
||||||
|
curr_map_d_ptr = (void *)((uintptr_t)curr_map_d_ptr + copy_size);
|
||||||
|
curr_h_ptr = (const void *)((uintptr_t)curr_h_ptr + copy_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void memcpy_from_device_mapping(void *dst, const void *src, size_t size)
|
||||||
|
{
|
||||||
|
size_t remaining_size = size;
|
||||||
|
const void *curr_map_d_ptr = src;
|
||||||
|
void *curr_h_ptr = dst;
|
||||||
|
size_t copy_size = 0;
|
||||||
|
while (remaining_size > 0) {
|
||||||
|
if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) {
|
||||||
|
// We have proper alignment. memcpy can be used here. Although
|
||||||
|
// unlikely, this might break in the future if the implementation
|
||||||
|
// of memcpy changes to generate unaligned access. Still, we choose
|
||||||
|
// memcpy because it provides better performance than our simple
|
||||||
|
// aligned-access workaround.
|
||||||
|
memcpy(curr_h_ptr, curr_map_d_ptr, remaining_size);
|
||||||
|
copy_size = remaining_size;
|
||||||
|
}
|
||||||
|
else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) {
|
||||||
|
// memcpy cannot be used here because its internal
|
||||||
|
// implementation may end up in an unaligned access.
|
||||||
|
*(uint64_t *)curr_h_ptr = READ_ONCE(*(uint64_t *)curr_map_d_ptr);
|
||||||
|
copy_size = sizeof(uint64_t);
|
||||||
|
}
|
||||||
|
else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) {
|
||||||
|
*(uint32_t *)curr_h_ptr = READ_ONCE(*(uint32_t *)curr_map_d_ptr);
|
||||||
|
copy_size = sizeof(uint32_t);
|
||||||
|
}
|
||||||
|
else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) {
|
||||||
|
*(uint16_t *)curr_h_ptr = READ_ONCE(*(uint16_t *)curr_map_d_ptr);
|
||||||
|
copy_size = sizeof(uint16_t);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
*(uint8_t *)curr_h_ptr = READ_ONCE(*(uint8_t *)curr_map_d_ptr);
|
||||||
|
copy_size = sizeof(uint8_t);
|
||||||
|
}
|
||||||
|
remaining_size -= copy_size;
|
||||||
|
curr_map_d_ptr = (const void *)((uintptr_t)curr_map_d_ptr + copy_size);
|
||||||
|
curr_h_ptr = (void *)((uintptr_t)curr_h_ptr + copy_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int gdr_copy_to_mapping_internal(void *map_d_ptr, const void *h_ptr, size_t size, gdr_mapping_type_t mapping_type)
|
||||||
|
{
|
||||||
|
const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC);
|
||||||
|
const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE);
|
||||||
|
do {
|
||||||
|
// For very small sizes and aligned pointers, we use simple store.
|
||||||
|
if (size == sizeof(uint8_t)) {
|
||||||
|
WRITE_ONCE(*(uint8_t *)map_d_ptr, *(uint8_t *)h_ptr);
|
||||||
|
goto do_fence;
|
||||||
|
} else if (size == sizeof(uint16_t) && ptr_is_aligned(map_d_ptr, sizeof(uint16_t))) {
|
||||||
|
WRITE_ONCE(*(uint16_t *)map_d_ptr, *(uint16_t *)h_ptr);
|
||||||
|
goto do_fence;
|
||||||
|
} else if (size == sizeof(uint32_t) && ptr_is_aligned(map_d_ptr, sizeof(uint32_t))) {
|
||||||
|
WRITE_ONCE(*(uint32_t *)map_d_ptr, *(uint32_t *)h_ptr);
|
||||||
|
goto do_fence;
|
||||||
|
} else if (size == sizeof(uint64_t) && ptr_is_aligned(map_d_ptr, sizeof(uint64_t))) {
|
||||||
|
WRITE_ONCE(*(uint64_t *)map_d_ptr, *(uint64_t *)h_ptr);
|
||||||
|
goto do_fence;
|
||||||
|
}
|
||||||
|
|
||||||
|
// pick the most performing implementation compatible with the platform we are running on
|
||||||
|
// NOTE: write fences are included in functions below
|
||||||
|
if (has_avx) {
|
||||||
|
assert(wc_mapping);
|
||||||
|
gdr_dbgc(1, "using AVX implementation of gdr_copy_to_mapping\n");
|
||||||
|
memcpy_uncached_store_avx(map_d_ptr, h_ptr, size);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
if (has_sse) {
|
||||||
|
assert(wc_mapping);
|
||||||
|
gdr_dbgc(1, "using SSE implementation of gdr_copy_to_mapping\n");
|
||||||
|
memcpy_uncached_store_sse(map_d_ptr, h_ptr, size);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// on POWER, compiler/libc memcpy is not optimal for MMIO
|
||||||
|
// 64bit stores are not better than 32bit ones, so we prefer the latter.
|
||||||
|
// NOTE: if preferred but not aligned, a better implementation would still try to
|
||||||
|
// use byte sized stores to align map_d_ptr and h_ptr to next word.
|
||||||
|
// NOTE2: unroll*_memcpy and memcpy do not include fencing.
|
||||||
|
if (wc_mapping && PREFERS_STORE_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) {
|
||||||
|
gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_to_mapping\n");
|
||||||
|
unroll8_memcpy(map_d_ptr, h_ptr, size);
|
||||||
|
} else if (wc_mapping && PREFERS_STORE_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) {
|
||||||
|
gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_to_mapping\n");
|
||||||
|
unroll4_memcpy(map_d_ptr, h_ptr, size);
|
||||||
|
} else if (device_mapping) {
|
||||||
|
gdr_dbgc(1, "using device-mapping copy for gdr_copy_to_mapping with device mapping\n");
|
||||||
|
memcpy_to_device_mapping(map_d_ptr, h_ptr, size);
|
||||||
|
} else {
|
||||||
|
gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_to_mapping\n");
|
||||||
|
memcpy(map_d_ptr, h_ptr, size);
|
||||||
|
}
|
||||||
|
} while (0);
|
||||||
|
|
||||||
|
do_fence:
|
||||||
|
if (wc_mapping) {
|
||||||
|
// fencing is needed even for plain memcpy(), due to performance
|
||||||
|
// being hit by delayed flushing of WC buffers
|
||||||
|
wc_store_fence();
|
||||||
|
}
|
||||||
|
|
||||||
|
out:
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int gdr_copy_from_mapping_internal(void *h_ptr, const void *map_d_ptr, size_t size, gdr_mapping_type_t mapping_type)
|
||||||
|
{
|
||||||
|
const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC);
|
||||||
|
const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE);
|
||||||
|
|
||||||
|
do {
|
||||||
|
// pick the most performing implementation compatible with the platform we are running on
|
||||||
|
if (has_sse4_1) {
|
||||||
|
assert(wc_mapping);
|
||||||
|
gdr_dbgc(1, "using SSE4_1 implementation of gdr_copy_from_mapping\n");
|
||||||
|
memcpy_uncached_load_sse41(h_ptr, map_d_ptr, size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (has_avx) {
|
||||||
|
assert(wc_mapping);
|
||||||
|
gdr_dbgc(1, "using AVX implementation of gdr_copy_from_mapping\n");
|
||||||
|
memcpy_cached_store_avx(h_ptr, map_d_ptr, size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (has_sse) {
|
||||||
|
assert(wc_mapping);
|
||||||
|
gdr_dbgc(1, "using SSE implementation of gdr_copy_from_mapping\n");
|
||||||
|
memcpy_cached_store_sse(h_ptr, map_d_ptr, size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// on POWER, compiler memcpy is not optimal for MMIO
|
||||||
|
// 64bit loads have 2x the BW of 32bit ones
|
||||||
|
if (wc_mapping && PREFERS_LOAD_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) {
|
||||||
|
gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_from_mapping\n");
|
||||||
|
unroll8_memcpy(h_ptr, map_d_ptr, size);
|
||||||
|
} else if (wc_mapping && PREFERS_LOAD_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) {
|
||||||
|
gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_from_mapping\n");
|
||||||
|
unroll4_memcpy(h_ptr, map_d_ptr, size);
|
||||||
|
} else if (device_mapping) {
|
||||||
|
gdr_dbgc(1, "using device-mapping copy for gdr_copy_from_mapping\n");
|
||||||
|
memcpy_from_device_mapping(h_ptr, map_d_ptr, size);
|
||||||
|
} else {
|
||||||
|
gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_from_mapping\n");
|
||||||
|
memcpy(h_ptr, map_d_ptr, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// note: fencing is not needed because plain stores are used
|
||||||
|
// if non-temporal/uncached stores were used on x86, a proper fence would be needed instead
|
||||||
|
// if (wc_mapping)
|
||||||
|
// wc_store_fence();
|
||||||
|
} while (0);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size)
|
||||||
|
{
|
||||||
|
gdr_memh_t *mh = to_memh(handle);
|
||||||
|
if (unlikely(!gdr_is_mapped(mh->mapping_type))) {
|
||||||
|
gdr_err("mh is not mapped yet\n");
|
||||||
|
return EINVAL;
|
||||||
|
}
|
||||||
|
if (unlikely(size == 0))
|
||||||
|
return 0;
|
||||||
|
return gdr_copy_to_mapping_internal(map_d_ptr, h_ptr, size, mh->mapping_type);
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size)
|
||||||
|
{
|
||||||
|
gdr_memh_t *mh = to_memh(handle);
|
||||||
|
if (unlikely(!gdr_is_mapped(mh->mapping_type))) {
|
||||||
|
gdr_err("mh is not mapped yet\n");
|
||||||
|
return EINVAL;
|
||||||
|
}
|
||||||
|
if (unlikely(size == 0))
|
||||||
|
return 0;
|
||||||
|
return gdr_copy_from_mapping_internal(h_ptr, map_d_ptr, size, mh->mapping_type);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void gdr_runtime_get_version(int *major, int *minor)
|
||||||
|
{
|
||||||
|
*major = GDR_API_MAJOR_VERSION;
|
||||||
|
*minor = GDR_API_MINOR_VERSION;
|
||||||
|
}
|
||||||
|
|
||||||
|
int gdr_driver_get_version(gdr_t g, int *major, int *minor)
|
||||||
|
{
|
||||||
|
assert(g != NULL);
|
||||||
|
assert(g->fd > 0);
|
||||||
|
|
||||||
|
struct GDRDRV_IOC_GET_VERSION_PARAMS params;
|
||||||
|
int retcode = ioctl(g->fd, GDRDRV_IOC_GET_VERSION, ¶ms);
|
||||||
|
if (0 != retcode) {
|
||||||
|
int ret = errno;
|
||||||
|
gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
*major = params.gdrdrv_version >> MAJOR_VERSION_SHIFT;
|
||||||
|
*minor = params.gdrdrv_version & MINOR_VERSION_MASK;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================================================================
|
||||||
|
// Obsoleted API. Provided for compatibility only.
|
||||||
|
// ==============================================================================
|
||||||
|
|
||||||
|
#ifdef gdr_get_info
|
||||||
|
#undef gdr_get_info
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef struct gdr_info_v1 {
|
||||||
|
uint64_t va;
|
||||||
|
uint64_t mapped_size;
|
||||||
|
uint32_t page_size;
|
||||||
|
// tm_cycles and cycles_per_ms are deprecated and will be removed in future.
|
||||||
|
uint64_t tm_cycles;
|
||||||
|
uint32_t cycles_per_ms;
|
||||||
|
unsigned mapped:1;
|
||||||
|
unsigned wc_mapping:1;
|
||||||
|
} gdr_info_v1_t;
|
||||||
|
|
||||||
|
int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_v1_t *info)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
int retcode;
|
||||||
|
gdr_memh_t *mh = to_memh(handle);
|
||||||
|
|
||||||
|
struct GDRDRV_IOC_GET_INFO_PARAMS params;
|
||||||
|
params.handle = mh->handle;
|
||||||
|
|
||||||
|
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, ¶ms);
|
||||||
|
if (0 != retcode) {
|
||||||
|
ret = errno;
|
||||||
|
gdr_err("ioctl error (errno=%d)\n", ret);
|
||||||
|
goto out;
|
||||||
|
} else {
|
||||||
|
info->va = params.va;
|
||||||
|
info->mapped_size = params.mapped_size;
|
||||||
|
info->page_size = params.page_size;
|
||||||
|
info->tm_cycles = params.tm_cycles;
|
||||||
|
info->cycles_per_ms = params.tsc_khz;
|
||||||
|
info->mapped = params.mapped;
|
||||||
|
info->wc_mapping = params.wc_mapping;
|
||||||
|
}
|
||||||
|
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local variables:
|
||||||
|
* c-indent-level: 4
|
||||||
|
* c-basic-offset: 4
|
||||||
|
* tab-width: 4
|
||||||
|
* indent-tabs-mode: nil
|
||||||
|
* End:
|
||||||
|
*/
|
||||||
|
|
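Taken together, the functions in gdrapi.c above form a small pin/map/copy API. The sketch below strings them together in the usual order; it is a minimal illustration that assumes d_addr is the page-aligned address of a GPU buffer obtained elsewhere (e.g. with cuMemAlloc, which is outside this diff) and that the mapping starts at that address, so no va offset correction is needed. Error handling is reduced to early exits.

/* Minimal usage sketch of the libgdrapi calls defined above. The GPU buffer
 * address d_addr and its size are assumed to come from the CUDA driver API;
 * that allocation step is not part of this diff. */
#include <stdio.h>
#include <stddef.h>
#include "gdrapi.h"

static int copy_roundtrip(unsigned long d_addr, size_t size)
{
    gdr_mh_t mh;
    void *map_ptr = NULL;
    char src[64] = "hello from the CPU";
    char dst[64] = {0};

    gdr_t g = gdr_open();                                   /* opens /dev/gdrdrv, checks versions */
    if (!g)
        return -1;
    if (gdr_pin_buffer(g, d_addr, size, 0, 0, &mh) != 0)    /* no p2p token / va_space */
        goto close_out;
    if (gdr_map(g, mh, &map_ptr, size) != 0)                /* mmap the pinned pages into user space */
        goto unpin_out;

    gdr_copy_to_mapping(mh, map_ptr, src, sizeof(src));     /* CPU -> GPU BAR mapping */
    gdr_copy_from_mapping(mh, dst, map_ptr, sizeof(dst));   /* GPU BAR mapping -> CPU */
    printf("read back: %s\n", dst);

    gdr_unmap(g, mh, map_ptr, size);
unpin_out:
    gdr_unpin_buffer(g, mh);
close_out:
    gdr_close(g);
    return 0;
}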
@@ -0,0 +1,74 @@
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef __GDRAPI_INTERNAL_H__
#define __GDRAPI_INTERNAL_H__

#include <stdint.h> // for standard [u]intX_t types
#include <stddef.h>
#include <sys/queue.h>
#include "gdrapi.h"

#ifdef __cplusplus
extern "C" {
#endif

#ifndef unlikely
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (x)
#endif
#endif

#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof((x)) *)&(x))
#endif

#ifndef READ_ONCE
#define READ_ONCE(x) ACCESS_ONCE(x)
#endif

#ifndef WRITE_ONCE
#define WRITE_ONCE(x, v) (ACCESS_ONCE(x) = (v))
#endif

typedef struct gdr_memh_t {
    uint32_t handle;
    LIST_ENTRY(gdr_memh_t) entries;
    gdr_mapping_type_t mapping_type;
} gdr_memh_t;

struct gdr {
    int fd;
    LIST_HEAD(memh_list, gdr_memh_t) memhs;
    size_t page_size;
    size_t page_mask;
    uint8_t page_shift;
    uint32_t gdrdrv_version;
};

#ifdef __cplusplus
}
#endif

#endif // __GDRAPI_INTERNAL_H__
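The ACCESS_ONCE/READ_ONCE/WRITE_ONCE macros above are what the GDR_MAPPING_TYPE_DEVICE copy paths in gdrapi.c rely on: casting through a volatile lvalue forces the compiler to emit exactly one access of exactly the requested width, which matters when the target is a BAR mapping rather than ordinary memory. A tiny illustration, assuming bar points into a mapping obtained via gdr_map(); the function and variable names here are made up for the example.

#include <stdint.h>
#include "gdrapi_internal.h"   /* for READ_ONCE / WRITE_ONCE; include path assumed */

/* Illustrative only: one full-width 32-bit store and one 32-bit load that the
 * compiler may neither split, merge, nor elide. */
static uint32_t poke_and_peek(void *bar)
{
    uint32_t *reg = (uint32_t *)bar;

    WRITE_ONCE(*reg, 0xdeadbeefu);   /* single 32-bit volatile store */
    return READ_ONCE(*reg);          /* single 32-bit volatile load */
}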
@@ -0,0 +1,77 @@
# Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

ifneq ($(KERNELRELEASE),)

kver_major:=$(shell echo $(KERNELRELEASE) | awk -F '.' '// { print $$2;}' )

obj-m := nv-p2p-dummy.o
obj-m += gdrdrv.o

ccflags-y += -I$(NVIDIA_SRC_DIR)

ifeq ($(NVIDIA_IS_OPENSOURCE),y)
ccflags-y += -DGDRDRV_OPENSOURCE_NVIDIA
endif

ifeq ($(HAVE_VM_FLAGS_SET),y)
ccflags-y += -DGDRDRV_HAVE_VM_FLAGS_SET
endif

else

KVER ?= $(shell uname -r)
MODULES_DIR := /lib/modules/$(KVER)
KDIR := $(MODULES_DIR)/build
MODULE_SUBDIR ?= /kernel/drivers/misc/
MODULE_DESTDIR := $(MODULES_DIR)/$(MODULE_SUBDIR)
DEPMOD := /sbin/depmod

export NVIDIA_SRC_DIR ?= $(shell { find /usr/src/kernel-modules/nvidia-* /usr/src/nvidia-* -name "nv-p2p.c" -print -quit | xargs dirname || echo "NVIDIA_DRIVER_MISSING"; } 2>/dev/null)
export NVIDIA_IS_OPENSOURCE ?= $(shell grep -r "MODULE_LICENSE" $(NVIDIA_SRC_DIR)/ | grep -s -q "GPL" && echo "y")

CONF_SCRIPT_DIR ?= $(PWD)/../../scripts
export HAVE_VM_FLAGS_SET ?= $(shell $(CONF_SCRIPT_DIR)/test_gdrdrv_HAVE_VM_FLAGS_SET.sh -k $(KVER))

all: build

build:
	@ echo "Picking NVIDIA driver sources from NVIDIA_SRC_DIR=$(NVIDIA_SRC_DIR). If that does not meet your expectation, you might have a stale driver still around and that might cause problems."
	@ echo "Setting NVIDIA_IS_OPENSOURCE=$(NVIDIA_IS_OPENSOURCE)"
	@ echo "Setting HAVE_VM_FLAGS_SET=$(HAVE_VM_FLAGS_SET)"
	@ $(MAKE) -C $(KDIR) $(MAKE_PARAMS) M=$(PWD) modules

install: build
	[ -d $(DESTDIR)/$(MODULE_DESTDIR) ] || mkdir -p $(DESTDIR)/$(MODULE_DESTDIR)
	cp gdrdrv.ko $(DESTDIR)/$(MODULE_DESTDIR)
	if [ ! -n "$(DESTDIR)" ]; then $(DEPMOD) -r -ae $(KVER); fi

help:
	$(MAKE) -C $(KDIR) M=$$PWD help

clean:
	rm -rf *.o .*.o.d *.ko* *.mod.* .*.cmd Module.symvers modules.order .tmp_versions/ *~ core .depend TAGS .cache.mk *.mod

TAGS:
	find $(KERNELDIR) -follow -name \*.h -o -name \*.c |xargs etags

.PHONY: clean all help install default linksyms nvidia_src_dir build

endif
File diff suppressed because it is too large
@ -0,0 +1,138 @@
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef __GDR_DRV_H__
#define __GDR_DRV_H__

#define GDRDRV_STRINGIFY(s) #s
#define GDRDRV_TOSTRING(s) GDRDRV_STRINGIFY(s)

#define GDRDRV_MAJOR_VERSION_SHIFT 16

#define GDRDRV_MAJOR_VERSION 2
#define GDRDRV_MINOR_VERSION 4
#define GDRDRV_VERSION ((GDRDRV_MAJOR_VERSION << GDRDRV_MAJOR_VERSION_SHIFT) | GDRDRV_MINOR_VERSION)
#define GDRDRV_VERSION_STRING GDRDRV_TOSTRING(GDRDRV_MAJOR_VERSION) "." GDRDRV_TOSTRING(GDRDRV_MINOR_VERSION)

#define MINIMUM_GDR_API_MAJOR_VERSION 2
#define MINIMUM_GDR_API_MINOR_VERSION 0
#define MINIMUM_GDR_API_VERSION ((MINIMUM_GDR_API_MAJOR_VERSION << 16) | MINIMUM_GDR_API_MINOR_VERSION)

#define GDRDRV_MINIMUM_VERSION_WITH_GET_INFO_V2 ((2 << GDRDRV_MAJOR_VERSION_SHIFT) | 4)

#define GDRDRV_IOCTL 0xDA

typedef enum {
    GDR_MR_NONE = 0,
    GDR_MR_WC = 1,
    GDR_MR_CACHING = 2,
    GDR_MR_DEVICE = 3
} gdr_mr_type_t;

typedef __u64 gdr_hnd_t;

//-----------

struct GDRDRV_IOC_PIN_BUFFER_PARAMS
{
    // in
    __u64 addr;
    __u64 size;
    __u64 p2p_token;
    __u32 va_space;
    // out
    gdr_hnd_t handle;
};

#define GDRDRV_IOC_PIN_BUFFER _IOWR(GDRDRV_IOCTL, 1, struct GDRDRV_IOC_PIN_BUFFER_PARAMS)

//-----------

struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS
{
    // in
    gdr_hnd_t handle;
};

#define GDRDRV_IOC_UNPIN_BUFFER _IOWR(GDRDRV_IOCTL, 2, struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS *)

//-----------

struct GDRDRV_IOC_GET_CB_FLAG_PARAMS
{
    // in
    gdr_hnd_t handle;
    // out
    __u32 flag;
};

#define GDRDRV_IOC_GET_CB_FLAG _IOWR(GDRDRV_IOCTL, 3, struct GDRDRV_IOC_GET_CB_FLAG_PARAMS *)

//-----------

struct GDRDRV_IOC_GET_INFO_PARAMS
{
    // in
    gdr_hnd_t handle;
    // out
    __u64 va;
    __u64 mapped_size;
    __u32 page_size;
    __u32 tsc_khz;
    __u64 tm_cycles;
    __u32 mapped;
    __u32 wc_mapping;
};

#define GDRDRV_IOC_GET_INFO _IOWR(GDRDRV_IOCTL, 4, struct GDRDRV_IOC_GET_INFO_PARAMS *)

//-----------

struct GDRDRV_IOC_GET_INFO_V2_PARAMS
{
    // in
    gdr_hnd_t handle;
    // out
    __u64 va;
    __u64 mapped_size;
    __u32 page_size;
    __u32 tsc_khz;
    __u64 tm_cycles;
    __u32 mapping_type;
};

#define GDRDRV_IOC_GET_INFO_V2 _IOWR(GDRDRV_IOCTL, 5, struct GDRDRV_IOC_GET_INFO_V2_PARAMS *)

//-----------

struct GDRDRV_IOC_GET_VERSION_PARAMS
{
    // out
    __u32 gdrdrv_version;
    __u32 minimum_gdr_api_version;
};

#define GDRDRV_IOC_GET_VERSION _IOWR(GDRDRV_IOCTL, 255, struct GDRDRV_IOC_GET_VERSION_PARAMS *)

//-----------

#endif // __GDR_DRV_H__
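A minimal user-space sketch of how a client such as the gdrapi library might drive the ioctl interface declared above; it is illustrative only and not part of this commit. The "/dev/gdrdrv" node name and the "gdrdrv.h" include path are assumptions, not taken from this diff.

/* gdrdrv_ioctl_example.c - hypothetical client, build in user space. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "gdrdrv.h"            /* assumed name of the header above */

int main(void)
{
    int fd = open("/dev/gdrdrv", O_RDWR);   /* assumed device node */
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* Query driver and minimum-API versions (ioctl number 255 above). */
    struct GDRDRV_IOC_GET_VERSION_PARAMS ver = {0};
    if (ioctl(fd, GDRDRV_IOC_GET_VERSION, &ver) == 0) {
        printf("gdrdrv %u.%u, minimum gdr API %u.%u\n",
               ver.gdrdrv_version >> GDRDRV_MAJOR_VERSION_SHIFT,
               ver.gdrdrv_version & 0xffff,
               ver.minimum_gdr_api_version >> 16,
               ver.minimum_gdr_api_version & 0xffff);
    }

    /* A real client fills addr with a GPU VA obtained from CUDA; 0 is a placeholder. */
    struct GDRDRV_IOC_PIN_BUFFER_PARAMS pin = {0};
    pin.addr = 0;           /* GPU virtual address (placeholder)           */
    pin.size = 1 << 16;     /* length to pin, GPU-page aligned in practice */
    pin.p2p_token = 0;      /* tokens are optional with recent drivers     */
    pin.va_space = 0;
    if (ioctl(fd, GDRDRV_IOC_PIN_BUFFER, &pin) == 0)
        printf("pinned, handle=0x%llx\n", (unsigned long long)pin.handle);

    close(fd);
    return 0;
}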
@ -0,0 +1,138 @@
/*
 * Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * Warning: this kernel module is only needed at compile time.
 *
 * Long story is that this module is here only to produce the correct
 * module versions related to the very kernel where the other module (the
 * interesting one) is going to be compiled. In other words, this module
 * produces the same symbol versions as the real NVIDIA kernel-mode driver.
 *
 * Downside: the function signatures must be kept up-to-date.
 */

#include <linux/version.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/delay.h>
#include <linux/compiler.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/io.h>

#include "nv-p2p.h"

MODULE_AUTHOR("drossetti@nvidia.com");
MODULE_LICENSE("MIT");
MODULE_DESCRIPTION("P2P dummy kernel-mode driver");
MODULE_VERSION("1.0");

int nvidia_p2p_init_mapping(uint64_t p2p_token,
                            struct nvidia_p2p_params *params,
                            void (*destroy_callback)(void *data),
                            void *data)
{
    return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_init_mapping);

int nvidia_p2p_destroy_mapping(uint64_t p2p_token)
{
    return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_destroy_mapping);

int nvidia_p2p_get_pages(uint64_t p2p_token, uint32_t va_space,
                         uint64_t virtual_address,
                         uint64_t length,
                         struct nvidia_p2p_page_table **page_table,
                         void (*free_callback)(void *data),
                         void *data)
{
    return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_get_pages);

int nvidia_p2p_put_pages(uint64_t p2p_token, uint32_t va_space,
                         uint64_t virtual_address,
                         struct nvidia_p2p_page_table *page_table)
{
    return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_put_pages);

int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table)
{
    return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_free_page_table);

#ifdef NVIDIA_P2P_CAP_PERSISTENT_PAGES
int nvidia_p2p_cap_persistent_pages;
EXPORT_SYMBOL(nvidia_p2p_cap_persistent_pages);
#endif

#ifdef NVIDIA_P2P_CAP_GET_PAGES_PERSISTENT_API
int nvidia_p2p_get_pages_persistent(uint64_t virtual_address,
                                    uint64_t length,
                                    struct nvidia_p2p_page_table **page_table,
                                    uint32_t flags)
{
    return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);

int nvidia_p2p_put_pages_persistent(uint64_t virtual_address,
                                    struct nvidia_p2p_page_table *page_table,
                                    uint32_t flags)
{
    return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_put_pages_persistent);
#endif

static int __init nv_p2p_dummy_init(void)
{
    return 0;
}

static void __exit nv_p2p_dummy_cleanup(void)
{
}

module_init(nv_p2p_dummy_init);
module_exit(nv_p2p_dummy_cleanup);

/*
 * Local variables:
 * c-indent-level: 4
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */
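For illustration only (not part of this commit), a hypothetical fragment of a consumer kernel module built against the symbol versions the dummy above produces. It only shows the calling convention of nvidia_p2p_get_pages/put_pages as declared in that file; the module name, the placeholder GPU virtual address, and the pin length are made up here.

/* nv_p2p_consumer_sketch.c - hypothetical consumer fragment. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/types.h>

#include "nv-p2p.h"

static struct nvidia_p2p_page_table *example_pt;
static uint64_t example_gpu_va;

/* The real driver invokes this if the pinned pages are torn down behind us. */
static void example_free_callback(void *data)
{
    nvidia_p2p_free_page_table(example_pt);
    example_pt = NULL;
}

static int example_pin(uint64_t gpu_va, uint64_t len)
{
    int rc;

    /* p2p_token and va_space of 0: tokens are optional with recent drivers. */
    rc = nvidia_p2p_get_pages(0, 0, gpu_va, len, &example_pt,
                              example_free_callback, NULL);
    if (rc)
        pr_err("nvidia_p2p_get_pages failed: %d\n", rc);
    else
        example_gpu_va = gpu_va;
    return rc;
}

static int __init example_init(void)
{
    /* Placeholder address/length; a real consumer receives these from user space. */
    example_pin(0, 0x10000);
    return 0;
}

static void __exit example_exit(void)
{
    if (example_pt)
        nvidia_p2p_put_pages(0, 0, example_gpu_va, example_pt);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("MIT");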
@ -0,0 +1,207 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#ifndef min
|
||||||
|
#define min(A,B) ((A)<(B)?(A):(B))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
#ifdef __AVX__
|
||||||
|
char *d = (char*)dest;
|
||||||
|
uintptr_t d_int = (uintptr_t)d;
|
||||||
|
const char *s = (const char *)src;
|
||||||
|
uintptr_t s_int = (uintptr_t)s;
|
||||||
|
size_t n = n_bytes;
|
||||||
|
|
||||||
|
// align dest to 256-bits
|
||||||
|
if (d_int & 0x1f) {
|
||||||
|
size_t nh = min(0x20 - (d_int & 0x1f), n);
|
||||||
|
memcpy(d, s, nh);
|
||||||
|
d += nh; d_int += nh;
|
||||||
|
s += nh; s_int += nh;
|
||||||
|
n -= nh;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s_int & 0x1f) { // src is not aligned to 256-bits
|
||||||
|
__m256d r0,r1,r2,r3;
|
||||||
|
// unroll 4
|
||||||
|
while (n >= 4*sizeof(__m256d)) {
|
||||||
|
r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
|
||||||
|
r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
|
||||||
|
r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
|
||||||
|
r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
|
||||||
|
_mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
|
||||||
|
_mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
|
||||||
|
_mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
|
||||||
|
_mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
|
||||||
|
s += 4*sizeof(__m256d);
|
||||||
|
d += 4*sizeof(__m256d);
|
||||||
|
n -= 4*sizeof(__m256d);
|
||||||
|
}
|
||||||
|
while (n >= sizeof(__m256d)) {
|
||||||
|
r0 = _mm256_loadu_pd((double *)(s));
|
||||||
|
_mm256_stream_pd((double *)(d), r0);
|
||||||
|
s += sizeof(__m256d);
|
||||||
|
d += sizeof(__m256d);
|
||||||
|
n -= sizeof(__m256d);
|
||||||
|
}
|
||||||
|
} else { // or it IS aligned
|
||||||
|
__m256d r0,r1,r2,r3,r4,r5,r6,r7;
|
||||||
|
// unroll 8
|
||||||
|
while (n >= 8*sizeof(__m256d)) {
|
||||||
|
r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
|
||||||
|
r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
|
||||||
|
r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
|
||||||
|
r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
|
||||||
|
r4 = _mm256_load_pd((double *)(s+4*sizeof(__m256d)));
|
||||||
|
r5 = _mm256_load_pd((double *)(s+5*sizeof(__m256d)));
|
||||||
|
r6 = _mm256_load_pd((double *)(s+6*sizeof(__m256d)));
|
||||||
|
r7 = _mm256_load_pd((double *)(s+7*sizeof(__m256d)));
|
||||||
|
_mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
|
||||||
|
_mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
|
||||||
|
_mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
|
||||||
|
_mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
|
||||||
|
_mm256_stream_pd((double *)(d+4*sizeof(__m256d)), r4);
|
||||||
|
_mm256_stream_pd((double *)(d+5*sizeof(__m256d)), r5);
|
||||||
|
_mm256_stream_pd((double *)(d+6*sizeof(__m256d)), r6);
|
||||||
|
_mm256_stream_pd((double *)(d+7*sizeof(__m256d)), r7);
|
||||||
|
s += 8*sizeof(__m256d);
|
||||||
|
d += 8*sizeof(__m256d);
|
||||||
|
n -= 8*sizeof(__m256d);
|
||||||
|
}
|
||||||
|
while (n >= sizeof(__m256d)) {
|
||||||
|
r0 = _mm256_load_pd((double *)(s));
|
||||||
|
_mm256_stream_pd((double *)(d), r0);
|
||||||
|
s += sizeof(__m256d);
|
||||||
|
d += sizeof(__m256d);
|
||||||
|
n -= sizeof(__m256d);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n)
|
||||||
|
memcpy(d, s, n);
|
||||||
|
|
||||||
|
// fencing is needed even for plain memcpy(), due to performance
|
||||||
|
// being hit by delayed flushing of WC buffers
|
||||||
|
_mm_sfence();
|
||||||
|
|
||||||
|
#else
|
||||||
|
#error "this file should be compiled with -mavx"
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
#ifdef __AVX__
|
||||||
|
char *d = (char*)dest;
|
||||||
|
uintptr_t d_int = (uintptr_t)d;
|
||||||
|
const char *s = (const char *)src;
|
||||||
|
uintptr_t s_int = (uintptr_t)s;
|
||||||
|
size_t n = n_bytes;
|
||||||
|
|
||||||
|
// align dest to 256-bits
|
||||||
|
if (d_int & 0x1f) {
|
||||||
|
size_t nh = min(0x20 - (d_int & 0x1f), n);
|
||||||
|
memcpy(d, s, nh);
|
||||||
|
d += nh; d_int += nh;
|
||||||
|
s += nh; s_int += nh;
|
||||||
|
n -= nh;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s_int & 0x1f) { // src is not aligned to 256-bits
|
||||||
|
__m256d r0,r1,r2,r3;
|
||||||
|
// unroll 4
|
||||||
|
while (n >= 4*sizeof(__m256d)) {
|
||||||
|
r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
|
||||||
|
r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
|
||||||
|
r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
|
||||||
|
r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
|
||||||
|
_mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
|
||||||
|
_mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
|
||||||
|
_mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
|
||||||
|
_mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
|
||||||
|
s += 4*sizeof(__m256d);
|
||||||
|
d += 4*sizeof(__m256d);
|
||||||
|
n -= 4*sizeof(__m256d);
|
||||||
|
}
|
||||||
|
while (n >= sizeof(__m256d)) {
|
||||||
|
r0 = _mm256_loadu_pd((double *)(s));
|
||||||
|
_mm256_store_pd((double *)(d), r0);
|
||||||
|
s += sizeof(__m256d);
|
||||||
|
d += sizeof(__m256d);
|
||||||
|
n -= sizeof(__m256d);
|
||||||
|
}
|
||||||
|
} else { // or it IS aligned
|
||||||
|
__m256d r0,r1,r2,r3;
|
||||||
|
// unroll 4
|
||||||
|
while (n >= 4*sizeof(__m256d)) {
|
||||||
|
r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
|
||||||
|
r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
|
||||||
|
r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
|
||||||
|
r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
|
||||||
|
_mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
|
||||||
|
_mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
|
||||||
|
_mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
|
||||||
|
_mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
|
||||||
|
s += 4*sizeof(__m256d);
|
||||||
|
d += 4*sizeof(__m256d);
|
||||||
|
n -= 4*sizeof(__m256d);
|
||||||
|
}
|
||||||
|
while (n >= sizeof(__m256d)) {
|
||||||
|
r0 = _mm256_load_pd((double *)(s));
|
||||||
|
_mm256_store_pd((double *)(d), r0);
|
||||||
|
s += sizeof(__m256d);
|
||||||
|
d += sizeof(__m256d);
|
||||||
|
n -= sizeof(__m256d);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (n)
|
||||||
|
memcpy(d, s, n);
|
||||||
|
|
||||||
|
// fencing is needed because of the use of non-temporal stores
|
||||||
|
_mm_sfence();
|
||||||
|
|
||||||
|
#else
|
||||||
|
#error "this file should be compiled with -mavx"
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// add variant for _mm256_stream_load_si256() / VMOVNTDQA
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local variables:
|
||||||
|
* c-indent-level: 4
|
||||||
|
* c-basic-offset: 4
|
||||||
|
* tab-width: 4
|
||||||
|
* indent-tabs-mode: nil
|
||||||
|
* End:
|
||||||
|
*/
|
||||||
|
|
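A small, hypothetical caller for the AVX routines above, included here only as a sketch and not part of this commit. Plain host memory stands in for the write-combined GPU BAR mapping gdrcopy would normally target, and the prototype is restated because no header for the routine appears in this diff. Build it together with the file above using -mavx, matching the #error guard.

/* avx_copy_example.c - hypothetical caller. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Restated prototype; the real declaration lives elsewhere in the tree. */
int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes);

int main(void)
{
    size_t n = 1 << 12;

    /* 32-byte alignment on both sides lets the routine take its 8-way
     * unrolled aligned path; unaligned buffers fall back to the 4-way loop. */
    unsigned char *dst = aligned_alloc(32, n);
    unsigned char *src = aligned_alloc(32, n);
    if (!dst || !src)
        return 1;

    for (size_t i = 0; i < n; ++i)
        src[i] = (unsigned char)i;

    memcpy_uncached_store_avx(dst, src, n);   /* streaming stores + trailing sfence */

    printf("copies match: %s\n", memcmp(dst, src, n) == 0 ? "yes" : "no");
    free(dst);
    free(src);
    return 0;
}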
@ -0,0 +1,198 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#ifndef min
|
||||||
|
#define min(A,B) ((A)<(B)?(A):(B))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
#ifdef __SSE__
|
||||||
|
char *d = (char*)dest;
|
||||||
|
uintptr_t d_int = (uintptr_t)d;
|
||||||
|
const char *s = (const char *)src;
|
||||||
|
uintptr_t s_int = (uintptr_t)s;
|
||||||
|
size_t n = n_bytes;
|
||||||
|
|
||||||
|
// align dest to 128-bits
|
||||||
|
if (d_int & 0xf) {
|
||||||
|
size_t nh = min(0x10 - (d_int & 0x0f), n);
|
||||||
|
memcpy(d, s, nh);
|
||||||
|
d += nh; d_int += nh;
|
||||||
|
s += nh; s_int += nh;
|
||||||
|
n -= nh;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s_int & 0xf) { // src is not aligned to 128-bits
|
||||||
|
__m128 r0,r1,r2,r3;
|
||||||
|
// unroll 4
|
||||||
|
while (n >= 4*4*sizeof(float)) {
|
||||||
|
r0 = _mm_loadu_ps((float *)(s+0*4*sizeof(float)));
|
||||||
|
r1 = _mm_loadu_ps((float *)(s+1*4*sizeof(float)));
|
||||||
|
r2 = _mm_loadu_ps((float *)(s+2*4*sizeof(float)));
|
||||||
|
r3 = _mm_loadu_ps((float *)(s+3*4*sizeof(float)));
|
||||||
|
_mm_stream_ps((float *)(d+0*4*sizeof(float)), r0);
|
||||||
|
_mm_stream_ps((float *)(d+1*4*sizeof(float)), r1);
|
||||||
|
_mm_stream_ps((float *)(d+2*4*sizeof(float)), r2);
|
||||||
|
_mm_stream_ps((float *)(d+3*4*sizeof(float)), r3);
|
||||||
|
s += 4*4*sizeof(float);
|
||||||
|
d += 4*4*sizeof(float);
|
||||||
|
n -= 4*4*sizeof(float);
|
||||||
|
}
|
||||||
|
while (n >= 4*sizeof(float)) {
|
||||||
|
r0 = _mm_loadu_ps((float *)(s));
|
||||||
|
_mm_stream_ps((float *)(d), r0);
|
||||||
|
s += 4*sizeof(float);
|
||||||
|
d += 4*sizeof(float);
|
||||||
|
n -= 4*sizeof(float);
|
||||||
|
}
|
||||||
|
} else { // or it IS aligned
|
||||||
|
__m128 r0,r1,r2,r3;
|
||||||
|
// unroll 4
|
||||||
|
while (n >= 4*4*sizeof(float)) {
|
||||||
|
r0 = _mm_load_ps((float *)(s+0*4*sizeof(float)));
|
||||||
|
r1 = _mm_load_ps((float *)(s+1*4*sizeof(float)));
|
||||||
|
r2 = _mm_load_ps((float *)(s+2*4*sizeof(float)));
|
||||||
|
r3 = _mm_load_ps((float *)(s+3*4*sizeof(float)));
|
||||||
|
_mm_stream_ps((float *)(d+0*4*sizeof(float)), r0);
|
||||||
|
_mm_stream_ps((float *)(d+1*4*sizeof(float)), r1);
|
||||||
|
_mm_stream_ps((float *)(d+2*4*sizeof(float)), r2);
|
||||||
|
_mm_stream_ps((float *)(d+3*4*sizeof(float)), r3);
|
||||||
|
s += 4*4*sizeof(float);
|
||||||
|
d += 4*4*sizeof(float);
|
||||||
|
n -= 4*4*sizeof(float);
|
||||||
|
}
|
||||||
|
while (n >= 4*sizeof(float)) {
|
||||||
|
r0 = _mm_load_ps((float *)(s));
|
||||||
|
_mm_stream_ps((float *)(d), r0);
|
||||||
|
s += 4*sizeof(float);
|
||||||
|
d += 4*sizeof(float);
|
||||||
|
n -= 4*sizeof(float);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n)
|
||||||
|
memcpy(d, s, n);
|
||||||
|
|
||||||
|
// fencing is needed even for plain memcpy(), due to performance
|
||||||
|
// being hit by delayed flushing of WC buffers
|
||||||
|
_mm_sfence();
|
||||||
|
#else
|
||||||
|
#error "this file should be compiled with -msse"
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
#ifdef __SSE__
|
||||||
|
char *d = (char*)dest;
|
||||||
|
uintptr_t d_int = (uintptr_t)d;
|
||||||
|
const char *s = (const char *)src;
|
||||||
|
uintptr_t s_int = (uintptr_t)s;
|
||||||
|
size_t n = n_bytes;
|
||||||
|
|
||||||
|
// align dest to 128-bits
|
||||||
|
if (d_int & 0xf) {
|
||||||
|
size_t nh = min(0x10 - (d_int & 0x0f), n);
|
||||||
|
memcpy(d, s, nh);
|
||||||
|
d += nh; d_int += nh;
|
||||||
|
s += nh; s_int += nh;
|
||||||
|
n -= nh;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s_int & 0xf) { // src is not aligned to 128-bits
|
||||||
|
__m128 r0,r1,r2,r3;
|
||||||
|
// unroll 4
|
||||||
|
while (n >= 4*4*sizeof(float)) {
|
||||||
|
r0 = _mm_loadu_ps((float *)(s+0*4*sizeof(float)));
|
||||||
|
r1 = _mm_loadu_ps((float *)(s+1*4*sizeof(float)));
|
||||||
|
r2 = _mm_loadu_ps((float *)(s+2*4*sizeof(float)));
|
||||||
|
r3 = _mm_loadu_ps((float *)(s+3*4*sizeof(float)));
|
||||||
|
_mm_store_ps((float *)(d+0*4*sizeof(float)), r0);
|
||||||
|
_mm_store_ps((float *)(d+1*4*sizeof(float)), r1);
|
||||||
|
_mm_store_ps((float *)(d+2*4*sizeof(float)), r2);
|
||||||
|
_mm_store_ps((float *)(d+3*4*sizeof(float)), r3);
|
||||||
|
s += 4*4*sizeof(float);
|
||||||
|
d += 4*4*sizeof(float);
|
||||||
|
n -= 4*4*sizeof(float);
|
||||||
|
}
|
||||||
|
while (n >= 4*sizeof(float)) {
|
||||||
|
r0 = _mm_loadu_ps((float *)(s));
|
||||||
|
_mm_store_ps((float *)(d), r0);
|
||||||
|
s += 4*sizeof(float);
|
||||||
|
d += 4*sizeof(float);
|
||||||
|
n -= 4*sizeof(float);
|
||||||
|
}
|
||||||
|
} else { // or it IS aligned
|
||||||
|
__m128 r0,r1,r2,r3;
|
||||||
|
// unroll 4
|
||||||
|
while (n >= 4*4*sizeof(float)) {
|
||||||
|
r0 = _mm_load_ps((float *)(s+0*4*sizeof(float)));
|
||||||
|
r1 = _mm_load_ps((float *)(s+1*4*sizeof(float)));
|
||||||
|
r2 = _mm_load_ps((float *)(s+2*4*sizeof(float)));
|
||||||
|
r3 = _mm_load_ps((float *)(s+3*4*sizeof(float)));
|
||||||
|
_mm_store_ps((float *)(d+0*4*sizeof(float)), r0);
|
||||||
|
_mm_store_ps((float *)(d+1*4*sizeof(float)), r1);
|
||||||
|
_mm_store_ps((float *)(d+2*4*sizeof(float)), r2);
|
||||||
|
_mm_store_ps((float *)(d+3*4*sizeof(float)), r3);
|
||||||
|
s += 4*4*sizeof(float);
|
||||||
|
d += 4*4*sizeof(float);
|
||||||
|
n -= 4*4*sizeof(float);
|
||||||
|
}
|
||||||
|
while (n >= 4*sizeof(float)) {
|
||||||
|
r0 = _mm_load_ps((float *)(s));
|
||||||
|
_mm_store_ps((float *)(d), r0);
|
||||||
|
s += 4*sizeof(float);
|
||||||
|
d += 4*sizeof(float);
|
||||||
|
n -= 4*sizeof(float);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n)
|
||||||
|
memcpy(d, s, n);
|
||||||
|
|
||||||
|
// fencing because of NT stores
|
||||||
|
// potential optimization: issue only when NT stores are actually emitted
|
||||||
|
_mm_sfence();
|
||||||
|
|
||||||
|
#else
|
||||||
|
#error "this file should be compiled with -msse"
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local variables:
|
||||||
|
* c-indent-level: 4
|
||||||
|
* c-basic-offset: 4
|
||||||
|
* tab-width: 4
|
||||||
|
* indent-tabs-mode: nil
|
||||||
|
* End:
|
||||||
|
*/
|
||||||
|
|
@ -0,0 +1,141 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#ifndef min
|
||||||
|
#define min(A,B) ((A)<(B)?(A):(B))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// implementation of copy from BAR using MOVNTDQA
|
||||||
|
// suggested by Nicholas Wilt <nwilt@amazon.com>
|
||||||
|
|
||||||
|
// src is WC MMIO of GPU BAR
|
||||||
|
// dest is host memory
|
||||||
|
int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
#ifdef __SSE4_1__
|
||||||
|
char *d = (char*)dest;
|
||||||
|
uintptr_t d_int = (uintptr_t)d;
|
||||||
|
const char *s = (const char *)src;
|
||||||
|
uintptr_t s_int = (uintptr_t)s;
|
||||||
|
size_t n = n_bytes;
|
||||||
|
|
||||||
|
// align src to 128-bits
|
||||||
|
if (s_int & 0xf) {
|
||||||
|
size_t nh = min(0x10 - (s_int & 0x0f), n);
|
||||||
|
memcpy(d, s, nh);
|
||||||
|
d += nh; d_int += nh;
|
||||||
|
s += nh; s_int += nh;
|
||||||
|
n -= nh;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (d_int & 0xf) { // dest is not aligned to 128-bits
|
||||||
|
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
|
||||||
|
// unroll 8
|
||||||
|
while (n >= 8*sizeof(__m128i)) {
|
||||||
|
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
|
||||||
|
r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
|
||||||
|
r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
|
||||||
|
r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
|
||||||
|
r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
|
||||||
|
r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
|
||||||
|
r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
|
||||||
|
r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
|
||||||
|
_mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
|
||||||
|
_mm_storeu_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
|
||||||
|
_mm_storeu_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
|
||||||
|
_mm_storeu_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
|
||||||
|
_mm_storeu_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
|
||||||
|
_mm_storeu_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
|
||||||
|
_mm_storeu_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
|
||||||
|
_mm_storeu_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
|
||||||
|
s += 8*sizeof(__m128i);
|
||||||
|
d += 8*sizeof(__m128i);
|
||||||
|
n -= 8*sizeof(__m128i);
|
||||||
|
}
|
||||||
|
while (n >= sizeof(__m128i)) {
|
||||||
|
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
|
||||||
|
_mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
|
||||||
|
s += sizeof(__m128i);
|
||||||
|
d += sizeof(__m128i);
|
||||||
|
n -= sizeof(__m128i);
|
||||||
|
}
|
||||||
|
} else { // or it IS aligned
|
||||||
|
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
|
||||||
|
// unroll 8
|
||||||
|
while (n >= 8*sizeof(__m128i)) {
|
||||||
|
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
|
||||||
|
r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
|
||||||
|
r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
|
||||||
|
r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
|
||||||
|
r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
|
||||||
|
r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
|
||||||
|
r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
|
||||||
|
r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
|
||||||
|
_mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
|
||||||
|
_mm_stream_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
|
||||||
|
_mm_stream_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
|
||||||
|
_mm_stream_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
|
||||||
|
_mm_stream_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
|
||||||
|
_mm_stream_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
|
||||||
|
_mm_stream_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
|
||||||
|
_mm_stream_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
|
||||||
|
s += 8*sizeof(__m128i);
|
||||||
|
d += 8*sizeof(__m128i);
|
||||||
|
n -= 8*sizeof(__m128i);
|
||||||
|
}
|
||||||
|
while (n >= sizeof(__m128i)) {
|
||||||
|
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
|
||||||
|
_mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
|
||||||
|
s += sizeof(__m128i);
|
||||||
|
d += sizeof(__m128i);
|
||||||
|
n -= sizeof(__m128i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n)
|
||||||
|
memcpy(d, s, n);
|
||||||
|
|
||||||
|
// fencing because of NT stores
|
||||||
|
// potential optimization: issue only when NT stores are actually emitted
|
||||||
|
_mm_sfence();
|
||||||
|
|
||||||
|
#else
|
||||||
|
#error "this file should be compiled with -msse4.1"
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local variables:
|
||||||
|
* c-indent-level: 4
|
||||||
|
* c-basic-offset: 4
|
||||||
|
* tab-width: 4
|
||||||
|
* indent-tabs-mode: nil
|
||||||
|
* End:
|
||||||
|
*/
|
||||||
|
|
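One plausible way a caller might pair the two uncached copy directions, shown as a sketch that is not part of this commit: streaming stores for host-to-BAR writes and the MOVNTDQA-based loads above for BAR-to-host reads. The bar_ptr argument is assumed to come from something like gdr_map(); the wrapper names are hypothetical and the prototypes are restated because no header for these helpers appears in this diff. Build with -mavx -msse4.1.

/* bar_copy_dispatch_sketch.c - hypothetical wrappers. */
#include <stddef.h>

int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes);
int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes);

/* Host -> BAR: non-temporal stores avoid polluting the cache with MMIO data. */
static inline void bar_write(void *bar_ptr, const void *host_src, size_t n)
{
    memcpy_uncached_store_avx(bar_ptr, host_src, n);
}

/* BAR -> host: MOVNTDQA pulls whole lines through the streaming-load buffers
 * instead of issuing slow scalar reads against WC memory. */
static inline void bar_read(void *host_dst, const void *bar_ptr, size_t n)
{
    memcpy_uncached_load_sse41(host_dst, bar_ptr, n);
}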
@ -0,0 +1,69 @@
DESTBIN ?=
CUDA ?= /usr/local/cuda
NVCC ?= $(CUDA)/bin/nvcc

GDRAPI_INC := ../include
GDRAPI_SRC := ../src

CUDA_LIB := -L $(CUDA)/lib64 -L $(CUDA)/lib -L /usr/lib64/nvidia -L /usr/lib/nvidia -L $(CUDA)/lib64/stubs
CUDA_INC += -I $(CUDA)/include

CPPFLAGS := $(CUDA_INC) -I $(GDRAPI_INC) -I $(GDRAPI_SRC) -I $(CUDA)/include
LDFLAGS := $(CUDA_LIB) -L $(CUDA)/lib64 -L $(GDRAPI_SRC)
COMMONCFLAGS := -O2
CFLAGS += $(COMMONCFLAGS)
CXXFLAGS += $(COMMONCFLAGS)
NVCCFLAGS ?=
LIBS := -lcuda -lpthread -ldl -lgdrapi

CPP_SRCS := copybw.cpp sanity.cpp copylat.cpp apiperf.cpp
CU_SRCS := pplat.cu
EXES := $(patsubst %.cpp,gdrcopy_%,$(CPP_SRCS)) $(patsubst %.cu,gdrcopy_%,$(CU_SRCS))

all: exes

exes: $(EXES)

testsuites/testsuite.o: testsuites/testsuite.cpp testsuites/testsuite.hpp common.hpp
common.o: common.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
copybw.o: copybw.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
sanity.o: sanity.cpp $(GDRAPI_INC)/gdrapi.h $(GDRAPI_SRC)/gdrapi_internal.h common.hpp testsuites/testsuite.hpp
copylat.o: copylat.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
apiperf.o: apiperf.cpp $(GDRAPI_INC)/gdrapi.h common.hpp

gdrcopy_copybw: copybw.o common.o
	$(LINK.cc) -o $@ $^ $(LIBS) -lrt

gdrcopy_sanity: sanity.o common.o testsuites/testsuite.o
	$(LINK.cc) -o $@ $^ $(LIBS)

gdrcopy_copylat: copylat.o common.o
	$(LINK.cc) -o $@ $^ $(LIBS) -lrt

gdrcopy_apiperf: apiperf.o common.o
	$(LINK.cc) -o $@ $^ $(LIBS) -lrt

gdrcopy_pplat: pplat.o common.o
	$(NVCC) -o $@ $^ $(LDFLAGS) -lgdrapi -lcuda

%.o: %.cu
	$(NVCC) -o $@ -c $^ $(LIBS) $(CPPFLAGS) $(NVCCFLAGS)

clean:
	rm -f *.o $(EXES) *~ core.* testsuites/*.o

install: exes
	@ echo "installing exes in $(DESTBIN)..." && \
	mkdir -p $(DESTBIN) && \
	install -D -v -m u=rwx,g=rx,o=rx gdrcopy_copybw -t $(DESTBIN) && \
	install -D -v -m u=rwx,g=rx,o=rx gdrcopy_copylat -t $(DESTBIN) && \
	install -D -v -m u=rwx,g=rx,o=rx gdrcopy_apiperf -t $(DESTBIN) && \
	install -D -v -m u=rwx,g=rx,o=rx gdrcopy_sanity -t $(DESTBIN) && \
	install -D -v -m u=rwx,g=rx,o=rx gdrcopy_pplat -t $(DESTBIN)
	cd $(DESTBIN) && \
	ln -sf gdrcopy_copybw copybw && \
	ln -sf gdrcopy_copylat copylat && \
	ln -sf gdrcopy_apiperf apiperf && \
	ln -sf gdrcopy_sanity sanity

.PHONY: clean all exes install
@ -0,0 +1,287 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <getopt.h>
|
||||||
|
#include <memory.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
#include "gdrapi.h"
|
||||||
|
#include "common.hpp"
|
||||||
|
|
||||||
|
using namespace gdrcopy::test;
|
||||||
|
|
||||||
|
// manually tuned...
|
||||||
|
int num_iters = 100;
|
||||||
|
int num_bins = 10;
|
||||||
|
int num_warmup_iters = 10;
|
||||||
|
size_t _size = (size_t)1 << 24;
|
||||||
|
int dev_id = 0;
|
||||||
|
|
||||||
|
void print_usage(const char *path)
|
||||||
|
{
|
||||||
|
cout << "Usage: " << path << " [-h][-s <max-size>][-d <gpu>][-n <iters>][-w <iters>][-a <fn>]" << endl;
|
||||||
|
cout << endl;
|
||||||
|
cout << "Options:" << endl;
|
||||||
|
cout << " -h Print this help text" << endl;
|
||||||
|
cout << " -s <max-size> Max buffer size to benchmark (default: " << _size << ")" << endl;
|
||||||
|
cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
|
||||||
|
cout << " -n <iters> Number of benchmark iterations (default: " << num_iters << ")" << endl;
|
||||||
|
cout << " -w <iters> Number of warm-up iterations (default: " << num_warmup_iters << ")" << endl;
|
||||||
|
cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
|
||||||
|
cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void run_test(CUdeviceptr d_A, size_t size)
|
||||||
|
{
|
||||||
|
// minimum pinning size is a GPU page size
|
||||||
|
size_t pin_request_size = GPU_PAGE_SIZE;
|
||||||
|
struct timespec beg, end;
|
||||||
|
double pin_lat_us;
|
||||||
|
double map_lat_us;
|
||||||
|
double unpin_lat_us;
|
||||||
|
double unmap_lat_us;
|
||||||
|
double inf_lat_us;
|
||||||
|
double delta_lat_us;
|
||||||
|
double *lat_arr;
|
||||||
|
int *bin_arr;
|
||||||
|
|
||||||
|
gdr_t g = gdr_open();
|
||||||
|
ASSERT_NEQ(g, (void*)0);
|
||||||
|
|
||||||
|
gdr_mh_t mh;
|
||||||
|
BEGIN_CHECK {
|
||||||
|
// tokens are optional in CUDA 6.0
|
||||||
|
// waive the test if GPUDirect RDMA is not enabled
|
||||||
|
|
||||||
|
lat_arr = (double *)malloc(sizeof(double) * num_iters);
|
||||||
|
bin_arr = (int *)malloc(sizeof(int) * num_bins);
|
||||||
|
|
||||||
|
while (pin_request_size <= size) {
|
||||||
|
int iter = 0;
|
||||||
|
size_t actual_pin_size;
|
||||||
|
double min_lat, max_lat;
|
||||||
|
min_lat = -1;
|
||||||
|
max_lat = -1;
|
||||||
|
pin_lat_us = 0;
|
||||||
|
map_lat_us = 0;
|
||||||
|
unpin_lat_us = 0;
|
||||||
|
unmap_lat_us = 0;
|
||||||
|
inf_lat_us = 0;
|
||||||
|
actual_pin_size = PAGE_ROUND_UP(pin_request_size, GPU_PAGE_SIZE);
|
||||||
|
|
||||||
|
for (iter = 0; iter < num_warmup_iters; ++iter) {
|
||||||
|
|
||||||
|
BREAK_IF_NEQ(gdr_pin_buffer(g, d_A, actual_pin_size, 0, 0, &mh), 0);
|
||||||
|
ASSERT_NEQ(mh, null_mh);
|
||||||
|
|
||||||
|
void *map_d_ptr = NULL;
|
||||||
|
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, actual_pin_size), 0);
|
||||||
|
|
||||||
|
gdr_info_t info;
|
||||||
|
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
|
||||||
|
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, actual_pin_size), 0);
|
||||||
|
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (iter = 0; iter < num_iters; ++iter) {
|
||||||
|
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
BREAK_IF_NEQ(gdr_pin_buffer(g, d_A, actual_pin_size, 0, 0, &mh), 0);
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
|
||||||
|
pin_lat_us += delta_lat_us;
|
||||||
|
ASSERT_NEQ(mh, null_mh);
|
||||||
|
lat_arr[iter] = delta_lat_us;
|
||||||
|
min_lat = (min_lat == -1) ? delta_lat_us : ((delta_lat_us < min_lat) ? delta_lat_us : min_lat);
|
||||||
|
max_lat = delta_lat_us > max_lat ? delta_lat_us : max_lat;
|
||||||
|
|
||||||
|
void *map_d_ptr = NULL;
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, actual_pin_size), 0);
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
|
||||||
|
map_lat_us += delta_lat_us;
|
||||||
|
|
||||||
|
gdr_info_t info;
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
|
||||||
|
inf_lat_us += delta_lat_us;
|
||||||
|
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, actual_pin_size), 0);
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
|
||||||
|
unmap_lat_us += delta_lat_us;
|
||||||
|
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
|
||||||
|
unpin_lat_us += delta_lat_us;
|
||||||
|
}
|
||||||
|
|
||||||
|
pin_lat_us /= iter;
|
||||||
|
map_lat_us /= iter;
|
||||||
|
inf_lat_us /= iter;
|
||||||
|
unpin_lat_us /= iter;
|
||||||
|
unmap_lat_us /= iter;
|
||||||
|
|
||||||
|
printf("Size(B)\tpin.Time(us)\tmap.Time(us)\tget_info.Time(us)\tunmap.Time(us)\tunpin.Time(us)\n");
|
||||||
|
printf("%zu\t%f\t%f\t%f\t%f\t%f\n",
|
||||||
|
actual_pin_size, pin_lat_us, map_lat_us, inf_lat_us, unmap_lat_us, unpin_lat_us);
|
||||||
|
pin_request_size <<= 1;
|
||||||
|
|
||||||
|
printf("Histogram of gdr_pin_buffer latency for %ld bytes\n", actual_pin_size);
|
||||||
|
print_histogram(lat_arr, num_iters, bin_arr, num_bins, min_lat, max_lat);
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
free(lat_arr);
|
||||||
|
free(bin_arr);
|
||||||
|
} END_CHECK;
|
||||||
|
|
||||||
|
cout << "closing gdrdrv" << endl;
|
||||||
|
ASSERT_EQ(gdr_close(g), 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
|
||||||
|
gpu_memfree_fn_t gfree_fn = gpu_mem_free;
|
||||||
|
|
||||||
|
while(1) {
|
||||||
|
int c;
|
||||||
|
c = getopt(argc, argv, "s:d:n:w:a:h");
|
||||||
|
if (c == -1)
|
||||||
|
break;
|
||||||
|
|
||||||
|
switch (c) {
|
||||||
|
case 's':
|
||||||
|
_size = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'd':
|
||||||
|
dev_id = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'n':
|
||||||
|
num_iters = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'w':
|
||||||
|
num_warmup_iters = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'a':
|
||||||
|
if (strcmp(optarg, "cuMemAlloc") == 0) {
|
||||||
|
galloc_fn = gpu_mem_alloc;
|
||||||
|
gfree_fn = gpu_mem_free;
|
||||||
|
}
|
||||||
|
else if (strcmp(optarg, "cuMemCreate") == 0) {
|
||||||
|
galloc_fn = gpu_vmm_alloc;
|
||||||
|
gfree_fn = gpu_vmm_free;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
cerr << "Unrecognized fn argument" << endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'h':
|
||||||
|
print_usage(argv[0]);
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
printf("ERROR: invalid option\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
|
||||||
|
|
||||||
|
ASSERTDRV(cuInit(0));
|
||||||
|
|
||||||
|
int n_devices = 0;
|
||||||
|
ASSERTDRV(cuDeviceGetCount(&n_devices));
|
||||||
|
|
||||||
|
CUdevice dev;
|
||||||
|
for (int n=0; n<n_devices; ++n) {
|
||||||
|
|
||||||
|
char dev_name[256];
|
||||||
|
int dev_pci_domain_id;
|
||||||
|
int dev_pci_bus_id;
|
||||||
|
int dev_pci_device_id;
|
||||||
|
|
||||||
|
ASSERTDRV(cuDeviceGet(&dev, n));
|
||||||
|
ASSERTDRV(cuDeviceGetName(dev_name, sizeof(dev_name) / sizeof(dev_name[0]), dev));
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev));
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev));
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev));
|
||||||
|
|
||||||
|
cout << "GPU id:" << n << "; name: " << dev_name
|
||||||
|
<< "; Bus id: "
|
||||||
|
<< std::hex
|
||||||
|
<< std::setfill('0') << std::setw(4) << dev_pci_domain_id
|
||||||
|
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_bus_id
|
||||||
|
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_device_id
|
||||||
|
<< std::dec
|
||||||
|
<< endl;
|
||||||
|
}
|
||||||
|
cout << "selecting device " << dev_id << endl;
|
||||||
|
ASSERTDRV(cuDeviceGet(&dev, dev_id));
|
||||||
|
|
||||||
|
CUcontext dev_ctx;
|
||||||
|
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
|
||||||
|
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
|
||||||
|
|
||||||
|
ASSERT_EQ(check_gdr_support(dev), true);
|
||||||
|
|
||||||
|
CUdeviceptr d_A;
|
||||||
|
gpu_mem_handle_t mhandle;
|
||||||
|
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
|
||||||
|
d_A = mhandle.ptr;
|
||||||
|
cout << "device ptr: 0x" << hex << d_A << dec << endl;
|
||||||
|
cout << "allocated size: " << size << endl;
|
||||||
|
|
||||||
|
run_test(d_A, size);
|
||||||
|
|
||||||
|
ASSERTDRV(gfree_fn(&mhandle));
|
||||||
|
|
||||||
|
ASSERTDRV(cuCtxSetCurrent(NULL));
|
||||||
|
ASSERTDRV(cuDevicePrimaryCtxRelease(dev));
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local variables:
|
||||||
|
* c-indent-level: 4
|
||||||
|
* c-basic-offset: 4
|
||||||
|
* tab-width: 4
|
||||||
|
* indent-tabs-mode: nil
|
||||||
|
* End:
|
||||||
|
*/
|
||||||
|
|
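A minimal sketch, not part of this commit, of the pin/map/copy/unmap/unpin sequence whose individual steps the benchmark above times. Error handling is collapsed to asserts, CUDA context setup is omitted, and the buffer is assumed to be GPU-page aligned as in the test helpers; only gdrapi calls that already appear in the benchmark are used.

/* gdrapi_flow_sketch.c - hypothetical helper. */
#include <assert.h>
#include <string.h>
#include <cuda.h>
#include "gdrapi.h"

void pin_map_copy(CUdeviceptr d_A, size_t size, const void *host_src)
{
    gdr_t g = gdr_open();
    assert(g != NULL);

    gdr_mh_t mh;
    assert(gdr_pin_buffer(g, d_A, size, 0, 0, &mh) == 0);

    void *map_d_ptr = NULL;
    assert(gdr_map(g, mh, &map_d_ptr, size) == 0);

    /* The mapping starts at the GPU page containing d_A; offset into it. */
    gdr_info_t info;
    assert(gdr_get_info(g, mh, &info) == 0);
    void *bar_ptr = (char *)map_d_ptr + (d_A - info.va);

    memcpy(bar_ptr, host_src, size);   /* CPU stores land directly in GPU memory */

    assert(gdr_unmap(g, mh, map_d_ptr, size) == 0);
    assert(gdr_unpin_buffer(g, mh) == 0);
    assert(gdr_close(g) == 0);
}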
@ -0,0 +1,358 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <map>
|
||||||
|
#include <cuda.h>
|
||||||
|
#include "common.hpp"
|
||||||
|
|
||||||
|
namespace gdrcopy {
|
||||||
|
namespace test {
|
||||||
|
bool print_dbg_msg = false;
|
||||||
|
|
||||||
|
void print_dbg(const char* fmt, ...)
|
||||||
|
{
|
||||||
|
if (print_dbg_msg) {
|
||||||
|
va_list ap;
|
||||||
|
va_start(ap, fmt);
|
||||||
|
vfprintf(stderr, fmt, ap);
|
||||||
|
va_end(ap);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
CUresult gpu_mem_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
|
||||||
|
{
|
||||||
|
CUresult ret = CUDA_SUCCESS;
|
||||||
|
CUdeviceptr ptr, out_ptr;
|
||||||
|
size_t allocated_size;
|
||||||
|
|
||||||
|
if (aligned_mapping)
|
||||||
|
allocated_size = size + GPU_PAGE_SIZE - 1;
|
||||||
|
else
|
||||||
|
allocated_size = size;
|
||||||
|
|
||||||
|
ret = cuMemAlloc(&ptr, allocated_size);
|
||||||
|
if (ret != CUDA_SUCCESS)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
if (set_sync_memops) {
|
||||||
|
unsigned int flag = 1;
|
||||||
|
ret = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
cuMemFree(ptr);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (aligned_mapping)
|
||||||
|
out_ptr = PAGE_ROUND_UP(ptr, GPU_PAGE_SIZE);
|
||||||
|
else
|
||||||
|
out_ptr = ptr;
|
||||||
|
|
||||||
|
handle->ptr = out_ptr;
|
||||||
|
handle->unaligned_ptr = ptr;
|
||||||
|
handle->size = size;
|
||||||
|
handle->allocated_size = allocated_size;
|
||||||
|
|
||||||
|
return CUDA_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUresult gpu_mem_free(gpu_mem_handle_t *handle)
|
||||||
|
{
|
||||||
|
CUresult ret = CUDA_SUCCESS;
|
||||||
|
CUdeviceptr ptr;
|
||||||
|
|
||||||
|
ret = cuMemFree(handle->unaligned_ptr);
|
||||||
|
if (ret == CUDA_SUCCESS)
|
||||||
|
memset(handle, 0, sizeof(gpu_mem_handle_t));
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if CUDA_VERSION >= 11000
|
||||||
|
/**
|
||||||
|
* Allocating GPU memory using VMM API.
|
||||||
|
* VMM API is available since CUDA 10.2. However, the RDMA support is added in CUDA 11.0.
|
||||||
|
* Our tests are not useful without RDMA support. So, we enable this VMM allocation from CUDA 11.0.
|
||||||
|
*/
|
||||||
|
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
|
||||||
|
{
|
||||||
|
CUresult ret = CUDA_SUCCESS;
|
||||||
|
|
||||||
|
size_t granularity, gran;
|
||||||
|
CUmemAllocationProp mprop;
|
||||||
|
CUdevice gpu_dev;
|
||||||
|
size_t rounded_size;
|
||||||
|
CUdeviceptr ptr = 0;
|
||||||
|
CUmemGenericAllocationHandle mem_handle = 0;
|
||||||
|
bool is_mapped = false;
|
||||||
|
|
||||||
|
int RDMASupported = 0;
|
||||||
|
|
||||||
|
int version;
|
||||||
|
|
||||||
|
ret = cuDriverGetVersion(&version);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuDriverGetVersion\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (version < 11000) {
|
||||||
|
print_dbg("VMM with RDMA is not supported in this CUDA version.\n");
|
||||||
|
ret = CUDA_ERROR_NOT_SUPPORTED;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = cuCtxGetDevice(&gpu_dev);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuCtxGetDevice\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = cuDeviceGetAttribute(&RDMASupported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, gpu_dev);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuDeviceGetAttribute\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!RDMASupported) {
|
||||||
|
print_dbg("GPUDirect RDMA is not supported on this GPU.\n");
|
||||||
|
ret = CUDA_ERROR_NOT_SUPPORTED;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(&mprop, 0, sizeof(CUmemAllocationProp));
|
||||||
|
mprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||||
|
mprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||||
|
mprop.location.id = gpu_dev;
|
||||||
|
mprop.allocFlags.gpuDirectRDMACapable = 1;
|
||||||
|
|
||||||
|
ret = cuMemGetAllocationGranularity(&gran, &mprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuMemGetAllocationGranularity\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// In case gran is smaller than GPU_PAGE_SIZE
|
||||||
|
granularity = PAGE_ROUND_UP(gran, GPU_PAGE_SIZE);
|
||||||
|
|
||||||
|
rounded_size = PAGE_ROUND_UP(size, granularity);
|
||||||
|
ret = cuMemAddressReserve(&ptr, rounded_size, granularity, 0, 0);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuMemAddressReserve\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = cuMemCreate(&mem_handle, rounded_size, &mprop, 0);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuMemCreate\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = cuMemMap(ptr, rounded_size, 0, mem_handle, 0);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuMemMap\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
is_mapped = true;
|
||||||
|
|
||||||
|
CUmemAccessDesc access;
|
||||||
|
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||||
|
access.location.id = gpu_dev;
|
||||||
|
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||||
|
|
||||||
|
ret = cuMemSetAccess(ptr, rounded_size, &access, 1);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuMemSetAccess\n");
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// cuMemAddressReserve always returns aligned ptr
|
||||||
|
handle->ptr = ptr;
|
||||||
|
handle->handle = mem_handle;
|
||||||
|
handle->size = size;
|
||||||
|
handle->allocated_size = rounded_size;
|
||||||
|
|
||||||
|
out:
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
if (is_mapped)
|
||||||
|
cuMemUnmap(ptr, rounded_size);
|
||||||
|
|
||||||
|
if (mem_handle)
|
||||||
|
cuMemRelease(mem_handle);
|
||||||
|
|
||||||
|
if (ptr)
|
||||||
|
cuMemAddressFree(ptr, rounded_size);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUresult gpu_vmm_free(gpu_mem_handle_t *handle)
|
||||||
|
{
|
||||||
|
CUresult ret;
|
||||||
|
|
||||||
|
if (!handle || !handle->ptr)
|
||||||
|
return CUDA_ERROR_INVALID_VALUE;
|
||||||
|
|
||||||
|
ret = cuMemUnmap(handle->ptr, handle->allocated_size);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuMemUnmap\n");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = cuMemRelease(handle->handle);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuMemRelease\n");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = cuMemAddressFree(handle->ptr, handle->allocated_size);
|
||||||
|
if (ret != CUDA_SUCCESS) {
|
||||||
|
print_dbg("error in cuMemAddressFree\n");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(handle, 0, sizeof(gpu_mem_handle_t));
|
||||||
|
|
||||||
|
return CUDA_SUCCESS;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
/* VMM with RDMA is not available before CUDA 11.0 */
|
||||||
|
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
|
||||||
|
{
|
||||||
|
return CUDA_ERROR_NOT_SUPPORTED;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUresult gpu_vmm_free(gpu_mem_handle_t *handle)
|
||||||
|
{
|
||||||
|
return CUDA_ERROR_NOT_SUPPORTED;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int compare_buf(uint32_t *ref_buf, uint32_t *buf, size_t size)
|
||||||
|
{
|
||||||
|
int diff = 0;
|
||||||
|
if (size % 4 != 0U) {
|
||||||
|
print_dbg("warning: buffer size %zu is not dword aligned, ignoring trailing bytes\n", size);
|
||||||
|
size -= (size % 4);
|
||||||
|
}
|
||||||
|
unsigned ndwords = size/sizeof(uint32_t);
|
||||||
|
for(unsigned w = 0; w < ndwords; ++w) {
|
||||||
|
if (ref_buf[w] != buf[w]) {
|
||||||
|
if (!diff) {
|
||||||
|
printf("%10.10s %8.8s %8.8s\n", "word", "content", "expected");
|
||||||
|
}
|
||||||
|
if (diff < 10) {
|
||||||
|
printf("%10d %08x %08x\n", w, buf[w], ref_buf[w]);
|
||||||
|
}
|
||||||
|
++diff;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (diff) {
|
||||||
|
print_dbg("check error: %d different dwords out of %d\n", diff, ndwords);
|
||||||
|
}
|
||||||
|
return diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
void init_hbuf_walking_bit(uint32_t *h_buf, size_t size)
|
||||||
|
{
|
||||||
|
uint32_t base_value = 0x3F4C5E6A; // 0xa55ad33d;
|
||||||
|
unsigned w;
|
||||||
|
ASSERT_NEQ(h_buf, (void*)0);
|
||||||
|
ASSERT_EQ(size % 4, 0U);
|
||||||
|
//OUT << "filling mem with walking bit " << endl;
|
||||||
|
for(w = 0; w<size/sizeof(uint32_t); ++w)
|
||||||
|
h_buf[w] = base_value ^ (1<< (w%32));
|
||||||
|
}
|
||||||
|
|
||||||
|
void init_hbuf_linear_ramp(uint32_t *h_buf, size_t size)
|
||||||
|
{
|
||||||
|
uint32_t base_value = 0x3F4C5E6A; // 0xa55ad33d;
|
||||||
|
unsigned w;
|
||||||
|
ASSERT_NEQ(h_buf, (void*)0);
|
||||||
|
ASSERT_EQ(size % 4, 0U);
|
||||||
|
//OUT << "filling mem with walking bit " << endl;
|
||||||
|
for(w = 0; w<size/sizeof(uint32_t); ++w)
|
||||||
|
h_buf[w] = w;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool check_gdr_support(CUdevice dev)
|
||||||
|
{
|
||||||
|
#if CUDA_VERSION >= 11030
|
||||||
|
int drv_version;
|
||||||
|
ASSERTDRV(cuDriverGetVersion(&drv_version));
|
||||||
|
|
||||||
|
// Starting from CUDA 11.3, CUDA provides an ability to check GPUDirect RDMA support.
|
||||||
|
if (drv_version >= 11030) {
|
||||||
|
int gdr_support = 0;
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&gdr_support, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev));
|
||||||
|
|
||||||
|
if (!gdr_support)
|
||||||
|
print_dbg("This GPU does not support GPUDirect RDMA.\n");
|
||||||
|
|
||||||
|
return !!gdr_support;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// For older versions, we fall back to detect this support with gdr_pin_buffer.
|
||||||
|
const size_t size = GPU_PAGE_SIZE;
|
||||||
|
CUdeviceptr d_A;
|
||||||
|
gpu_mem_handle_t mhandle;
|
||||||
|
ASSERTDRV(gpu_mem_alloc(&mhandle, size, true, true));
|
||||||
|
d_A = mhandle.ptr;
|
||||||
|
|
||||||
|
gdr_t g = gdr_open_safe();
|
||||||
|
|
||||||
|
gdr_mh_t mh;
|
||||||
|
int status = gdr_pin_buffer(g, d_A, size, 0, 0, &mh);
|
||||||
|
if (status != 0) {
|
||||||
|
print_dbg("error in gdr_pin_buffer with code=%d\n", status);
|
||||||
|
print_dbg("Your GPU might not support GPUDirect RDMA\n");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
|
||||||
|
|
||||||
|
ASSERT_EQ(gdr_close(g), 0);
|
||||||
|
|
||||||
|
ASSERTDRV(gpu_mem_free(&mhandle));
|
||||||
|
|
||||||
|
return status == 0;
|
||||||
|
}
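
The benchmarks below abort when this returns false; a program that prefers to skip rather than fail could instead waive itself, e.g. (sketch, using EXIT_WAIVED from common.hpp):

    if (!check_gdr_support(dev))
        exit(EXIT_WAIVED);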
|
||||||
|
|
||||||
|
void print_histogram(double *lat_arr, int count, int *bin_arr, int num_bins, double min, double max)
{
    // Bin width must be computed in floating point; integer division would
    // truncate sub-microsecond ranges to zero.
    double den = (max - min) / num_bins;
    if (den <= 0)
        den = 1;
    for (int j = 0; j < num_bins; j++)
        bin_arr[j] = 0;
    for (int i = 0; i < count; i++) {
        int b = (int)((lat_arr[i] - min) / den);
        if (b >= num_bins)
            b = num_bins - 1;          // clamp lat == max into the last bin
        bin_arr[b]++;
    }
    for (int j = 0; j < num_bins; j++) {
        printf("[%lf\t-\t%lf]\t%d\n", min + den * j, min + den * (j + 1), bin_arr[j]);
    }
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@@ -0,0 +1,162 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <cuda.h>
|
||||||
|
#include <cstring>
|
||||||
|
#include <map>
|
||||||
|
#include <gdrapi.h>
|
||||||
|
#include <gdrconfig.h>
|
||||||
|
|
||||||
|
#ifndef ACCESS_ONCE
|
||||||
|
#define ACCESS_ONCE(x) (*(volatile typeof((x)) *)&(x))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef READ_ONCE
|
||||||
|
#define READ_ONCE(x) ACCESS_ONCE(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef WRITE_ONCE
|
||||||
|
#define WRITE_ONCE(x, v) (ACCESS_ONCE(x) = (v))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Memory barrier
|
||||||
|
*/
|
||||||
|
#if defined(GDRAPI_X86)
|
||||||
|
#define MB() asm volatile("mfence":::"memory")
|
||||||
|
#define SB() asm volatile("sfence":::"memory")
|
||||||
|
#define LB() asm volatile("lfence":::"memory")
|
||||||
|
#elif defined(GDRAPI_POWER)
|
||||||
|
#define MB() asm volatile("sync":::"memory")
|
||||||
|
#define SB() MB()
|
||||||
|
#define LB() MB()
|
||||||
|
#elif defined(GDRAPI_ARM64)
|
||||||
|
#define MB() asm volatile("dmb sy":::"memory")
|
||||||
|
#define SB() asm volatile("dmb st":::"memory")
|
||||||
|
#define LB() MB()
|
||||||
|
#else
|
||||||
|
#error "Compiling on an unsupported architecture."
|
||||||
|
#endif
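
Together with the *_ONCE accessors above, these fences support the usual CPU-side publish pattern over a write-combining BAR mapping: write the payload, fence, then set a flag the consumer polls. A minimal sketch (the mailbox layout and publish() are hypothetical; uint32_t is assumed to come in via <stdint.h>):

struct mailbox {
    uint32_t payload;
    uint32_t ready;
};

static inline void publish(struct mailbox *m, uint32_t v)
{
    WRITE_ONCE(m->payload, v);   // store the data first
    SB();                        // order the payload store ahead of the flag store
    WRITE_ONCE(m->ready, 1);     // then signal the consumer
}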
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clock used for timing
|
||||||
|
*/
|
||||||
|
//#define MYCLOCK CLOCK_REALTIME
|
||||||
|
//#define MYCLOCK CLOCK_RAW_MONOTONIC
|
||||||
|
#define MYCLOCK CLOCK_MONOTONIC
|
||||||
|
|
||||||
|
#define EXIT_WAIVED 2
|
||||||
|
|
||||||
|
#define ASSERT(x) \
|
||||||
|
do \
|
||||||
|
{ \
|
||||||
|
if (!(x)) \
|
||||||
|
{ \
|
||||||
|
fprintf(stderr, "Assertion \"%s\" failed at %s:%d\n", #x, __FILE__, __LINE__); \
|
||||||
|
exit(EXIT_FAILURE); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define ASSERTDRV(stmt) \
|
||||||
|
do \
|
||||||
|
{ \
|
||||||
|
CUresult result = (stmt); \
|
||||||
|
if (result != CUDA_SUCCESS) { \
|
||||||
|
const char *_err_name; \
|
||||||
|
cuGetErrorName(result, &_err_name); \
|
||||||
|
fprintf(stderr, "CUDA error: %s\n", _err_name); \
|
||||||
|
} \
|
||||||
|
ASSERT(CUDA_SUCCESS == result); \
|
||||||
|
} while (0)
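
Typical use is to wrap every CUDA Driver API call directly, e.g. ASSERTDRV(cuMemAllocHost((void **)&buf, size)); on failure the macro prints the error name obtained from cuGetErrorName and aborts.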
|
||||||
|
|
||||||
|
#define ASSERT_EQ(P, V) ASSERT((P) == (V))
|
||||||
|
#define CHECK_EQ(P, V) ASSERT((P) == (V))
|
||||||
|
#define ASSERT_NEQ(P, V) ASSERT(!((P) == (V)))
|
||||||
|
#define BREAK_IF_NEQ(P, V) if((P) != (V)) break
|
||||||
|
#define BEGIN_CHECK do
|
||||||
|
#define END_CHECK while(0)
|
||||||
|
|
||||||
|
#define PAGE_ROUND_UP(x, n) (((x) + ((n) - 1)) & ~((n) - 1))
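
PAGE_ROUND_UP assumes n is a power of two. A quick compile-time check of the arithmetic (illustrative only; 65536 is the usual 64 KiB GPU page, and static_assert requires C++11):

static_assert(PAGE_ROUND_UP(1,     65536) == 65536,  "rounds up into the first page");
static_assert(PAGE_ROUND_UP(65536, 65536) == 65536,  "aligned sizes are unchanged");
static_assert(PAGE_ROUND_UP(65537, 65536) == 131072, "one byte over rolls to the next page");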
|
||||||
|
|
||||||
|
namespace gdrcopy {
|
||||||
|
namespace test {
|
||||||
|
typedef struct gpuMemHandle
|
||||||
|
{
|
||||||
|
CUdeviceptr ptr; // aligned ptr if requested; otherwise, the same as unaligned_ptr.
|
||||||
|
union {
|
||||||
|
CUdeviceptr unaligned_ptr; // for tracking original ptr; may be unaligned.
|
||||||
|
#if CUDA_VERSION >= 11000
|
||||||
|
// VMM with GDR support is available from CUDA 11.0
|
||||||
|
CUmemGenericAllocationHandle handle;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
size_t size;
|
||||||
|
size_t allocated_size;
|
||||||
|
} gpu_mem_handle_t;
|
||||||
|
|
||||||
|
typedef CUresult (*gpu_memalloc_fn_t)(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops);
|
||||||
|
typedef CUresult (*gpu_memfree_fn_t)(gpu_mem_handle_t *handle);
|
||||||
|
|
||||||
|
static inline gdr_t gdr_open_safe()
|
||||||
|
{
|
||||||
|
gdr_t g = gdr_open();
|
||||||
|
if (!g) {
|
||||||
|
fprintf(stderr, "gdr_open error: Is gdrdrv driver installed and loaded?\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
return g;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern bool print_dbg_msg;
|
||||||
|
extern const char *testname;
|
||||||
|
|
||||||
|
void print_dbg(const char* fmt, ...);
|
||||||
|
|
||||||
|
CUresult gpu_mem_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops);
|
||||||
|
CUresult gpu_mem_free(gpu_mem_handle_t *handle);
|
||||||
|
|
||||||
|
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops);
|
||||||
|
CUresult gpu_vmm_free(gpu_mem_handle_t *handle);
|
||||||
|
|
||||||
|
static inline bool operator==(const gdr_mh_t &a, const gdr_mh_t &b) {
|
||||||
|
return a.h == b.h;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const gdr_mh_t null_mh = {0};
|
||||||
|
|
||||||
|
int compare_buf(uint32_t *ref_buf, uint32_t *buf, size_t size);
|
||||||
|
|
||||||
|
void init_hbuf_walking_bit(uint32_t *h_buf, size_t size);
|
||||||
|
|
||||||
|
void init_hbuf_linear_ramp(uint32_t *h_buf, size_t size);
|
||||||
|
|
||||||
|
bool check_gdr_support(CUdevice dev);
|
||||||
|
|
||||||
|
void print_histogram(double *lat_arr, int count, int *bin_arr, int num_bins, double min, double max);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@@ -0,0 +1,282 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <getopt.h>
|
||||||
|
#include <memory.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
#include "gdrapi.h"
|
||||||
|
#include "common.hpp"
|
||||||
|
|
||||||
|
using namespace gdrcopy::test;
|
||||||
|
|
||||||
|
// manually tuned...
|
||||||
|
int num_write_iters = 10000;
|
||||||
|
int num_read_iters = 100;
|
||||||
|
size_t _size = 128*1024;
|
||||||
|
size_t copy_size = 0;
|
||||||
|
size_t copy_offset = 0;
|
||||||
|
int dev_id = 0;
|
||||||
|
|
||||||
|
void print_usage(const char *path)
|
||||||
|
{
|
||||||
|
cout << "Usage: " << path << " [-h][-s <size>][-c <size>][-o <offset>][-d <gpu>][-w <iters>][-r <iters>][-a <fn>]" << endl;
|
||||||
|
cout << endl;
|
||||||
|
cout << "Options:" << endl;
|
||||||
|
cout << " -h Print this help text" << endl;
|
||||||
|
cout << " -s <size> Buffer allocation size (default: " << _size << ")" << endl;
|
||||||
|
cout << " -c <size> Copy size (default: " << copy_size << ")" << endl;
|
||||||
|
cout << " -o <offset> Copy offset (default: " << copy_offset << ")" << endl;
|
||||||
|
cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
|
||||||
|
cout << " -w <iters> Number of write iterations (default: " << num_write_iters << ")" << endl;
|
||||||
|
cout << " -r <iters> Number of read iterations (default: " << num_read_iters << ")" << endl;
|
||||||
|
cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
|
||||||
|
cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void run_test(CUdeviceptr d_A, size_t size)
|
||||||
|
{
|
||||||
|
uint32_t *init_buf = NULL;
|
||||||
|
ASSERTDRV(cuMemAllocHost((void **)&init_buf, size));
|
||||||
|
ASSERT_NEQ(init_buf, (void*)0);
|
||||||
|
init_hbuf_walking_bit(init_buf, size);
|
||||||
|
|
||||||
|
gdr_t g = gdr_open_safe();
|
||||||
|
|
||||||
|
gdr_mh_t mh;
|
||||||
|
BEGIN_CHECK {
|
||||||
|
// tokens are optional in CUDA 6.0
|
||||||
|
// waive the test if GPUDirect RDMA is not enabled
|
||||||
|
BREAK_IF_NEQ(gdr_pin_buffer(g, d_A, size, 0, 0, &mh), 0);
|
||||||
|
ASSERT_NEQ(mh, null_mh);
|
||||||
|
|
||||||
|
void *map_d_ptr = NULL;
|
||||||
|
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, size), 0);
|
||||||
|
cout << "map_d_ptr: " << map_d_ptr << endl;
|
||||||
|
|
||||||
|
gdr_info_t info;
|
||||||
|
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
|
||||||
|
cout << "info.va: " << hex << info.va << dec << endl;
|
||||||
|
cout << "info.mapped_size: " << info.mapped_size << endl;
|
||||||
|
cout << "info.page_size: " << info.page_size << endl;
|
||||||
|
cout << "info.mapped: " << info.mapped << endl;
|
||||||
|
cout << "info.wc_mapping: " << info.wc_mapping << endl;
|
||||||
|
|
||||||
|
// remember that mappings start on a 64KB boundary, so let's
|
||||||
|
// calculate the offset from the head of the mapping to the
|
||||||
|
// beginning of the buffer
|
||||||
|
int off = info.va - d_A;
|
||||||
|
cout << "page offset: " << off << endl;
|
||||||
|
|
||||||
|
uint32_t *buf_ptr = (uint32_t *)((char *)map_d_ptr + off);
|
||||||
|
cout << "user-space pointer:" << buf_ptr << endl;
|
||||||
|
|
||||||
|
// copy to GPU benchmark
|
||||||
|
cout << "writing test, size=" << copy_size << " offset=" << copy_offset << " num_iters=" << num_write_iters << endl;
|
||||||
|
struct timespec beg, end;
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
for (int iter=0; iter<num_write_iters; ++iter)
|
||||||
|
gdr_copy_to_mapping(mh, buf_ptr + copy_offset/4, init_buf, copy_size);
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
|
||||||
|
double woMBps;
|
||||||
|
{
|
||||||
|
double byte_count = (double) copy_size * num_write_iters;
|
||||||
|
double dt_ms = (end.tv_nsec-beg.tv_nsec)/1000000.0 + (end.tv_sec-beg.tv_sec)*1000.0;
|
||||||
|
double Bps = byte_count / dt_ms * 1e3;
|
||||||
|
woMBps = Bps / 1024.0 / 1024.0;
|
||||||
|
cout << "write BW: " << woMBps << "MB/s" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
compare_buf(init_buf, buf_ptr + copy_offset/4, copy_size);
|
||||||
|
|
||||||
|
// copy from GPU benchmark
|
||||||
|
cout << "reading test, size=" << copy_size << " offset=" << copy_offset << " num_iters=" << num_read_iters << endl;
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
for (int iter=0; iter<num_read_iters; ++iter)
|
||||||
|
gdr_copy_from_mapping(mh, init_buf, buf_ptr + copy_offset/4, copy_size);
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
|
||||||
|
double roMBps;
|
||||||
|
{
|
||||||
|
double byte_count = (double) copy_size * num_read_iters;
|
||||||
|
double dt_ms = (end.tv_nsec-beg.tv_nsec)/1000000.0 + (end.tv_sec-beg.tv_sec)*1000.0;
|
||||||
|
double Bps = byte_count / dt_ms * 1e3;
|
||||||
|
roMBps = Bps / 1024.0 / 1024.0;
|
||||||
|
cout << "read BW: " << roMBps << "MB/s" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
cout << "unmapping buffer" << endl;
|
||||||
|
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, size), 0);
|
||||||
|
|
||||||
|
cout << "unpinning buffer" << endl;
|
||||||
|
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
|
||||||
|
} END_CHECK;
|
||||||
|
|
||||||
|
cout << "closing gdrdrv" << endl;
|
||||||
|
ASSERT_EQ(gdr_close(g), 0);
|
||||||
|
}
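
As a sanity check on the bandwidth arithmetic in run_test: with copy_size = 128 KiB and 10000 write iterations, byte_count is 1.31072e9 bytes; if the loop takes 500 ms, Bps = 1.31072e9 / 500 * 1e3 = 2.62144e9, and dividing by 1024*1024 reports 2500 "MB/s" (strictly MiB/s).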
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
|
||||||
|
gpu_memfree_fn_t gfree_fn = gpu_mem_free;
|
||||||
|
|
||||||
|
while(1) {
|
||||||
|
int c;
|
||||||
|
c = getopt(argc, argv, "s:d:o:c:w:r:a:h");
|
||||||
|
if (c == -1)
|
||||||
|
break;
|
||||||
|
|
||||||
|
switch (c) {
|
||||||
|
case 's':
|
||||||
|
_size = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'c':
|
||||||
|
copy_size = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'o':
|
||||||
|
copy_offset = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'd':
|
||||||
|
dev_id = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'w':
|
||||||
|
num_write_iters = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'r':
|
||||||
|
num_read_iters = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'a':
|
||||||
|
if (strcmp(optarg, "cuMemAlloc") == 0) {
|
||||||
|
galloc_fn = gpu_mem_alloc;
|
||||||
|
gfree_fn = gpu_mem_free;
|
||||||
|
}
|
||||||
|
else if (strcmp(optarg, "cuMemCreate") == 0) {
|
||||||
|
galloc_fn = gpu_vmm_alloc;
|
||||||
|
gfree_fn = gpu_vmm_free;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
cerr << "Unrecognized fn argument" << endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'h':
|
||||||
|
print_usage(argv[0]);
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
default:
|
||||||
|
fprintf(stderr, "ERROR: invalid option\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!copy_size)
|
||||||
|
copy_size = _size;
|
||||||
|
|
||||||
|
if (copy_offset % sizeof(uint32_t) != 0) {
|
||||||
|
fprintf(stderr, "ERROR: offset must be multiple of 4 bytes\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (copy_offset + copy_size > _size) {
|
||||||
|
fprintf(stderr, "ERROR: offset + copy size run past the end of the buffer\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
|
||||||
|
|
||||||
|
ASSERTDRV(cuInit(0));
|
||||||
|
|
||||||
|
int n_devices = 0;
|
||||||
|
ASSERTDRV(cuDeviceGetCount(&n_devices));
|
||||||
|
|
||||||
|
CUdevice dev;
|
||||||
|
for (int n=0; n<n_devices; ++n) {
|
||||||
|
|
||||||
|
char dev_name[256];
|
||||||
|
int dev_pci_domain_id;
|
||||||
|
int dev_pci_bus_id;
|
||||||
|
int dev_pci_device_id;
|
||||||
|
|
||||||
|
ASSERTDRV(cuDeviceGet(&dev, n));
|
||||||
|
ASSERTDRV(cuDeviceGetName(dev_name, sizeof(dev_name) / sizeof(dev_name[0]), dev));
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev));
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev));
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev));
|
||||||
|
|
||||||
|
cout << "GPU id:" << n << "; name: " << dev_name
|
||||||
|
<< "; Bus id: "
|
||||||
|
<< std::hex
|
||||||
|
<< std::setfill('0') << std::setw(4) << dev_pci_domain_id
|
||||||
|
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_bus_id
|
||||||
|
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_device_id
|
||||||
|
<< std::dec
|
||||||
|
<< endl;
|
||||||
|
}
|
||||||
|
cout << "selecting device " << dev_id << endl;
|
||||||
|
ASSERTDRV(cuDeviceGet(&dev, dev_id));
|
||||||
|
|
||||||
|
|
||||||
|
CUcontext dev_ctx;
|
||||||
|
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
|
||||||
|
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
|
||||||
|
|
||||||
|
cout << "testing size: " << _size << endl;
|
||||||
|
cout << "rounded size: " << size << endl;
|
||||||
|
|
||||||
|
ASSERT_EQ(check_gdr_support(dev), true);
|
||||||
|
|
||||||
|
if (galloc_fn == gpu_mem_alloc)
|
||||||
|
cout << "gpu alloc fn: cuMemAlloc" << endl;
|
||||||
|
else
|
||||||
|
cout << "gpu alloc fn: cuMemCreate" << endl;
|
||||||
|
|
||||||
|
CUdeviceptr d_A;
|
||||||
|
gpu_mem_handle_t mhandle;
|
||||||
|
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
|
||||||
|
d_A = mhandle.ptr;
|
||||||
|
cout << "device ptr: " << hex << d_A << dec << endl;
|
||||||
|
|
||||||
|
run_test(d_A, size);
|
||||||
|
|
||||||
|
ASSERTDRV(gfree_fn(&mhandle));
|
||||||
|
|
||||||
|
ASSERTDRV(cuDevicePrimaryCtxRelease(dev));
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local variables:
|
||||||
|
* c-indent-level: 4
|
||||||
|
* c-basic-offset: 4
|
||||||
|
* tab-width: 4
|
||||||
|
* indent-tabs-mode: nil
|
||||||
|
* End:
|
||||||
|
*/
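
Typical invocation, assuming the binary is built as copybw (the actual name depends on the build system): ./copybw -d 0 -s 131072 -c 65536 -w 10000 -r 100 -a cuMemCreate runs the write and read bandwidth tests against GPU 0 using the cuMemCreate-backed allocator.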
|
||||||
|
|
@@ -0,0 +1,307 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <getopt.h>
|
||||||
|
#include <memory.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
#include "gdrapi.h"
|
||||||
|
#include "common.hpp"
|
||||||
|
|
||||||
|
using namespace gdrcopy::test;
|
||||||
|
|
||||||
|
// manually tuned...
|
||||||
|
int num_write_iters = 10000;
|
||||||
|
int num_read_iters = 100;
|
||||||
|
int dev_id = 0;
|
||||||
|
bool do_cumemcpy = false;
|
||||||
|
size_t _size = (size_t)1 << 24;
|
||||||
|
|
||||||
|
void print_usage(const char *path)
|
||||||
|
{
|
||||||
|
cout << "Usage: " << path << " [-h][-c][-s <size>][-d <gpu>][-w <iters>][-r <iters>][-a <fn>]" << endl;
|
||||||
|
cout << endl;
|
||||||
|
cout << "Options:" << endl;
|
||||||
|
cout << " -h Print this help text" << endl;
|
||||||
|
cout << " -c Also run cuMemcpy (default: no)" << endl;
|
||||||
|
cout << " -s <size> Buffer allocation size (default: " << _size << ")" << endl;
|
||||||
|
cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
|
||||||
|
cout << " -w <iters> Number of write iterations (default: " << num_write_iters << ")" << endl;
|
||||||
|
cout << " -r <iters> Number of read iterations (default: " << num_read_iters << ")" << endl;
|
||||||
|
cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
|
||||||
|
cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
size_t copy_size = 1;
|
||||||
|
struct timespec beg, end;
|
||||||
|
double lat_us;
|
||||||
|
|
||||||
|
gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
|
||||||
|
gpu_memfree_fn_t gfree_fn = gpu_mem_free;
|
||||||
|
|
||||||
|
while(1) {
|
||||||
|
int c;
|
||||||
|
c = getopt(argc, argv, "s:d:w:r:a:hc");
|
||||||
|
if (c == -1)
|
||||||
|
break;
|
||||||
|
|
||||||
|
switch (c) {
|
||||||
|
case 's':
|
||||||
|
_size = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'd':
|
||||||
|
dev_id = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'w':
|
||||||
|
num_write_iters = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'r':
|
||||||
|
num_read_iters = strtol(optarg, NULL, 0);
|
||||||
|
break;
|
||||||
|
case 'a':
|
||||||
|
if (strcmp(optarg, "cuMemAlloc") == 0) {
|
||||||
|
galloc_fn = gpu_mem_alloc;
|
||||||
|
gfree_fn = gpu_mem_free;
|
||||||
|
}
|
||||||
|
else if (strcmp(optarg, "cuMemCreate") == 0) {
|
||||||
|
galloc_fn = gpu_vmm_alloc;
|
||||||
|
gfree_fn = gpu_vmm_free;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
cerr << "Unrecognized fn argument" << endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'c':
|
||||||
|
do_cumemcpy = true;
|
||||||
|
break;
|
||||||
|
case 'h':
|
||||||
|
print_usage(argv[0]);
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
default:
|
||||||
|
printf("ERROR: invalid option\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
|
||||||
|
|
||||||
|
ASSERTDRV(cuInit(0));
|
||||||
|
|
||||||
|
int n_devices = 0;
|
||||||
|
ASSERTDRV(cuDeviceGetCount(&n_devices));
|
||||||
|
|
||||||
|
CUdevice dev;
|
||||||
|
for (int n=0; n<n_devices; ++n) {
|
||||||
|
|
||||||
|
char dev_name[256];
|
||||||
|
int dev_pci_domain_id;
|
||||||
|
int dev_pci_bus_id;
|
||||||
|
int dev_pci_device_id;
|
||||||
|
|
||||||
|
ASSERTDRV(cuDeviceGet(&dev, n));
|
||||||
|
ASSERTDRV(cuDeviceGetName(dev_name, sizeof(dev_name) / sizeof(dev_name[0]), dev));
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev));
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev));
|
||||||
|
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev));
|
||||||
|
|
||||||
|
cout << "GPU id:" << n << "; name: " << dev_name
|
||||||
|
<< "; Bus id: "
|
||||||
|
<< std::hex
|
||||||
|
<< std::setfill('0') << std::setw(4) << dev_pci_domain_id
|
||||||
|
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_bus_id
|
||||||
|
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_device_id
|
||||||
|
<< std::dec
|
||||||
|
<< endl;
|
||||||
|
}
|
||||||
|
cout << "selecting device " << dev_id << endl;
|
||||||
|
ASSERTDRV(cuDeviceGet(&dev, dev_id));
|
||||||
|
|
||||||
|
CUcontext dev_ctx;
|
||||||
|
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
|
||||||
|
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
|
||||||
|
|
||||||
|
ASSERT_EQ(check_gdr_support(dev), true);
|
||||||
|
|
||||||
|
CUdeviceptr d_A;
|
||||||
|
gpu_mem_handle_t mhandle;
|
||||||
|
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
|
||||||
|
d_A = mhandle.ptr;
|
||||||
|
cout << "device ptr: 0x" << hex << d_A << dec << endl;
|
||||||
|
cout << "allocated size: " << size << endl;
|
||||||
|
|
||||||
|
if (galloc_fn == gpu_mem_alloc)
|
||||||
|
cout << "gpu alloc fn: cuMemAlloc" << endl;
|
||||||
|
else
|
||||||
|
cout << "gpu alloc fn: cuMemCreate" << endl;
|
||||||
|
|
||||||
|
uint32_t *init_buf = NULL;
|
||||||
|
uint32_t *h_buf = NULL;
|
||||||
|
ASSERTDRV(cuMemAllocHost((void **)&init_buf, size));
|
||||||
|
ASSERT_NEQ(init_buf, (void*)0);
|
||||||
|
ASSERTDRV(cuMemAllocHost((void **)&h_buf, size));
|
||||||
|
ASSERT_NEQ(h_buf, (void*)0);
|
||||||
|
init_hbuf_walking_bit(init_buf, size);
|
||||||
|
|
||||||
|
if (do_cumemcpy) {
|
||||||
|
cout << endl;
|
||||||
|
cout << "cuMemcpy_H2D num iters for each size: " << num_write_iters << endl;
|
||||||
|
printf("Test \t\t Size(B) \t Avg.Time(us)\n");
|
||||||
|
BEGIN_CHECK {
|
||||||
|
// cuMemcpy H2D benchmark
|
||||||
|
copy_size = 1;
|
||||||
|
while (copy_size <= size) {
|
||||||
|
int iter = 0;
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
for (iter = 0; iter < num_write_iters; ++iter) {
|
||||||
|
ASSERTDRV(cuMemcpy(d_A, (CUdeviceptr)init_buf, copy_size));
|
||||||
|
}
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
|
||||||
|
printf("cuMemcpy_H2D \t %8zu \t %11.4f\n", copy_size, lat_us);
|
||||||
|
copy_size <<= 1;
|
||||||
|
}
|
||||||
|
} END_CHECK;
|
||||||
|
|
||||||
|
cout << endl;
|
||||||
|
cout << "cuMemcpy_D2H num iters for each size: " << num_read_iters << endl;
|
||||||
|
printf("Test \t\t Size(B) \t Avg.Time(us)\n");
|
||||||
|
BEGIN_CHECK {
|
||||||
|
// cuMemcpy D2H benchmark
|
||||||
|
copy_size = 1;
|
||||||
|
while (copy_size <= size) {
|
||||||
|
int iter = 0;
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
for (iter = 0; iter < num_read_iters; ++iter) {
|
||||||
|
ASSERTDRV(cuMemcpy((CUdeviceptr)h_buf, d_A, copy_size));
|
||||||
|
}
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
|
||||||
|
printf("cuMemcpy_D2H \t %8zu \t %11.4f\n", copy_size, lat_us);
|
||||||
|
copy_size <<= 1;
|
||||||
|
}
|
||||||
|
} END_CHECK;
|
||||||
|
|
||||||
|
cout << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
cout << endl;
|
||||||
|
|
||||||
|
gdr_t g = gdr_open_safe();
|
||||||
|
|
||||||
|
gdr_mh_t mh;
|
||||||
|
BEGIN_CHECK {
|
||||||
|
// tokens are optional in CUDA 6.0
|
||||||
|
ASSERT_EQ(gdr_pin_buffer(g, d_A, size, 0, 0, &mh), 0);
|
||||||
|
ASSERT_NEQ(mh, null_mh);
|
||||||
|
|
||||||
|
void *map_d_ptr = NULL;
|
||||||
|
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, size), 0);
|
||||||
|
cout << "map_d_ptr: " << map_d_ptr << endl;
|
||||||
|
|
||||||
|
gdr_info_t info;
|
||||||
|
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
|
||||||
|
cout << "info.va: " << hex << info.va << dec << endl;
|
||||||
|
cout << "info.mapped_size: " << info.mapped_size << endl;
|
||||||
|
cout << "info.page_size: " << info.page_size << endl;
|
||||||
|
cout << "info.mapped: " << info.mapped << endl;
|
||||||
|
cout << "info.wc_mapping: " << info.wc_mapping << endl;
|
||||||
|
|
||||||
|
// remember that mappings start on a 64KB boundary, so let's
|
||||||
|
// calculate the offset from the head of the mapping to the
|
||||||
|
// beginning of the buffer
|
||||||
|
int off = info.va - d_A;
|
||||||
|
cout << "page offset: " << off << endl;
|
||||||
|
|
||||||
|
uint32_t *buf_ptr = (uint32_t *)((char *)map_d_ptr + off);
|
||||||
|
cout << "user-space pointer: " << buf_ptr << endl;
|
||||||
|
|
||||||
|
// gdr_copy_to_mapping benchmark
|
||||||
|
cout << endl;
|
||||||
|
cout << "gdr_copy_to_mapping num iters for each size: " << num_write_iters << endl;
|
||||||
|
cout << "WARNING: Measuring the API invocation overhead as observed by the CPU. Data might not be ordered all the way to the GPU internal visibility." << endl;
|
||||||
|
// For more information, see
|
||||||
|
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#sync-behavior
|
||||||
|
printf("Test \t\t\t Size(B) \t Avg.Time(us)\n");
|
||||||
|
copy_size = 1;
|
||||||
|
while (copy_size <= size) {
|
||||||
|
int iter = 0;
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
for (iter = 0; iter < num_write_iters; ++iter) {
|
||||||
|
gdr_copy_to_mapping(mh, buf_ptr, init_buf, copy_size);
|
||||||
|
}
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
|
||||||
|
printf("gdr_copy_to_mapping \t %8zu \t %11.4f\n", copy_size, lat_us);
|
||||||
|
copy_size <<= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
MB();
|
||||||
|
|
||||||
|
// gdr_copy_from_mapping benchmark
|
||||||
|
cout << endl;
|
||||||
|
cout << "gdr_copy_from_mapping num iters for each size: " << num_read_iters << endl;
|
||||||
|
printf("Test \t\t\t Size(B) \t Avg.Time(us)\n");
|
||||||
|
copy_size = 1;
|
||||||
|
while (copy_size <= size) {
|
||||||
|
int iter = 0;
|
||||||
|
clock_gettime(MYCLOCK, &beg);
|
||||||
|
for (iter = 0; iter < num_read_iters; ++iter)
|
||||||
|
gdr_copy_from_mapping(mh, h_buf, buf_ptr, copy_size);
|
||||||
|
clock_gettime(MYCLOCK, &end);
|
||||||
|
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
|
||||||
|
printf("gdr_copy_from_mapping \t %8zu \t %11.4f\n", copy_size, lat_us);
|
||||||
|
copy_size <<= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
cout << "unmapping buffer" << endl;
|
||||||
|
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, size), 0);
|
||||||
|
|
||||||
|
cout << "unpinning buffer" << endl;
|
||||||
|
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
|
||||||
|
} END_CHECK;
|
||||||
|
|
||||||
|
cout << "closing gdrdrv" << endl;
|
||||||
|
ASSERT_EQ(gdr_close(g), 0);
|
||||||
|
|
||||||
|
ASSERTDRV(gfree_fn(&mhandle));
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local variables:
|
||||||
|
* c-indent-level: 4
|
||||||
|
* c-basic-offset: 4
|
||||||
|
* tab-width: 4
|
||||||
|
* indent-tabs-mode: nil
|
||||||
|
* End:
|
||||||
|
*/
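
Typical invocation, again assuming a binary name of copylat: ./copylat -c -s 16777216 -a cuMemAlloc sweeps copy sizes from 1 byte up to the page-rounded 16 MiB buffer, printing the average per-call latency for cuMemcpy (enabled by -c) as well as for gdr_copy_to_mapping and gdr_copy_from_mapping.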
|
||||||