first commit

hailin 2025-09-15 10:32:17 +08:00
commit cc76bab27e
3854 changed files with 740345 additions and 0 deletions

.gitignore (new file)

@@ -0,0 +1,240 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
# Tokenizer cache for tests
.tokenizer_cache/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# MacOS
.DS_Store
# Vim
*.swp
# Documentation
docs/_build
# SGL
benchmark/mmlu/data
benchmark/mmlu/data.tar
benchmark/llava_bench/images
benchmark/llava_bench/mme_pack
*.jsonl
tmp*.txt
# Plots
*.png
*.pdf
# personal
work_dirs/
*.csv
!logo.png
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
compile_commands.json
*.iml
# VSCode
.vscode
1
# Autoenv
.env.leave
# Rust lib
Cargo.lock
lmms-eval

DeepEP/.gitignore (new file)

@@ -0,0 +1,8 @@
compile_commands.json
.idea
.DS_Store
*.pyc
build/
.cache/
.vscode/
*/cmake-build-*/

DeepEP/LICENSE (new file)

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 DeepSeek
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

DeepEP/README.md (new file)

@@ -0,0 +1,344 @@
# DeepEP
DeepEP is a communication library tailored for Mixture-of-Experts (MoE) and expert parallelism (EP). It provides high-throughput and low-latency all-to-all GPU kernels, which are also known as MoE dispatch and combine. The library also supports low-precision operations, including FP8.
To align with the group-limited gating algorithm proposed in the [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3) paper, DeepEP offers a set of kernels optimized for asymmetric-domain bandwidth forwarding, such as forwarding data from NVLink domain to RDMA domain. These kernels deliver high throughput, making them suitable for both training and inference prefilling tasks. Additionally, they support SM (Streaming Multiprocessors) number control.
For latency-sensitive inference decoding, DeepEP includes a set of low-latency kernels with pure RDMA to minimize delays. The library also introduces a hook-based communication-computation overlapping method that does not occupy any SM resource.
Notice: the implementation in this library may have some slight differences from the [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3) paper.
## Performance
### Normal kernels with NVLink and RDMA forwarding
We test the normal kernels on H800 GPUs (~160 GB/s maximum NVLink bandwidth), each connected to a CX7 InfiniBand 400 Gb/s RDMA network card (~50 GB/s maximum bandwidth), following the DeepSeek-V3/R1 pretraining setting (4096 tokens per batch, 7168 hidden size, top-4 groups, top-8 experts, FP8 dispatching and BF16 combining).
| Type | Dispatch #EP | Bottleneck bandwidth | Combine #EP | Bottleneck bandwidth |
|:---------:|:------------:|:--------------------:|:-----------:|:--------------------:|
| Intranode | 8 | 153 GB/s (NVLink) | 8 | 158 GB/s (NVLink) |
| Internode | 16 | 43 GB/s (RDMA) | 16 | 43 GB/s (RDMA) |
| Internode | 32 | 58 GB/s (RDMA) | 32 | 57 GB/s (RDMA) |
| Internode | 64 | 51 GB/s (RDMA) | 64 | 50 GB/s (RDMA) |
**News (2025.04.22)**: with optimizations from Tencent Network Platform Department, performance was enhanced by up to 30%, see [#130](https://github.com/deepseek-ai/DeepEP/pull/130) for more details. Thanks for the contribution!
### Low-latency kernels with pure RDMA
We test the low-latency kernels on H800 GPUs, each connected to a CX7 InfiniBand 400 Gb/s RDMA network card (~50 GB/s maximum bandwidth), following a typical DeepSeek-V3/R1 production setting (128 tokens per batch, 7168 hidden size, top-8 experts, FP8 dispatching and BF16 combining).
| Dispatch #EP | Latency | RDMA bandwidth | Combine #EP | Latency | RDMA bandwidth |
|:------------:|:-------:|:--------------:|:-----------:|:-------:|:--------------:|
| 8 | 77 us | 98 GB/s | 8 | 114 us | 127 GB/s |
| 16 | 118 us | 63 GB/s | 16 | 195 us | 74 GB/s |
| 32 | 155 us | 48 GB/s | 32 | 273 us | 53 GB/s |
| 64 | 173 us | 43 GB/s | 64 | 314 us | 46 GB/s |
| 128 | 192 us | 39 GB/s | 128 | 369 us | 39 GB/s |
| 256 | 194 us | 39 GB/s | 256 | 360 us | 40 GB/s |
**News (2025.06.05)**: low-latency kernels now leverage NVLink as much as possible, see [#173](https://github.com/deepseek-ai/DeepEP/pull/173) for more details. Thanks for the contribution!
## Quick start
### Requirements
- Ampere (SM80), Hopper (SM90) GPUs, or other architectures with SM90 PTX ISA support
- Python 3.8 and above
- CUDA version
- CUDA 11.0 and above for SM80 GPUs
- CUDA 12.3 and above for SM90 GPUs
- PyTorch 2.1 and above
- NVLink for intranode communication
- RDMA network for internode communication
### Download and install NVSHMEM dependency
DeepEP also depends on our modified NVSHMEM. Please refer to our [NVSHMEM Installation Guide](third-party/README.md) for instructions.
### Development
```bash
# Build and make symbolic links for SO files
NVSHMEM_DIR=/path/to/installed/nvshmem python setup.py build
# You may modify the specific SO names according to your own platform
ln -s build/lib.linux-x86_64-cpython-38/deep_ep_cpp.cpython-38-x86_64-linux-gnu.so
# Run test cases
# NOTES: you may modify the `init_dist` function in `tests/utils.py`
# according to your own cluster settings, and launch into multiple nodes
python tests/test_intranode.py
python tests/test_internode.py
python tests/test_low_latency.py
```
### Installation
```bash
NVSHMEM_DIR=/path/to/installed/nvshmem python setup.py install
```
#### Installation environment variables
- `NVSHMEM_DIR`: the path to the installed NVSHMEM directory; all internode and low-latency features are disabled if it is not specified
- `DISABLE_SM90_FEATURES`: 0 or 1, whether to disable SM90 features; setting it to 1 is required for devices without SM90 support or for CUDA 11
- `TORCH_CUDA_ARCH_LIST`: the list of target architectures, e.g. `TORCH_CUDA_ARCH_LIST="9.0"`
- `DISABLE_AGGRESSIVE_PTX_INSTRS`: 0 or 1, whether to disable aggressive load/store instructions, see [Undefined-behavior PTX usage](#undefined-behavior-ptx-usage) for more details
Then, import `deep_ep` in your Python project, and enjoy!
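As a quick sanity check after installation, the following should run without errors on a machine with a CUDA-enabled PyTorch build (a minimal, illustrative snippet, not part of DeepEP's test suite):
```python
# Minimal post-install smoke test (illustrative only)
import torch
import deep_ep

assert torch.cuda.is_available(), "DeepEP requires a CUDA-enabled PyTorch build"
print(deep_ep.Buffer)  # the communication buffer class used in the examples below
```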
## Network configurations
DeepEP is fully tested with InfiniBand networks. However, it is theoretically compatible with RDMA over Converged Ethernet (RoCE) as well.
### Traffic isolation
Traffic isolation is supported by InfiniBand through Virtual Lanes (VL).
To prevent interference between different types of traffic, we recommend segregating workloads across different virtual lanes as follows:
- workloads using normal kernels
- workloads using low-latency kernels
- other workloads
For DeepEP, you can control the virtual lane assignment by setting the `NVSHMEM_IB_SL` environment variable.
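For example, a launcher could pin DeepEP's traffic to a dedicated virtual lane as sketched below (illustrative only; the value is a placeholder, and the variable is assumed to be read at NVSHMEM initialization, so set it before the DeepEP buffer is created):
```python
# Illustrative: select the InfiniBand service level (virtual lane) for this workload.
# The value 1 is a placeholder; use the VL reserved for your workload class.
import os
os.environ["NVSHMEM_IB_SL"] = "1"
```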
### Adaptive routing
Adaptive routing is an advanced routing feature provided by InfiniBand switches that can evenly distribute traffic across multiple paths. Enabling adaptive routing can completely eliminate network congestion caused by routing conflicts, but it also introduces additional latency. We recommend the following configuration for optimal performance:
- enable adaptive routing in environments with heavy network loads
- use static routing in environments with light network loads
### Congestion control
Congestion control is disabled as we have not observed significant congestion in our production environment.
## Interfaces and examples
### Example use in model training or inference prefilling
The normal kernels can be used in model training or the inference prefilling phase (without the backward part), as the example code below shows.
```python
import torch
import torch.distributed as dist
from typing import List, Tuple, Optional, Union
from deep_ep import Buffer, EventOverlap
# Communication buffer (will allocate at runtime)
_buffer: Optional[Buffer] = None
# Set the number of SMs to use
# NOTES: this is a static variable
Buffer.set_num_sms(24)
# You may call this function at the framework initialization
def get_buffer(group: dist.ProcessGroup, hidden_bytes: int) -> Buffer:
    global _buffer

    # NOTES: you may also replace `get_*_config` with your auto-tuned results via all the tests
    num_nvl_bytes, num_rdma_bytes = 0, 0
    for config in (Buffer.get_dispatch_config(group.size()), Buffer.get_combine_config(group.size())):
        num_nvl_bytes = max(config.get_nvl_buffer_size_hint(hidden_bytes, group.size()), num_nvl_bytes)
        num_rdma_bytes = max(config.get_rdma_buffer_size_hint(hidden_bytes, group.size()), num_rdma_bytes)

    # Allocate a buffer if none exists, or if the existing one is too small
    if _buffer is None or _buffer.group != group or _buffer.num_nvl_bytes < num_nvl_bytes or _buffer.num_rdma_bytes < num_rdma_bytes:
        _buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes)
    return _buffer


def get_hidden_bytes(x: torch.Tensor) -> int:
    t = x[0] if isinstance(x, tuple) else x
    return t.size(1) * max(t.element_size(), 2)


def dispatch_forward(x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
                     topk_idx: torch.Tensor, topk_weights: torch.Tensor,
                     num_experts: int, previous_event: Optional[EventOverlap] = None) -> \
        Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], torch.Tensor, torch.Tensor, List, Tuple, EventOverlap]:
    # NOTES: an optional `previous_event` is a captured CUDA event that you want the dispatch kernel
    # to depend on; this can be useful for communication-computation overlap. For more information,
    # please refer to the docs of `Buffer.dispatch`
    global _buffer

    # Calculate layout before actual dispatch
    num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, previous_event = \
        _buffer.get_dispatch_layout(topk_idx, num_experts,
                                    previous_event=previous_event, async_finish=True,
                                    allocate_on_comm_stream=previous_event is not None)

    # Do MoE dispatch
    # NOTES: the CPU will wait for the GPU's signal to arrive, so this is not compatible with CUDA graphs
    # unless you specify `num_worst_tokens` (this flag is for intranode only)
    # For more advanced usages, please refer to the docs of the `dispatch` function
    recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = \
        _buffer.dispatch(x, topk_idx=topk_idx, topk_weights=topk_weights,
                         num_tokens_per_rank=num_tokens_per_rank, num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
                         is_token_in_rank=is_token_in_rank, num_tokens_per_expert=num_tokens_per_expert,
                         previous_event=previous_event, async_finish=True,
                         allocate_on_comm_stream=True)

    # For event management, please refer to the docs of the `EventOverlap` class
    return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event


def dispatch_backward(grad_recv_x: torch.Tensor, grad_recv_topk_weights: torch.Tensor, handle: Tuple) -> \
        Tuple[torch.Tensor, torch.Tensor, EventOverlap]:
    global _buffer

    # The backward process of MoE dispatch is actually a combine
    # For more advanced usages, please refer to the docs of the `combine` function
    combined_grad_x, combined_grad_recv_topk_weights, event = \
        _buffer.combine(grad_recv_x, handle, topk_weights=grad_recv_topk_weights, async_finish=True)

    # For event management, please refer to the docs of the `EventOverlap` class
    return combined_grad_x, combined_grad_recv_topk_weights, event


def combine_forward(x: torch.Tensor, handle: Tuple, previous_event: Optional[EventOverlap] = None) -> \
        Tuple[torch.Tensor, EventOverlap]:
    global _buffer

    # Do MoE combine
    # For more advanced usages, please refer to the docs of the `combine` function
    combined_x, _, event = _buffer.combine(x, handle, async_finish=True, previous_event=previous_event,
                                           allocate_on_comm_stream=previous_event is not None)

    # For event management, please refer to the docs of the `EventOverlap` class
    return combined_x, event


def combine_backward(grad_combined_x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
                     handle: Tuple, previous_event: Optional[EventOverlap] = None) -> \
        Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], EventOverlap]:
    global _buffer

    # The backward process of MoE combine is actually a dispatch
    # For more advanced usages, please refer to the docs of the `dispatch` function
    grad_x, _, _, _, _, event = _buffer.dispatch(grad_combined_x, handle=handle, async_finish=True,
                                                 previous_event=previous_event,
                                                 allocate_on_comm_stream=previous_event is not None)

    # For event management, please refer to the docs of the `EventOverlap` class
    return grad_x, event
```
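For orientation, here is a minimal sketch (not part of DeepEP) of wiring the helpers above into an MoE layer's forward pass. `gate` and `run_experts` are hypothetical stand-ins for your own router and grouped expert computation, and it assumes `EventOverlap.current_stream_wait()` as the synchronization point:
```python
# Hypothetical MoE forward pass built on the helpers above (illustration only)
def moe_forward(x: torch.Tensor, gate, run_experts, num_experts: int) -> torch.Tensor:
    topk_weights, topk_idx = gate(x)  # router output, e.g. [num_tokens, top_k]
    recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = \
        dispatch_forward(x, topk_idx, topk_weights, num_experts)
    event.current_stream_wait()       # or overlap other computation before waiting
    expert_out = run_experts(recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list)
    combined_x, combine_event = combine_forward(expert_out, handle)
    combine_event.current_stream_wait()
    return combined_x
```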
Moreover, inside the dispatch function, we may not know in advance how many tokens the current rank will receive, so an implicit CPU wait for the GPU's received-count signal is involved, as the following figure shows.
![normal](figures/normal.png)
### Example use in inference decoding
The low-latency kernels can be used in the inference decoding phase, as the example code below shows.
```python
import torch
import torch.distributed as dist
from typing import Tuple, Optional
from deep_ep import Buffer
# Communication buffer (will allocate at runtime)
# NOTES: there is no SM control API for the low-latency kernels
_buffer: Optional[Buffer] = None
# You may call this function at the framework initialization
def get_buffer(group: dist.ProcessGroup, num_max_dispatch_tokens_per_rank: int, hidden: int, num_experts: int) -> Buffer:
    # NOTES: the low-latency mode will consume much more space than the normal mode
    # So we recommend that `num_max_dispatch_tokens_per_rank` (the actual batch size in the decoding engine) should be less than 256
    global _buffer

    num_rdma_bytes = Buffer.get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank, hidden, group.size(), num_experts)

    # Allocate a buffer if none exists, or if the existing one is too small
    if _buffer is None or _buffer.group != group or not _buffer.low_latency_mode or _buffer.num_rdma_bytes < num_rdma_bytes:
        # NOTES: for the best performance, the QP number **must** be equal to the number of the local experts
        assert num_experts % group.size() == 0
        _buffer = Buffer(group, 0, num_rdma_bytes, low_latency_mode=True, num_qps_per_rank=num_experts // group.size())
    return _buffer


def low_latency_dispatch(hidden_states: torch.Tensor, topk_idx: torch.Tensor, num_max_dispatch_tokens_per_rank: int, num_experts: int):
    global _buffer

    # Do MoE dispatch, compatible with CUDA graph (but you may restore some buffer status once you replay)
    recv_hidden_states, recv_expert_count, handle, event, hook = \
        _buffer.low_latency_dispatch(hidden_states, topk_idx, num_max_dispatch_tokens_per_rank, num_experts,
                                     async_finish=False, return_recv_hook=True)

    # NOTES: the actual tensor is not received until you call `hook()`;
    # this is useful for double-batch overlapping, and it occupies **no SMs at all**
    # If you don't want to overlap, please set `return_recv_hook=False`
    # Later, you can use our GEMM library to do the computation with this specific format
    return recv_hidden_states, recv_expert_count, handle, event, hook


def low_latency_combine(hidden_states: torch.Tensor,
                        topk_idx: torch.Tensor, topk_weights: torch.Tensor, handle: Tuple):
    global _buffer

    # Do MoE combine, compatible with CUDA graph (but you may restore some buffer status once you replay)
    combined_hidden_states, event_overlap, hook = \
        _buffer.low_latency_combine(hidden_states, topk_idx, topk_weights, handle,
                                    async_finish=False, return_recv_hook=True)

    # NOTES: the same behavior as described in the dispatch kernel
    return combined_hidden_states, event_overlap, hook
```
For two-micro-batch overlapping, refer to the following figure. With our receiving-hook interface, the RDMA network traffic happens in the background without taking any GPU SMs away from the computation part. Note that the overlapped parts can be adjusted, i.e., the four stages of attention/dispatch/MoE/combine may not have exactly the same execution time, so you may tune the stage boundaries according to your workload.
![low-latency](figures/low-latency.png)
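As a rough sketch of the hook pattern (not DeepEP's reference scheduler): `attention` and `run_experts` below are hypothetical stand-ins, and the wrappers are the `low_latency_dispatch`/`low_latency_combine` helpers defined above with `return_recv_hook=True`:
```python
# Overlap micro-batch B's attention with micro-batch A's dispatch traffic (illustration only)
def decode_step_pair(x_a, topk_idx_a, topk_weights_a, x_b,
                     num_max_dispatch_tokens_per_rank, num_experts,
                     attention, run_experts):
    # Kick off A's dispatch; RDMA traffic proceeds in the background without occupying SMs
    recv_a, count_a, handle_a, _, recv_hook_a = low_latency_dispatch(
        x_a, topk_idx_a, num_max_dispatch_tokens_per_rank, num_experts)
    hidden_b = attention(x_b)   # overlapped computation for the other micro-batch
    recv_hook_a()               # only now do we require A's tokens to have arrived
    expert_out_a = run_experts(recv_a, count_a)
    combined_a, _, combine_hook_a = low_latency_combine(expert_out_a, topk_idx_a, topk_weights_a, handle_a)
    # ... launch B's dispatch here, and call `combine_hook_a()` before consuming `combined_a`
    return hidden_b, combined_a, combine_hook_a
```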
## Roadmap
- [x] AR support
- [x] Refactor low-latency mode AR code
- [x] A100 support (intranode only)
- [x] Support BF16 for the low-latency dispatch kernel
- [x] Support NVLink protocol for intranode low-latency kernels
- [ ] TMA copy instead of LD/ST
- [x] Intranode kernels
- [ ] Internode kernels
- [ ] Low-latency kernels
- [ ] SM-free kernels and refactors
- [ ] Fully remove undefined-behavior PTX instructions
## Notices
#### Easier potential overall design
The current DeepEP implementation uses queues for communication buffers, which saves memory but introduces complexity and potential deadlocks. If you're implementing your own version based on DeepEP, consider using fixed-size buffers allocated to maximum capacity for simplicity and better performance. For a detailed discussion of this alternative approach, see https://github.com/deepseek-ai/DeepEP/issues/39.
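To make the trade-off concrete, here is a back-of-the-envelope sizing of the maximum-capacity alternative; every number is a hypothetical placeholder, not a measured or recommended value:
```python
# Hypothetical worst-case sizing for a fixed receive buffer (illustration only)
num_experts = 1024              # total experts across the EP group
num_max_tokens_per_rank = 128   # worst-case tokens any peer may send per expert
bytes_per_token = 7168 * 2      # hidden size 7168 in BF16

fixed_recv_buffer_bytes = num_experts * num_max_tokens_per_rank * bytes_per_token
print(f"{fixed_recv_buffer_bytes / 2**30:.2f} GiB")  # 1.75 GiB: simpler and deadlock-free, but memory-hungry
```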
#### Undefined-behavior PTX usage
- For extreme performance, we discovered and use a PTX usage that is technically undefined behavior: the read-only PTX instruction `ld.global.nc.L1::no_allocate.L2::256B` to **read volatile data**. The PTX modifier `.nc` indicates that a non-coherent cache is used. Correctness is nevertheless verified by testing with `.L1::no_allocate` on Hopper architectures, and performance is much better. Our guess at the reason: the non-coherent cache is unified with L1, and the L1 modifier is not just a hint but a strong option, so correctness can be guaranteed because no dirty data stays in L1.
- Initially, because NVCC could not automatically unroll volatile read PTX, we tried using `__ldg` (i.e., `ld.nc`). Even compared to manually unrolled volatile reads, it was significantly faster (likely due to additional compiler optimizations). However, the results could be incorrect or dirty. After consulting the PTX documentation, we discovered that L1 and non-coherent cache are unified on Hopper architectures. We speculated that `.L1::no_allocate` might resolve the issue, leading to this discovery.
- If you find the kernels not working on some other platform, you may disable this by setting `DISABLE_AGGRESSIVE_PTX_INSTRS=1` when running `setup.py`, or file an issue.
#### Auto-tuning on your cluster
For better performance on your cluster, we recommend running all the tests and using the best auto-tuned configuration. The default configurations are optimized for DeepSeek's internal cluster.
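For instance, a tuned dispatch configuration could be constructed explicitly and passed instead of the default `get_dispatch_config` result (a sketch under assumptions: that the `Config` binding from `csrc/config.hpp` is exposed by the `deep_ep` package and that `Buffer.dispatch` accepts a `config` argument; all numbers are placeholders, not tuned values):
```python
# Hypothetical auto-tuned override; the five positional arguments mirror the constructor in
# csrc/config.hpp: (num_sms, nvl_chunked_send, nvl_chunked_recv, rdma_chunked_send, rdma_chunked_recv).
# That header asserts nvl_send < nvl_recv and rdma_send <= rdma_recv / 2, which these values satisfy.
from deep_ep import Config

tuned_dispatch_config = Config(24, 6, 256, 6, 128)
# e.g. _buffer.dispatch(..., config=tuned_dispatch_config)
```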
## License
This code repository is released under [the MIT License](LICENSE), except for code that references NVSHMEM (including `csrc/kernels/ibgda_device.cuh` and `third-party/nvshmem.patch`), which is subject to the [NVSHMEM SLA](https://docs.nvidia.com/nvshmem/api/sla.html).
## Community Forks
- [Infrawaves/DeepEP_ibrc_dual-ports_multiQP](https://github.com/Infrawaves/DeepEP_ibrc_dual-ports_multiQP) - Adds multi-QP solution and dual-port NIC support in IBRC transport
## Citation
If you use this codebase or otherwise find our work valuable, please cite:
```bibtex
@misc{deepep2025,
title={DeepEP: an efficient expert-parallel communication library},
author={Chenggang Zhao and Shangyan Zhou and Liyue Zhang and Chengqi Deng and Zhean Xu and Yuxuan Liu and Kuai Yu and Jiashi Li and Liang Zhao},
year={2025},
publisher = {GitHub},
howpublished = {\url{https://github.com/deepseek-ai/DeepEP}},
}
```

DeepEP/csrc/CMakeLists.txt (new file)

@@ -0,0 +1,36 @@
# NOTES: this CMake is only for debugging; for setup, please use Torch extension
cmake_minimum_required(VERSION 3.10)
project(deep_ep LANGUAGES CUDA CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC")
set(CUDA_SEPARABLE_COMPILATION ON)
list(APPEND CUDA_NVCC_FLAGS "-DENABLE_FAST_DEBUG")
list(APPEND CUDA_NVCC_FLAGS "-O3")
list(APPEND CUDA_NVCC_FLAGS "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage")
set(USE_SYSTEM_NVTX on)
set(CUDA_ARCH_LIST "9.0" CACHE STRING "List of CUDA architectures to compile")
set(TORCH_CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
find_package(CUDAToolkit REQUIRED)
find_package(pybind11 REQUIRED)
find_package(Torch REQUIRED)
find_package(NVSHMEM REQUIRED HINTS ${NVSHMEM_ROOT_DIR}/lib/cmake/nvshmem)
add_library(nvshmem ALIAS nvshmem::nvshmem)
add_library(nvshmem_host ALIAS nvshmem::nvshmem_host)
add_library(nvshmem_device ALIAS nvshmem::nvshmem_device)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS} ${NVSHMEM_INCLUDE_DIR})
link_directories(${TORCH_INSTALL_PREFIX}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib ${NVSHMEM_LIB_DIR})
add_subdirectory(kernels)
# Link CPP and CUDA together
pybind11_add_module(deep_ep_cpp deep_ep.cpp)
target_link_libraries(deep_ep_cpp PRIVATE ${EP_CUDA_LIBRARIES} ${TORCH_LIBRARIES} torch_python)

DeepEP/csrc/config.hpp (new file)

@@ -0,0 +1,188 @@
#pragma once
#include "kernels/api.cuh"
#include "kernels/exception.cuh"
namespace deep_ep {
template <typename dtype_t>
dtype_t ceil_div(dtype_t a, dtype_t b) {
return (a + b - 1) / b;
}
template <typename dtype_t>
dtype_t align(dtype_t a, dtype_t b) {
return ceil_div<dtype_t>(a, b) * b;
}
struct Config {
int num_sms;
int num_max_nvl_chunked_send_tokens;
int num_max_nvl_chunked_recv_tokens;
int num_max_rdma_chunked_send_tokens;
int num_max_rdma_chunked_recv_tokens;
Config(int num_sms,
int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens) :
num_sms(num_sms),
num_max_nvl_chunked_send_tokens(num_max_nvl_chunked_send_tokens),
num_max_nvl_chunked_recv_tokens(num_max_nvl_chunked_recv_tokens),
num_max_rdma_chunked_send_tokens(num_max_rdma_chunked_send_tokens),
num_max_rdma_chunked_recv_tokens(num_max_rdma_chunked_recv_tokens) {
EP_HOST_ASSERT(num_sms >= 0);
EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens > 0 and num_max_nvl_chunked_recv_tokens > 0);
EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens < num_max_nvl_chunked_recv_tokens);
EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens > 0 and num_max_rdma_chunked_recv_tokens > 0);
// Ceil up RDMA buffer size
this->num_max_rdma_chunked_recv_tokens = align<int>(num_max_rdma_chunked_recv_tokens, num_max_rdma_chunked_send_tokens);
EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens < num_max_rdma_chunked_recv_tokens);
// NOTES: this assertion is related to RDMA lazy head update, we must ensure senders always have space to push
EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens <= num_max_rdma_chunked_recv_tokens / 2);
}
size_t get_nvl_buffer_size_hint(size_t hidden_bytes, int num_ranks) const {
// Below are some assumptions
// TODO: add assertions
constexpr int kNumMaxTopK = 128;
constexpr int kNumMaxScales = 128;
EP_HOST_ASSERT(num_ranks < NUM_MAX_NVL_PEERS or num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(num_ranks <= NUM_MAX_NVL_PEERS or num_sms % 2 == 0);
const auto num_rdma_ranks = std::max(num_ranks / NUM_MAX_NVL_PEERS, 1);
const auto num_nvl_ranks = std::min(num_ranks, NUM_MAX_NVL_PEERS);
const int num_channels = num_sms / 2;
size_t num_bytes = 0;
num_bytes += num_channels * num_nvl_ranks * (2 * num_rdma_ranks + 3) * sizeof(int);
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * hidden_bytes;
#ifndef DISABLE_NVSHMEM
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * internode::get_source_meta_bytes();
#endif
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK * sizeof(int64_t);
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK * sizeof(float);
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxScales * sizeof(float);
num_bytes = ((num_bytes + 127) / 128) * 128;
return num_bytes;
}
size_t get_rdma_buffer_size_hint(int64_t hidden_bytes, int num_ranks) const {
#ifndef DISABLE_NVSHMEM
// Legacy mode
if (num_ranks <= NUM_MAX_NVL_PEERS)
return 0;
// Below are some assumptions
// TODO: add assertions
constexpr int kNumMaxTopK = 128;
constexpr int kNumMaxScales = 128;
EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(num_sms % 2 == 0);
const int num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS;
const int num_channels = num_sms / 2;
size_t num_bytes = 0;
num_bytes += num_channels * num_rdma_ranks * (NUM_MAX_NVL_PEERS * 2 + 2) * 2 * sizeof(int);
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * hidden_bytes * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * internode::get_source_meta_bytes() * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * kNumMaxTopK * sizeof(int64_t) * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * kNumMaxTopK * sizeof(float) * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * kNumMaxScales * sizeof(float) * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * sizeof(int4) * 2;
num_bytes = ((num_bytes + 127) / 128) * 128;
return num_bytes;
#else
EP_HOST_ASSERT(false and "NVSHMEM is disabled during compilation");
#endif
}
};
struct LowLatencyBuffer {
int num_clean_int = 0;
void* dispatch_rdma_send_buffer = nullptr;
void* dispatch_rdma_recv_data_buffer = nullptr;
int* dispatch_rdma_recv_count_buffer = nullptr;
void* combine_rdma_send_buffer = nullptr;
void* combine_rdma_recv_data_buffer = nullptr;
int* combine_rdma_recv_flag_buffer = nullptr;
void* combine_rdma_send_buffer_data_start = nullptr;
size_t num_bytes_per_combine_msg = 0;
std::pair<int*, int> clean_meta() {
EP_HOST_ASSERT(dispatch_rdma_recv_count_buffer == combine_rdma_recv_flag_buffer);
return {dispatch_rdma_recv_count_buffer, num_clean_int};
}
};
struct LowLatencyLayout {
size_t total_bytes = 0;
LowLatencyBuffer buffers[2];
template <typename out_ptr_t = void*, typename count_ptr_t = uint8_t*, typename in_ptr_t = void*>
out_ptr_t advance(const in_ptr_t& ptr, size_t count) {
return reinterpret_cast<out_ptr_t>(reinterpret_cast<count_ptr_t>(ptr) + count);
}
LowLatencyLayout(void* rdma_buffer, int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
const int num_scales = hidden / 128;
// Dispatch and combine layout:
// - 2 symmetric odd/even send buffer
// - 2 symmetric odd/even receive buffers
// - 2 symmetric odd/even signaling buffers
// Message sizes
// NOTES: you should add a control `int4` for combine messages if you want to do data transformation
EP_HOST_ASSERT(num_scales * sizeof(float) <= hidden);
size_t num_bytes_per_dispatch_msg = sizeof(int4) + std::max(hidden * sizeof(nv_bfloat16), hidden + num_scales * sizeof(float));
size_t num_bytes_per_combine_msg = hidden * sizeof(nv_bfloat16);
// Send buffer
size_t dispatch_send_buffer_bytes = num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
size_t combine_send_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
size_t send_buffer_bytes = std::max(dispatch_send_buffer_bytes, combine_send_buffer_bytes);
EP_HOST_ASSERT(send_buffer_bytes % sizeof(int4) == 0);
total_bytes += send_buffer_bytes * 2;
// Symmetric receive buffers
// TODO: optimize memory usages
size_t dispatch_recv_data_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
size_t combine_recv_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
size_t recv_buffer_bytes = std::max(dispatch_recv_data_buffer_bytes, combine_recv_buffer_bytes);
EP_HOST_ASSERT(recv_buffer_bytes % sizeof(int4) == 0);
total_bytes += recv_buffer_bytes * 2;
// Symmetric signaling buffers
size_t dispatch_recv_count_buffer_bytes = num_experts * sizeof(int);
size_t combine_recv_flag_buffer_bytes = dispatch_recv_count_buffer_bytes;
size_t signaling_buffer_bytes = std::max(dispatch_recv_count_buffer_bytes, combine_recv_flag_buffer_bytes);
total_bytes += signaling_buffer_bytes * 2;
// Assign pointers
// NOTES: we still leave some space for distinguishing dispatch/combine buffer,
// so you may see some parameters are duplicated
for (int i = 0; i < 2; ++ i) {
buffers[i] = {
static_cast<int>(signaling_buffer_bytes / sizeof(int)),
advance(rdma_buffer, send_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * i),
advance<int*>(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * 2 + signaling_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * i),
advance<int*>(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * 2 + signaling_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * i),
num_bytes_per_combine_msg
};
}
}
};
size_t get_low_latency_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
auto num_bytes = LowLatencyLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts).total_bytes;
return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) * NUM_BUFFER_ALIGNMENT_BYTES;
}
} // namespace deep_ep

DeepEP/csrc/deep_ep.cpp (new file, 1347 lines)

File diff suppressed because it is too large.

DeepEP/csrc/deep_ep.hpp (new file)

@@ -0,0 +1,157 @@
#pragma once
// Forcibly disable NDEBUG
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <torch/types.h>
#include <tuple>
#include <vector>
#include "config.hpp"
#include "event.hpp"
#include "kernels/configs.cuh"
#include "kernels/exception.cuh"
#ifndef TORCH_EXTENSION_NAME
#define TORCH_EXTENSION_NAME deep_ep_cpp
#endif
namespace deep_ep {
struct Buffer {
EP_STATIC_ASSERT(NUM_MAX_NVL_PEERS == 8, "The number of maximum NVLink peers must be 8");
private:
// Low-latency mode buffer
int low_latency_buffer_idx = 0;
bool low_latency_mode = false;
// NVLink Buffer
int64_t num_nvl_bytes;
void* buffer_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
void** buffer_ptrs_gpu = nullptr;
// NVSHMEM Buffer
int64_t num_rdma_bytes;
void* rdma_buffer_ptr = nullptr;
// Device info and communication
int device_id;
int num_device_sms;
int rank, rdma_rank, nvl_rank;
int num_ranks, num_rdma_ranks, num_nvl_ranks;
cudaIpcMemHandle_t ipc_handles[NUM_MAX_NVL_PEERS];
// Stream for communication
at::cuda::CUDAStream comm_stream;
// After IPC/NVSHMEM synchronization, this flag will be true
bool available = false;
// Barrier signals
int* barrier_signal_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
int** barrier_signal_ptrs_gpu = nullptr;
// Workspace
void* workspace = nullptr;
// Host-side MoE info
volatile int* moe_recv_counter = nullptr;
int* moe_recv_counter_mapped = nullptr;
// Host-side expert-level MoE info
volatile int* moe_recv_expert_counter = nullptr;
int* moe_recv_expert_counter_mapped = nullptr;
// Host-side RDMA-level MoE info
volatile int* moe_recv_rdma_counter = nullptr;
int* moe_recv_rdma_counter_mapped = nullptr;
public:
Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode);
~Buffer() noexcept(false);
bool is_available() const;
bool is_internode_available() const;
int get_num_rdma_ranks() const;
int get_rdma_rank() const;
int get_root_rdma_rank(bool global) const;
int get_local_device_id() const;
pybind11::bytearray get_local_ipc_handle() const;
pybind11::bytearray get_local_nvshmem_unique_id() const;
torch::Tensor get_local_buffer_tensor(const pybind11::object& dtype, int64_t offset, bool use_rdma_buffer) const;
torch::Stream get_comm_stream() const;
void sync(const std::vector<int>& device_ids, const std::vector<std::optional<pybind11::bytearray>>& all_gathered_handles, const std::optional<pybind11::bytearray>& root_unique_id_opt);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
get_dispatch_layout(const torch::Tensor& topk_idx, int num_experts, std::optional<EventHandle>& previous_event,
bool async, bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
intranode_dispatch(const torch::Tensor& x, const std::optional<torch::Tensor>& x_scales,
const std::optional<torch::Tensor>& topk_idx, const std::optional<torch::Tensor>& topk_weights,
const std::optional<torch::Tensor>& num_tokens_per_rank, const torch::Tensor& is_token_in_rank, const std::optional<torch::Tensor>& num_tokens_per_expert,
int cached_num_recv_tokens, const std::optional<torch::Tensor>& cached_rank_prefix_matrix, const std::optional<torch::Tensor>& cached_channel_prefix_matrix,
int expert_alignment, int num_worst_tokens, const Config& config,
std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
intranode_combine(const torch::Tensor& x, const std::optional<torch::Tensor>& topk_weights,
const std::optional<torch::Tensor>& bias_0, const std::optional<torch::Tensor>& bias_1,
const torch::Tensor& src_idx, const torch::Tensor& rank_prefix_matrix, const torch::Tensor& channel_prefix_matrix,
const torch::Tensor& send_head, const Config& config, std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<EventHandle>>
internode_dispatch(const torch::Tensor& x, const std::optional<torch::Tensor>& x_scales,
const std::optional<torch::Tensor>& topk_idx, const std::optional<torch::Tensor>& topk_weights,
const std::optional<torch::Tensor>& num_tokens_per_rank, const std::optional<torch::Tensor>& num_tokens_per_rdma_rank,
const torch::Tensor& is_token_in_rank, const std::optional<torch::Tensor>& num_tokens_per_expert,
int cached_num_recv_tokens, int cached_num_rdma_recv_tokens,
const std::optional<torch::Tensor>& cached_rdma_channel_prefix_matrix, const std::optional<torch::Tensor>& cached_recv_rdma_rank_prefix_sum,
const std::optional<torch::Tensor>& cached_gbl_channel_prefix_matrix, const std::optional<torch::Tensor>& cached_recv_gbl_rank_prefix_sum,
int expert_alignment, const Config& config, std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
internode_combine(const torch::Tensor& x, const std::optional<torch::Tensor>& topk_weights,
const std::optional<torch::Tensor>& bias_0, const std::optional<torch::Tensor>& bias_1,
const torch::Tensor& src_meta, const torch::Tensor& is_combined_token_in_rank,
const torch::Tensor& rdma_channel_prefix_matrix, const torch::Tensor& rdma_rank_prefix_sum, const torch::Tensor& gbl_channel_prefix_matrix,
const torch::Tensor& combined_rdma_head, const torch::Tensor& combined_nvl_head,
const Config& config, std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);
void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx,
const std::optional<torch::Tensor>& cumulative_local_expert_recv_stats,
int num_max_dispatch_tokens_per_rank, int num_experts,
bool use_fp8, bool round_scale, bool use_ue8m0,
bool async, bool return_recv_hook);
std::tuple<torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
low_latency_combine(const torch::Tensor& x, const torch::Tensor& topk_idx, const torch::Tensor& topk_weights,
const torch::Tensor& src_info, const torch::Tensor& layout_range,
int num_max_dispatch_tokens_per_rank, int num_experts,
bool zero_copy, bool async, bool return_recv_hook,
const std::optional<torch::Tensor>& out = std::nullopt);
torch::Tensor
get_next_low_latency_combine_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts) const;
};
} // namespace deep_ep

DeepEP/csrc/event.hpp (new file)

@@ -0,0 +1,43 @@
#include <ATen/cuda/CUDAContext.h>
#include <memory>
#include "kernels/exception.cuh"
namespace deep_ep {
struct EventHandle {
std::shared_ptr<torch::Event> event;
EventHandle() {
event = std::make_shared<torch::Event>(torch::kCUDA);
event->record(at::cuda::getCurrentCUDAStream());
}
explicit EventHandle(const at::cuda::CUDAStream& stream) {
event = std::make_shared<torch::Event>(torch::kCUDA);
event->record(stream);
}
EventHandle(const EventHandle& other) = default;
void current_stream_wait() const {
at::cuda::getCurrentCUDAStream().unwrap().wait(*event);
}
};
torch::Event create_event(const at::cuda::CUDAStream &s) {
auto event = torch::Event(torch::kCUDA);
event.record(s);
return event;
}
void stream_wait(const at::cuda::CUDAStream& s_0, const at::cuda::CUDAStream& s_1) {
EP_HOST_ASSERT(s_0.id() != s_1.id());
s_0.unwrap().wait(create_event(s_1));
}
void stream_wait(const at::cuda::CUDAStream& s, const EventHandle& event) {
s.unwrap().wait(*event.event);
}
} // namespace deep_ep

DeepEP/csrc/kernels/CMakeLists.txt (new file)

@@ -0,0 +1,21 @@
function(add_deep_ep_library target_name source_file)
add_library(${target_name} STATIC ${source_file})
set_target_properties(${target_name} PROPERTIES
POSITION_INDEPENDENT_CODE ON
CXX_STANDARD_REQUIRED ON
CUDA_STANDARD_REQUIRED ON
CXX_STANDARD 17
CUDA_STANDARD 17
CUDA_SEPARABLE_COMPILATION ON
)
target_link_libraries(${target_name} PUBLIC nvshmem cudart cudadevrt mlx5)
endfunction()
add_deep_ep_library(runtime_cuda runtime.cu)
add_deep_ep_library(layout_cuda layout.cu)
add_deep_ep_library(intranode_cuda intranode.cu)
add_deep_ep_library(internode_cuda internode.cu)
add_deep_ep_library(internode_ll_cuda internode_ll.cu)
# Later, we should link all libraries in `EP_CUDA_LIBRARIES`
set(EP_CUDA_LIBRARIES runtime_cuda layout_cuda intranode_cuda internode_cuda internode_ll_cuda PARENT_SCOPE)

DeepEP/csrc/kernels/api.cuh (new file)

@@ -0,0 +1,167 @@
#pragma once
#include <vector>
namespace deep_ep {
// Intranode runtime
namespace intranode {
void barrier(int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream);
} // namespace intranode
// Internode runtime
namespace internode {
std::vector<uint8_t> get_unique_id();
int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks, bool low_latency_mode);
void *alloc(size_t size, size_t alignment);
void free(void *ptr);
void barrier();
void finalize();
} // namespace internode
// Layout kernels
namespace layout {
void get_dispatch_layout(const int64_t* topk_idx,
int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
int* num_tokens_per_expert, bool* is_token_in_rank,
int num_tokens, int num_topk, int num_ranks, int num_experts,
cudaStream_t stream);
} // namespace layout
// Intranode kernels
namespace intranode {
void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
int num_tokens, const bool* is_token_in_rank, int* channel_prefix_matrix,
int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank,
cudaStream_t stream, int num_sms);
void cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank, int num_ranks,
cudaStream_t stream);
void dispatch(void* recv_x, float* recv_x_scales, int* recv_src_idx, int64_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
int* send_head, const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
const bool* is_token_in_rank, const int* channel_prefix_matrix,
int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
int scale_token_stride, int scale_hidden_stride,
void** buffer_ptrs, int rank, int num_ranks,
cudaStream_t stream, int num_sms,
int num_max_send_tokens, int num_recv_buffer_tokens);
void cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels, int num_recv_tokens, int num_memset_int,
int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream);
void combine(cudaDataType_t type,
void* recv_x, float* recv_topk_weights,
const void* x, const float* topk_weights,
const void* bias_0, const void* bias_1,
const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
void** buffer_ptrs, int rank, int num_ranks,
cudaStream_t stream, int num_sms,
int num_max_send_tokens, int num_recv_buffer_tokens);
} // namespace intranode
// Internode kernels
namespace internode {
int get_source_meta_bytes();
void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
const int* num_tokens_per_rdma_rank, int* moe_recv_rdma_counter_mapped,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
const bool* is_token_in_rank, int num_tokens, int num_channels,
int hidden_int4, int num_scales, int num_topk, int expert_alignment,
int* rdma_channel_prefix_matrix, int* recv_rdma_rank_prefix_sum,
int* gbl_channel_prefix_matrix, int* recv_gbl_rank_prefix_sum,
void* rdma_buffer_ptr, int num_max_rdma_chunked_recv_tokens,
void** buffer_ptrs, int num_max_nvl_chunked_recv_tokens,
int** barrier_signal_ptrs, int rank,
cudaStream_t stream, int64_t num_rdma_bytes, int64_t num_nvl_bytes,
bool low_latency_mode);
void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv_topk_weights, void* recv_src_meta,
const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
int* send_rdma_head, int* send_nvl_head,
int* recv_rdma_channel_prefix_matrix, int* recv_gbl_channel_prefix_matrix,
const int* rdma_channel_prefix_matrix, const int* recv_rdma_rank_prefix_sum,
const int* gbl_channel_prefix_matrix, const int* recv_gbl_rank_prefix_sum,
const bool* is_token_in_rank,
int num_tokens, int hidden_int4, int num_scales, int num_topk, int num_experts,
int scale_token_stride, int scale_hidden_stride,
void* rdma_buffer_ptr, int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens,
void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
int rank, int num_ranks, bool is_cached_dispatch,
cudaStream_t stream, int num_channels, bool low_latency_mode);
void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_topk_weights,
int num_ranks, int num_channels, int num_combined_tokens, int* combined_rdma_head,
const int* rdma_channel_prefix_matrix, const int* rdma_rank_prefix_sum, int* combined_nvl_head,
void* rdma_buffer_ptr, int num_max_rdma_chunked_recv_tokens,
void** buffer_ptrs, int num_max_nvl_chunked_recv_tokens,
int** barrier_signal_ptrs, int rank, cudaStream_t stream,
int64_t num_rdma_bytes, int64_t num_nvl_bytes,
bool is_cached_dispatch, bool low_latency_mode);
void combine(cudaDataType_t type,
void* combined_x, float* combined_topk_weights,
const bool* is_combined_token_in_rank,
const void* x, const float* topk_weights,
const void* bias_0, const void* bias_1,
const int* combined_rdma_head, const int* combined_nvl_head,
const void* src_meta, const int* rdma_channel_prefix_matrix, const int* rdma_rank_prefix_sum, const int* gbl_channel_prefix_matrix,
int num_tokens, int num_combined_tokens, int hidden, int num_topk,
void* rdma_buffer_ptr, int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens,
void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
int rank, int num_ranks, cudaStream_t stream, int num_channels, bool low_latency_mode);
} // namespace internode
// Internode low-latency kernels
namespace internode_ll {
void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
int* clean_1, int num_clean_int_1,
cudaStream_t stream);
void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
int* packed_recv_src_info, int64_t* packed_recv_layout_range,
int* packed_recv_count,
int* cumulative_local_expert_recv_stats,
void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
const void* x, const int64_t* topk_idx,
int* next_clean, int num_next_clean_int,
int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
bool use_fp8, bool round_scale, bool use_ue8m0,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases);
void combine(void* combined_x,
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
const void* x, const int64_t* topk_idx, const float* topk_weights,
const int* src_info, const int64_t* layout_range,
int* next_clean, int num_next_clean_int,
int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases, bool zero_copy);
} // namespace internode_ll
} // namespace deep_ep

DeepEP/csrc/kernels/buffer.cuh (new file)

@@ -0,0 +1,138 @@
#pragma once
#include "configs.cuh"
#include "exception.cuh"
namespace deep_ep {
template <typename dtype_t>
struct Buffer {
private:
uint8_t* ptr;
public:
int total_bytes;
__device__ __forceinline__ Buffer() : ptr(nullptr), total_bytes(0) {}
__device__ __forceinline__ Buffer(void* &gbl_ptr, int num_elems, int offset = 0) {
total_bytes = num_elems * sizeof(dtype_t);
ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + offset * sizeof(dtype_t);
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
}
__device__ __forceinline__ Buffer advance_also(void* &gbl_ptr) {
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
return *this;
}
__device__ __forceinline__ dtype_t* buffer() {
return reinterpret_cast<dtype_t*>(ptr);
}
__device__ __forceinline__ dtype_t& operator[](int idx) {
return buffer()[idx];
}
};
template <typename dtype_t, int kNumRanks = 1>
struct AsymBuffer {
private:
uint8_t* ptrs[kNumRanks];
int num_bytes;
public:
int total_bytes;
__device__ __forceinline__ AsymBuffer(void* &gbl_ptr, int num_elems, int num_ranks,
int sm_id = 0, int num_sms = 1, int offset = 0) {
EP_STATIC_ASSERT(kNumRanks == 1, "");
num_bytes = num_elems * sizeof(dtype_t);
int per_channel_bytes = num_bytes * num_ranks;
total_bytes = per_channel_bytes * num_sms;
ptrs[0] = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id + num_bytes * offset;
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
}
__device__ __forceinline__ AsymBuffer(void** gbl_ptrs, int num_elems, int num_ranks,
int sm_id = 0, int num_sms = 1, int offset = 0) {
EP_STATIC_ASSERT(kNumRanks > 1, "");
num_bytes = num_elems * sizeof(dtype_t);
int per_channel_bytes = num_bytes * num_ranks;
total_bytes = per_channel_bytes * num_sms;
for (int i = 0; i < kNumRanks; ++ i) {
ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + per_channel_bytes * sm_id + num_bytes * offset;
gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
}
}
__device__ __forceinline__ void advance(int shift) {
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i)
ptrs[i] = ptrs[i] + shift * sizeof(dtype_t);
}
__device__ __forceinline__ AsymBuffer advance_also(void* &gbl_ptr) {
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
return *this;
}
template<int kNumAlsoRanks>
__device__ __forceinline__ AsymBuffer advance_also(void** gbl_ptrs) {
for (int i = 0; i < kNumAlsoRanks; ++ i)
gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
return *this;
}
__device__ __forceinline__ dtype_t* buffer(int idx = 0) {
EP_STATIC_ASSERT(kNumRanks == 1, "`buffer` is only available for single rank case");
return reinterpret_cast<dtype_t*>(ptrs[0] + num_bytes * idx);
}
__device__ __forceinline__ dtype_t* buffer_by(int rank_idx, int idx = 0) {
EP_STATIC_ASSERT(kNumRanks > 1, "`buffer_by` is only available for the multi-rank case");
return reinterpret_cast<dtype_t*>(ptrs[rank_idx] + num_bytes * idx);
}
};
template <typename dtype_t, bool kDecoupled = true>
struct SymBuffer {
private:
// NOTES: for non-decoupled case, `recv_ptr` is not used
uint8_t* send_ptr;
uint8_t* recv_ptr;
int num_bytes;
public:
int total_bytes;
__device__ __forceinline__ SymBuffer(void* &gbl_ptr, int num_elems, int num_ranks,
int sm_id = 0, int num_sms = 1) {
num_bytes = num_elems * sizeof(dtype_t);
int per_channel_bytes = num_bytes * num_ranks;
total_bytes = per_channel_bytes * num_sms * (static_cast<int>(kDecoupled) + 1);
send_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id;
recv_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * (sm_id + num_sms);
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
}
__device__ __forceinline__ dtype_t* send_buffer(int idx = 0) {
EP_STATIC_ASSERT(kDecoupled, "`send_buffer` is only available for the decoupled case");
return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
}
__device__ __forceinline__ dtype_t* recv_buffer(int idx = 0) {
EP_STATIC_ASSERT(kDecoupled, "`recv_buffer` is only available for the decoupled case");
return reinterpret_cast<dtype_t*>(recv_ptr + num_bytes * idx);
}
__device__ __forceinline__ dtype_t* buffer(int idx = 0) {
EP_STATIC_ASSERT(not kDecoupled, "`buffer` is only available for the non-decoupled case");
return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
}
};
} // namespace deep_ep

DeepEP/csrc/kernels/configs.cuh (new file)

@@ -0,0 +1,67 @@
#pragma once
#define NUM_MAX_NVL_PEERS 8
#define NUM_MAX_RDMA_PEERS 20
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)
#define NUM_MAX_LOCAL_EXPERTS 1024
#define NUM_BUFFER_ALIGNMENT_BYTES 128
#define FINISHED_SUM_TAG 1024
#define NUM_WAIT_NANOSECONDS 500
#ifndef ENABLE_FAST_DEBUG
#define NUM_CPU_TIMEOUT_SECS 100
#define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s
#else
#define NUM_CPU_TIMEOUT_SECS 10
#define NUM_TIMEOUT_CYCLES 20000000000ull // 20G cycles ~= 10s
#endif
#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2
// Make CLion CUDA indexing work
#ifdef __CLION_IDE__
#define __CUDA_ARCH__ 900 // NOLINT(*-reserved-identifier)
#define __CUDACC_RDC__ // NOLINT(*-reserved-identifier)
#endif
// Remove Torch restrictions
#ifdef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_CONVERSIONS__
#endif
#ifdef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF_OPERATORS__
#endif
#ifdef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#endif
#ifdef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#endif
#ifdef __CUDA_NO_BFLOAT162_OPERATORS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__
#endif
#include <cstdint>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#ifndef DISABLE_SM90_FEATURES
#include <cuda_fp8.h>
#else
// Ampere does not support FP8 features
#define __NV_E4M3 0
#define __NV_E5M2 1
typedef int __nv_fp8_interpretation_t;
typedef int __nv_fp8x4_e4m3;
typedef uint8_t __nv_fp8_storage_t;
#endif
#ifndef DISABLE_NVSHMEM
#include <nvshmem.h>
#include <nvshmemx.h>
#include <infiniband/mlx5dv.h>
#include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
#include <device_host_transport/nvshmem_common_ibgda.h>
#endif

DeepEP/csrc/kernels/exception.cuh (new file)

@@ -0,0 +1,51 @@
#pragma once
#include <string>
#include <exception>
#include "configs.cuh"
#ifndef EP_STATIC_ASSERT
#define EP_STATIC_ASSERT(cond, reason) static_assert(cond, reason)
#endif
class EPException: public std::exception {
private:
std::string message = {};
public:
explicit EPException(const char *name, const char* file, const int line, const std::string& error) {
message = std::string("Failed: ") + name + " error " + file + ":" + std::to_string(line) + " '" + error + "'";
}
const char *what() const noexcept override { return message.c_str(); }
};
#ifndef CUDA_CHECK
#define CUDA_CHECK(cmd) \
do { \
cudaError_t e = (cmd); \
if (e != cudaSuccess) { \
throw EPException("CUDA", __FILE__, __LINE__, cudaGetErrorString(e)); \
} \
} while (0)
#endif
#ifndef EP_HOST_ASSERT
#define EP_HOST_ASSERT(cond) \
do { \
if (not (cond)) { \
throw EPException("Assertion", __FILE__, __LINE__, #cond); \
} \
} while (0)
#endif
#ifndef EP_DEVICE_ASSERT
#define EP_DEVICE_ASSERT(cond) \
do { \
if (not (cond)) { \
printf("Assertion failed: %s:%d, condition: %s\n", __FILE__, __LINE__, #cond); \
asm("trap;"); \
} \
} while (0)
#endif

DeepEP/csrc/kernels/ibgda_device.cuh (new file)

@@ -0,0 +1,482 @@
// Portions derived from NVSHMEM (https://developer.nvidia.com/nvshmem)
// Copyright (c) NVIDIA Corporation.
// Licensed under the NVSHMEM Software License Agreement (version: September 3, 2019).
// See full license at: https://docs.nvidia.com/nvshmem/api/sla.html
//
// Modified from original source:
// - nvshmem/src/include/non_abi/device/pt-to-pt/ibgda_device.cuh
#pragma once
#include "configs.cuh"
#include "exception.cuh"
#include "utils.cuh"
namespace deep_ep {
EP_STATIC_ASSERT(NVSHMEMI_IBGDA_MIN_QP_DEPTH >= 64, "Invalid QP minimum depth");
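// NOTES: the byte-swap helpers below convert host (little-endian) values to big-endian with
// the PTX `prmt` (byte-permute) instruction, since MLX5 WQE fields are big-endian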
__device__ static __forceinline__
uint64_t HtoBE64(uint64_t x) {
uint64_t ret;
asm("{\n\t"
".reg .b32 ign;\n\t"
".reg .b32 lo;\n\t"
".reg .b32 hi;\n\t"
".reg .b32 new_lo;\n\t"
".reg .b32 new_hi;\n\t"
"mov.b64 {lo,hi}, %1;\n\t"
"prmt.b32 new_hi, lo, ign, 0x0123;\n\t"
"prmt.b32 new_lo, hi, ign, 0x0123;\n\t"
"mov.b64 %0, {new_lo,new_hi};\n\t"
"}" : "=l"(ret) : "l"(x));
return ret;
}
__device__ static __forceinline__
uint32_t HtoBE32(uint32_t x) {
uint32_t ret;
asm("{\n\t"
".reg .b32 ign;\n\t"
"prmt.b32 %0, %1, ign, 0x0123;\n\t"
"}" : "=r"(ret) : "r"(x));
return ret;
}
__device__ static __forceinline__
uint16_t HtoBE16(uint16_t x) {
// TODO: simplify PTX using 16-bit instructions
auto a = static_cast<uint32_t>(x);
uint32_t d;
asm volatile(
"{\n\t"
".reg .b32 mask;\n\t"
".reg .b32 ign;\n\t"
"mov.b32 mask, 0x4401;\n\t"
"mov.b32 ign, 0x0;\n\t"
"prmt.b32 %0, %1, ign, mask;\n\t"
"}"
: "=r"(d)
: "r"(a));
return static_cast<uint16_t>(d);
}
typedef struct mlx5_wqe_ctrl_seg __attribute__((__aligned__(8))) ibgda_ctrl_seg_t;
typedef struct {
uint32_t add_data;
uint32_t field_boundary;
uint64_t reserved;
} __attribute__((__packed__)) ibgda_atomic_32_masked_fa_seg_t;
__device__ static __forceinline__
nvshmemi_ibgda_device_state_t* ibgda_get_state() {
return &nvshmemi_ibgda_device_state_d;
}
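// NOTES: pick one of the peer's RC (reliable connection) QPs; `id` is wrapped round-robin over `num_rc_per_pe`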
__device__ static __forceinline__
nvshmemi_ibgda_device_qp_t* ibgda_get_rc(int pe, int id) {
auto state = ibgda_get_state();
const auto num_rc_per_pe = ibgda_get_state()->num_rc_per_pe;
return &state->globalmem.rcs[pe * num_rc_per_pe + id % num_rc_per_pe];
}
__device__ static __forceinline__
void ibgda_lock_acquire(int *lock) {
while (atomicCAS(lock, 0, 1) == 1);
// Prevent reordering before the lock is acquired
memory_fence_cta();
}
__device__ static __forceinline__
void ibgda_lock_release(int *lock) {
memory_fence_cta();
// Prevent reordering before lock is released
st_na_relaxed(lock, 0);
}
__device__ static __forceinline__
void ibgda_update_dbr(nvshmemi_ibgda_device_qp_t *qp, uint32_t dbrec_head) {
// `DBREC` contains the index of the next empty `WQEBB`
__be32 dbrec_val;
__be32 *dbrec_ptr = qp->tx_wq.dbrec;
// This is equivalent to `WRITE_ONCE(dbrec_ptr, HtoBE32(dbrec_head & 0xffff))`
asm("{\n\t"
".reg .b32 dbrec_head_16b;\n\t"
".reg .b32 ign;\n\t"
"and.b32 dbrec_head_16b, %1, 0xffff;\n\t"
"prmt.b32 %0, dbrec_head_16b, ign, 0x123;\n\t"
"}"
: "=r"(dbrec_val)
: "r"(dbrec_head));
st_na_release(dbrec_ptr, dbrec_val);
}
__device__ static __forceinline__
void ibgda_ring_db(nvshmemi_ibgda_device_qp_t *qp, uint16_t prod_idx) {
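// Ring the doorbell by writing the first 8 bytes of the control segment (`opmod_idx_opcode` + `qpn_ds`) to the BlueFlame register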
auto bf_ptr = reinterpret_cast<uint64_t*>(qp->tx_wq.bf);
ibgda_ctrl_seg_t ctrl_seg = {
.opmod_idx_opcode = HtoBE32(prod_idx << 8),
.qpn_ds = HtoBE32(qp->qpn << 8)
};
EP_STATIC_ASSERT(sizeof(decltype(&ctrl_seg)) == sizeof(uint64_t), "");
st_na_release(bf_ptr, *(reinterpret_cast<uint64_t*>(&ctrl_seg)));
}
__device__ static __forceinline__
void ibgda_post_send(nvshmemi_ibgda_device_qp_t *qp, uint64_t new_prod_idx) {
nvshmemi_ibgda_device_qp_management_t *mvars = &qp->mvars;
uint64_t old_prod_idx;
// Update `prod_idx` before ringing the doorbell, so that we know which index is needed in quiet/fence
ibgda_lock_acquire(&mvars->post_send_lock);
old_prod_idx = atomicMax(reinterpret_cast<unsigned long long int*>(&mvars->tx_wq.prod_idx), new_prod_idx);
if (new_prod_idx > old_prod_idx) {
ibgda_update_dbr(qp, new_prod_idx);
ibgda_ring_db(qp, new_prod_idx);
}
ibgda_lock_release(&mvars->post_send_lock);
}
template <bool kAlwaysDoPostSend>
__device__ static __forceinline__
void ibgda_submit_requests(nvshmemi_ibgda_device_qp_t *qp, uint64_t base_wqe_idx,
uint32_t num_wqes, int message_idx = 0) {
nvshmemi_ibgda_device_qp_management_t *mvars = &qp->mvars;
uint64_t new_wqe_idx = base_wqe_idx + num_wqes;
// WQE writes must be finished first
__threadfence();
// Wait for prior WQE slots to be filled first
auto *ready_idx = reinterpret_cast<unsigned long long int*>(&mvars->tx_wq.ready_head);
while (atomicCAS(ready_idx, base_wqe_idx, new_wqe_idx) != base_wqe_idx);
// Post immediately when forced; otherwise ring the doorbell only once every `kNumRequestInBatch` requests
constexpr int kNumRequestInBatch = 4;
if (kAlwaysDoPostSend or (message_idx + 1) % kNumRequestInBatch == 0)
ibgda_post_send(qp, new_wqe_idx);
}
__device__ static __forceinline__ void
ibgda_write_rdma_write_inl_wqe(nvshmemi_ibgda_device_qp_t *qp, const uint32_t *val, uint64_t raddr,
__be32 rkey, uint16_t wqe_idx, void** out_wqes, uint32_t imm) {
ibgda_ctrl_seg_t ctrl_seg;
struct mlx5_wqe_raddr_seg raddr_seg;
struct mlx5_wqe_inl_data_seg inl_seg;
auto *ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
auto *raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
auto *inl_seg_ptr = reinterpret_cast<mlx5_wqe_inl_data_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
auto *wqe_data_ptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(inl_seg_ptr) + sizeof(*inl_seg_ptr));
raddr_seg.raddr = HtoBE64(raddr);
raddr_seg.rkey = rkey;
raddr_seg.reserved = 0;
inl_seg.byte_count = HtoBE32(4 | MLX5_INLINE_SEG);
// `imm == std::numeric_limits<uint32_t>::max()` means no imm writes
ctrl_seg = {0};
ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 3);
ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
ctrl_seg.opmod_idx_opcode = HtoBE32((wqe_idx << 8) | (imm != std::numeric_limits<uint32_t>::max() ? MLX5_OPCODE_RDMA_WRITE_IMM : MLX5_OPCODE_RDMA_WRITE));
if (imm != std::numeric_limits<uint32_t>::max())
ctrl_seg.imm = HtoBE32(imm);
EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == 16, "sizeof(*ctrl_seg_ptr) == 16");
EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == 16, "sizeof(*raddr_seg_ptr) == 16");
EP_STATIC_ASSERT(sizeof(*inl_seg_ptr) == 4, "sizeof(*inl_seg_ptr) == 4");
st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<const int4*>(&ctrl_seg));
st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<const int4*>(&raddr_seg));
st_na_relaxed(reinterpret_cast<uint32_t*>(inl_seg_ptr), *reinterpret_cast<const uint32_t*>(&inl_seg));
st_na_relaxed(reinterpret_cast<uint32_t*>(wqe_data_ptr), *reinterpret_cast<const uint32_t*>(val));
}
__device__ static __forceinline__
uint64_t ibgda_get_lkey_and_rkey(uint64_t laddr, __be32 *lkey,
uint64_t raddr, int dst_pe, uint64_t *out_raddr, __be32 *out_rkey) {
auto state = ibgda_get_state();
auto heap_start = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.heap_base);
auto log2_cumem_granularity = state->log2_cumem_granularity;
// Local key
uint64_t idx = (laddr - heap_start) >> log2_cumem_granularity;
auto device_key = state->constmem.lkeys[idx];
auto lchunk_size = device_key.next_addr - laddr;
*lkey = device_key.key;
// Remote key
uint64_t roffset = raddr - heap_start;
idx = ((roffset >> log2_cumem_granularity) * nvshmemi_device_state_d.npes) + dst_pe;
if (idx < NVSHMEMI_IBGDA_MAX_CONST_RKEYS) {
device_key = state->constmem.rkeys[idx];
} else {
device_key = state->globalmem.rkeys[idx - NVSHMEMI_IBGDA_MAX_CONST_RKEYS];
}
*out_raddr = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.peer_heap_base_remote[dst_pe]) + roffset;
*out_rkey = device_key.key;
// Return the minimum of local and remote chunk sizes
auto rchunk_size = device_key.next_addr - roffset;
return min(lchunk_size, rchunk_size);
}
__device__ static __forceinline__ void
ibgda_get_rkey(uint64_t addr, int dst_pe, uint64_t *out_raddr, __be32 *out_rkey) {
auto state = ibgda_get_state();
auto heap_start = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.heap_base);
uint64_t roffset = addr - heap_start;
uint64_t idx = ((roffset >> state->log2_cumem_granularity) * nvshmemi_device_state_d.npes) + dst_pe;
nvshmemi_ibgda_device_key_t device_key;
if (idx < NVSHMEMI_IBGDA_MAX_CONST_RKEYS)
device_key = state->constmem.rkeys[idx];
else
device_key = state->globalmem.rkeys[idx - NVSHMEMI_IBGDA_MAX_CONST_RKEYS];
*out_raddr = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.peer_heap_base_remote[dst_pe]) + roffset;
*out_rkey = device_key.key;
}
__device__ static __forceinline__ uint64_t
ibgda_reserve_wqe_slots(nvshmemi_ibgda_device_qp_t *qp, uint32_t num_wqes) {
auto mvars = &qp->mvars;
return atomicAdd(reinterpret_cast<unsigned long long*>(&mvars->tx_wq.resv_head), static_cast<unsigned long long>(num_wqes));
}
__device__ static __forceinline__ void*
ibgda_get_wqe_ptr(nvshmemi_ibgda_device_qp_t* qp, uint16_t wqe_idx) {
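// NOTES: `nwqes` is assumed to be a power of two, so the mask below wraps the WQE index around the ring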
uint16_t cnt = qp->tx_wq.nwqes;
uint16_t idx = wqe_idx & (cnt - 1);
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(qp->tx_wq.wqe) + (idx << MLX5_SEND_WQE_SHIFT));
}
__device__ static __forceinline__ void
nvshmemi_ibgda_rma_p(int *rptr, const int value, int dst_pe, int qp_id, uint32_t imm = std::numeric_limits<uint32_t>::max()) {
// Get rkey
// NOTES: the `p` operation will not cross multiple remote chunks
__be32 rkey;
uint64_t raddr;
ibgda_get_rkey(reinterpret_cast<uint64_t>(rptr), dst_pe, &raddr, &rkey);
// Write WQEs
auto qp = ibgda_get_rc(dst_pe, qp_id);
uint64_t base_wqe_idx = ibgda_reserve_wqe_slots(qp, 1);
void *wqe_ptrs;
wqe_ptrs = ibgda_get_wqe_ptr(qp, base_wqe_idx);
ibgda_write_rdma_write_inl_wqe(qp, reinterpret_cast<const uint32_t*>(&value), raddr, rkey, base_wqe_idx, &wqe_ptrs, imm);
// Submit requests
ibgda_submit_requests<true>(qp, base_wqe_idx, 1);
}
__device__ static __forceinline__ void
ibgda_write_rdma_write_wqe(nvshmemi_ibgda_device_qp_t *qp, uint64_t laddr, __be32 lkey,
uint64_t raddr, __be32 rkey, uint32_t bytes, uint16_t wqe_idx,
void** out_wqes) {
ibgda_ctrl_seg_t ctrl_seg;
struct mlx5_wqe_raddr_seg raddr_seg;
struct mlx5_wqe_data_seg data_seg;
auto *ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
void *av_seg_ptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
struct mlx5_wqe_raddr_seg *raddr_seg_ptr;
struct mlx5_wqe_data_seg *data_seg_ptr;
raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(av_seg_ptr));
data_seg_ptr = reinterpret_cast<mlx5_wqe_data_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
raddr_seg.raddr = HtoBE64(raddr);
raddr_seg.rkey = rkey;
raddr_seg.reserved = 0;
data_seg.byte_count = HtoBE32(bytes);
data_seg.lkey = lkey;
data_seg.addr = HtoBE64(laddr);
ctrl_seg = {0};
ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 3);
ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
ctrl_seg.opmod_idx_opcode = HtoBE32((wqe_idx << 8) | MLX5_OPCODE_RDMA_WRITE);
EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == 16, "sizeof(*ctrl_seg_ptr) == 16");
EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == 16, "sizeof(*raddr_seg_ptr) == 16");
EP_STATIC_ASSERT(sizeof(*data_seg_ptr) == 16, "sizeof(*data_seg_ptr) == 16");
st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<const int4*>(&ctrl_seg));
st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<const int4*>(&raddr_seg));
st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<const int4*>(&data_seg));
}
__device__ static __forceinline__ void
ibgda_write_empty_recv_wqe(void *out_wqe) {
auto *data_seg_ptr = reinterpret_cast<struct mlx5_wqe_data_seg*>(out_wqe);
struct mlx5_wqe_data_seg data_seg;
// Make the first segment in the WQE invalid, so the entire list is treated as invalid
data_seg.byte_count = 0;
data_seg.lkey = HtoBE32(MLX5_INVALID_LKEY);
data_seg.addr = 0;
EP_STATIC_ASSERT(sizeof(mlx5_wqe_data_seg) == sizeof(int4), "Invalid data type length");
st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<const int4*>(&data_seg));
}
template <bool kAlwaysDoPostSend = false>
__device__ static __forceinline__ void
nvshmemi_ibgda_put_nbi_warp(uint64_t req_rptr, uint64_t req_lptr, size_t bytes, int dst_pe, int qp_id, int lane_id, int message_idx) {
// Get lkey and rkey, store them into lanes
uint32_t num_wqes = 0;
__be32 my_lkey = 0;
uint64_t my_laddr = 0;
__be32 my_rkey = 0;
uint64_t my_raddr = 0;
uint64_t my_chunk_size = 0;
// Decide how many messages are needed (at most 3 in theory)
auto remaining_bytes = bytes;
while (remaining_bytes > 0) {
if (lane_id == num_wqes)
my_chunk_size = min(remaining_bytes, ibgda_get_lkey_and_rkey(my_laddr = req_lptr, &my_lkey, req_rptr, dst_pe, &my_raddr, &my_rkey));
// Move one more message
auto chunk_size = __shfl_sync(0xffffffff, my_chunk_size, static_cast<int>(num_wqes));
remaining_bytes -= chunk_size;
req_lptr += chunk_size;
req_rptr += chunk_size;
++ num_wqes;
}
EP_DEVICE_ASSERT(num_wqes <= 32);
// Process WQE
auto qp = ibgda_get_rc(dst_pe, qp_id);
uint64_t base_wqe_idx = 0;
if (lane_id == 0)
base_wqe_idx = ibgda_reserve_wqe_slots(qp, num_wqes);
base_wqe_idx = __shfl_sync(0xffffffff, base_wqe_idx, 0);
if (lane_id < num_wqes) {
auto wqe_ptr = ibgda_get_wqe_ptr(qp, base_wqe_idx + lane_id);
ibgda_write_rdma_write_wqe(qp, my_laddr, my_lkey, my_raddr, my_rkey, my_chunk_size,
base_wqe_idx, &wqe_ptr);
}
__syncwarp();
// Submit
if (lane_id == 0)
ibgda_submit_requests<kAlwaysDoPostSend>(qp, base_wqe_idx, num_wqes, message_idx);
__syncwarp();
}
__device__ static __forceinline__ void ibgda_write_amo_add_wqe(
nvshmemi_ibgda_device_qp_t *qp, const int &value,
uint64_t laddr, __be32 lkey, uint64_t raddr, __be32 rkey,
uint16_t wqe_idx, void** out_wqes) {
ibgda_ctrl_seg_t ctrl_seg = {0};
struct mlx5_wqe_raddr_seg raddr_seg;
struct mlx5_wqe_atomic_seg atomic_seg_1;
struct mlx5_wqe_data_seg data_seg;
auto ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
auto raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
auto atomic_seg_ptr = reinterpret_cast<mlx5_wqe_atomic_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
auto data_seg_ptr = reinterpret_cast<mlx5_wqe_data_seg*>(reinterpret_cast<uintptr_t>(atomic_seg_ptr) + sizeof(*atomic_seg_ptr));
raddr_seg.raddr = HtoBE64(raddr);
raddr_seg.rkey = rkey;
raddr_seg.reserved = 0;
// NOTES: `0x08000000` means `IBGDA_4_BYTE_EXT_AMO_OPMOD`
ctrl_seg.opmod_idx_opcode = HtoBE32(MLX5_OPCODE_ATOMIC_MASKED_FA | (wqe_idx << 8) | 0x08000000);
auto atomic_32_masked_fa_seg = reinterpret_cast<ibgda_atomic_32_masked_fa_seg_t*>(&atomic_seg_1);
atomic_32_masked_fa_seg->add_data = HtoBE32(value);
atomic_32_masked_fa_seg->field_boundary = 0;
ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 4);
ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
data_seg.byte_count = HtoBE32(sizeof(int));
data_seg.lkey = lkey;
data_seg.addr = HtoBE64(laddr);
EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == sizeof(int4), "Invalid vectorization");
EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == sizeof(int4), "Invalid vectorization");
EP_STATIC_ASSERT(sizeof(*atomic_seg_ptr) == sizeof(int4), "Invalid vectorization");
EP_STATIC_ASSERT(sizeof(*data_seg_ptr) == sizeof(int4), "Invalid vectorization");
st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<int4*>(&ctrl_seg));
st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<int4*>(&raddr_seg));
st_na_relaxed(reinterpret_cast<int4*>(atomic_seg_ptr), *reinterpret_cast<int4*>(&atomic_seg_1));
st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<int4*>(&data_seg));
}
__device__ __forceinline__ void nvshmemi_ibgda_amo_nonfetch_add(void *rptr, const int& value, int pe, int qp_id, bool is_local_copy = false) {
if (is_local_copy) {
atomicAdd(static_cast<unsigned long long*>(rptr), value);
} else {
nvshmemi_ibgda_device_qp_t *qp = ibgda_get_rc(pe, qp_id);
__be32 rkey;
uint64_t raddr;
ibgda_get_rkey(reinterpret_cast<uint64_t>(rptr), pe, &raddr, &rkey);
uint64_t my_wqe_idx = ibgda_reserve_wqe_slots(qp, 1);
void *wqe_ptrs = ibgda_get_wqe_ptr(qp, my_wqe_idx);
ibgda_write_amo_add_wqe(qp, value, reinterpret_cast<uint64_t>(qp->ibuf.buf),
qp->ibuf.lkey, raddr, rkey, my_wqe_idx, &wqe_ptrs);
ibgda_submit_requests<true>(qp, my_wqe_idx, 1);
}
}
__device__ __forceinline__ uint64_t nvshmemi_get_p2p_ptr(const uint64_t& ptr, const int& rank, const int& dst_rank) {
// Local rank, no need for mapping
if (rank == dst_rank)
return ptr;
auto peer_base = __ldg(reinterpret_cast<uint64_t*>(nvshmemi_device_state_d.peer_heap_base_p2p) + dst_rank);
// RDMA connected
if (peer_base == 0)
return 0;
// NVLink P2P is enabled
return peer_base + (ptr - reinterpret_cast<uint64_t>(nvshmemi_device_state_d.heap_base));
}
// This is a simplified version of NVSHMEM's `ibgda_poll_cq`.
// Note that this implementation does not guarantee thread safety,
// so we must ensure that no other threads are concurrently using the same QP.
__device__ static __forceinline__ void
ibgda_poll_cq(nvshmemi_ibgda_device_cq_t *cq, uint64_t idx) {
const auto cqe64 = static_cast<mlx5_cqe64*>(cq->cqe);
const uint32_t ncqes = cq->ncqes;
memory_fence_cta();
// NOTES: this while loop is part of do-while below.
// `wqe_counter` is the HW consumer index. However, we always maintain `index + 1`.
// To be able to compare with the index, we need to use `wqe_counter + 1`.
// Because `wqe_counter` is a `uint16_t`, it may overflow. Still, we know for
// sure that if `idx - wqe_counter - 1 < ncqes`, then `wqe_counter + 1` is less
// than `idx`, and thus we need to wait. We don't need to wait when
// `idx == wqe_counter + 1`; that's why we use `- 2` here to make this case overflow.
uint16_t wqe_counter;
do {
wqe_counter = HtoBE16(ld_na_relaxed(&cqe64->wqe_counter));
} while ((static_cast<uint16_t>(static_cast<uint16_t>(idx) - wqe_counter - static_cast<uint16_t>(2)) < ncqes));
*cq->cons_idx = idx;
// Prevent reordering of this function and later instructions
memory_fence_cta();
}
// Wait until wqe `idx - 1` is completed.
__device__ static __forceinline__ void
nvshmemi_ibgda_quiet(int dst_pe, int qp_id) {
auto qp = ibgda_get_rc(dst_pe, qp_id);
uint64_t prod_idx = ld_na_relaxed(qp->tx_wq.prod_idx);
ibgda_poll_cq(qp->tx_wq.cq, prod_idx);
}
} // namespace deep_ep

File diff suppressed because it is too large

View File

@ -0,0 +1,584 @@
#include "configs.cuh"
#include "exception.cuh"
#include "launch.cuh"
#include "ibgda_device.cuh"
namespace deep_ep {
namespace internode_ll {
template <int kNumThreads> __launch_bounds__(kNumThreads, 1)
__global__ void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
int* clean_1, int num_clean_int_1) {
// Barrier before cleaning (in case of unfinished chunked EP)
nvshmemx_barrier_all_block();
// Clean
auto thread_id = static_cast<int>(threadIdx.x);
#pragma unroll
for (int i = thread_id; i < num_clean_int_0; i += kNumThreads)
clean_0[i] = 0;
#pragma unroll
for (int i = thread_id; i < num_clean_int_1; i += kNumThreads)
clean_1[i] = 0;
// Barrier after cleaning (make sure the low-latency mode works fine)
nvshmemx_barrier_all_block();
}
void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
int* clean_1, int num_clean_int_1,
cudaStream_t stream) {
constexpr int kNumThreads = 256;
SETUP_LAUNCH_CONFIG(1, kNumThreads, stream);
LAUNCH_KERNEL(&cfg, clean_low_latency_buffer<kNumThreads>,
clean_0, num_clean_int_0, clean_1, num_clean_int_1);
}
template <bool kUseFP8, bool kUseUE8M0, int kHidden>
__global__ __launch_bounds__(1024, 1) void
dispatch(void* packed_recv_x, void* packed_recv_x_scales,
int* packed_recv_src_info, int64_t* packed_recv_layout_range,
int* packed_recv_count,
int* cumulative_local_expert_recv_stats,
void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
const void* x, const int64_t* topk_idx,
int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert,
int* next_clean, int num_next_clean_int,
int num_tokens, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
int num_warp_groups, int num_warps_per_group,
bool round_scale, int phases) {
const auto sm_id = static_cast<int>(blockIdx.x);
const auto thread_id = static_cast<int>(threadIdx.x);
const auto warp_id = thread_id / 32, lane_id = get_lane_id();
const auto num_sms = static_cast<int>(gridDim.x);
const auto num_warps = num_warp_groups * num_warps_per_group;
const auto num_local_experts = num_experts / num_ranks;
const auto warp_group_id = warp_id / num_warps_per_group;
const auto sub_warp_id = warp_id % num_warps_per_group;
const auto responsible_expert_idx = sm_id * num_warp_groups + warp_group_id;
// May extract UE8M0 from the scales
using scale_t = std::conditional_t<kUseUE8M0, uint8_t, float>;
using packed_t = std::conditional_t<kUseUE8M0, uint32_t, float>;
EP_STATIC_ASSERT(sizeof(packed_t) % sizeof(scale_t) == 0, "Invalid vector length");
// FP8 stuff
constexpr int kNumPerChannels = 128;
const int num_scales = kHidden / kNumPerChannels;
const size_t hidden_bytes = kHidden * (kUseFP8 ? sizeof(__nv_fp8_storage_t) : sizeof(nv_bfloat16));
const size_t hidden_int4 = hidden_bytes / sizeof(int4);
// Message package: hidden data, FP8 scales, index at source
// NOTES: currently we have 3 reserved int fields for future use
using vec_t = typename std::conditional<kUseFP8, int2, int4>::type;
const size_t num_bytes_per_msg = sizeof(int4) + (kUseFP8 ? (kHidden + num_scales * sizeof(float)) : (kHidden * sizeof(nv_bfloat16)));
const size_t num_int4_per_msg = num_bytes_per_msg / sizeof(int4);
EP_DEVICE_ASSERT(num_bytes_per_msg % sizeof(int4) == 0);
// Expert counts
constexpr int kNumMaxWarpGroups = 32;
__shared__ int shared_num_tokens_sent_per_expert[kNumMaxWarpGroups];
// Sending phase
if ((phases & LOW_LATENCY_SEND_PHASE) == 0)
goto LOW_LATENCY_DISPATCH_RECV;
// There are 2 kinds of warps in this part:
// 1. The first-kind warps for FP8 cast and sending top-k tokens
// 2. The last warp for reading `topk_idx` and count for per-expert information
if (warp_id < num_warps - 1) {
constexpr int kNumElemsPerRead = sizeof(int4) / sizeof(nv_bfloat16);
EP_DEVICE_ASSERT(kHidden % kNumElemsPerRead == 0);
EP_STATIC_ASSERT(kNumElemsPerRead * 32 % kNumPerChannels == 0, "Invalid vectorization");
const auto num_threads = (num_warps - 1) * 32;
const size_t hidden_bf16_int4 = kHidden / kNumElemsPerRead;
for (int token_idx = sm_id; token_idx < num_tokens; token_idx += num_sms) {
const auto x_int4 = static_cast<const int4*>(x) + token_idx * hidden_bf16_int4;
const auto rdma_x_src_idx = reinterpret_cast<int*>(static_cast<uint8_t*>(rdma_x) + token_idx * num_bytes_per_msg);
const auto rdma_x_vec = reinterpret_cast<vec_t*>(reinterpret_cast<uint8_t*>(rdma_x_src_idx) + sizeof(int4));
const auto rdma_x_scales = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(rdma_x_vec) + hidden_bytes);
// Overlap top-k index read and source token index writes
auto dst_expert_idx = warp_id < num_topk ? static_cast<int>(__ldg(topk_idx + token_idx * num_topk + warp_id)) : -1;
thread_id == 0 ? (*rdma_x_src_idx = token_idx) : 0;
// FP8 cast
#pragma unroll
for (int i = thread_id; i < hidden_bf16_int4; i += num_threads) {
// Read
auto int4_value = __ldg(x_int4 + i);
if constexpr (kUseFP8) {
// Calculate local amax
auto bf16_values = reinterpret_cast<nv_bfloat16*>(&int4_value);
float fp32_values[kNumElemsPerRead];
float amax = kFP8Margin, scale, scale_inv;
#pragma unroll
for (int j = 0; j < kNumElemsPerRead; ++ j) {
fp32_values[j] = static_cast<float>(bf16_values[j]);
amax = fmaxf(amax, fabsf(fp32_values[j]));
}
// Reduce amax and scale
EP_STATIC_ASSERT(kNumElemsPerRead * 32 / kNumPerChannels == 2, "Invalid vectorization");
amax = half_warp_reduce_max(amax);
calculate_fp8_scales(amax, scale, scale_inv, round_scale);
if (lane_id == 0 or lane_id == 16)
rdma_x_scales[i * kNumElemsPerRead / 128] = scale_inv;
// Cast into send buffer
vec_t int2_value;
auto fp8x2_values = reinterpret_cast<__nv_fp8x2_storage_t*>(&int2_value);
#pragma unroll
for (int j = 0; j < kNumElemsPerRead; j += 2) {
float2 fp32x2 = {fp32_values[j] * scale, fp32_values[j + 1] * scale};
fp8x2_values[j / 2] = __nv_cvt_float2_to_fp8x2(fp32x2, __NV_SATFINITE, __NV_E4M3);
}
rdma_x_vec[i] = int2_value;
} else {
// Reinterpret-cast is for C++14 compatibility
rdma_x_vec[i] = *reinterpret_cast<vec_t*>(&int4_value);
}
}
asm volatile("bar.sync 1, %0;" :: "r"(num_threads));
// Issue IBGDA sends
if (dst_expert_idx >= 0) {
int slot_idx = lane_id == 0 ? atomicAdd(atomic_counter_per_expert + dst_expert_idx, 1) : 0;
slot_idx = __shfl_sync(0xffffffff, slot_idx, 0);
const auto dst_rank = dst_expert_idx / num_local_experts;
const auto dst_expert_local_idx = dst_expert_idx % num_local_experts;
const auto src_ptr = reinterpret_cast<uint64_t>(rdma_x_src_idx);
const auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_x) +
dst_expert_local_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
slot_idx * num_bytes_per_msg;
const auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
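// NOTES: a zero P2P pointer means the peer is reachable only via RDMA (see `nvshmemi_get_p2p_ptr`); otherwise copy directly over NVLink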
if (dst_p2p_ptr == 0) {
nvshmemi_ibgda_put_nbi_warp(dst_ptr, src_ptr, num_bytes_per_msg, dst_rank, dst_expert_local_idx, lane_id, slot_idx);
} else {
// NOTES: only 2 load iterations for 7K hidden with 8 unrolls
const auto* src_int4_ptr = reinterpret_cast<const int4*>(src_ptr);
const auto* dst_int4_ptr = reinterpret_cast<int4*>(dst_p2p_ptr);
UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, dst_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global);
}
// Increase counter after finishing
__syncwarp();
lane_id == 0 ? atomic_add_release_global(atomic_finish_counter_per_expert + dst_expert_idx, 1) : 0;
}
}
} else if (warp_id == num_warps - 1) {
EP_DEVICE_ASSERT(num_sms > 1);
if (sm_id == 0) {
// The first SM is also responsible for checking QPs
EP_DEVICE_ASSERT(ibgda_get_state()->num_rc_per_pe >= num_local_experts);
// The first SM is also responsible for cleaning the next buffer
#pragma unroll
for (int i = lane_id; i < num_next_clean_int; i += 32)
next_clean[i] = 0;
// Notify before executing `int_p`
__syncwarp();
#pragma unroll
for (int i = lane_id; i < num_experts; i += 32)
atomic_add_release_global(atomic_finish_counter_per_expert + i, FINISHED_SUM_TAG);
}
// This SM should be responsible for some destination experts, read `topk_idx` for them
int expert_count[kNumMaxWarpGroups] = {0};
const auto expert_begin_idx = sm_id * num_warp_groups;
const auto expert_end_idx = min(expert_begin_idx + num_warp_groups, num_experts);
// Per lane count
#pragma unroll 8
for (int i = lane_id; i < num_tokens * num_topk; i += 32) {
auto idx = static_cast<int>(__ldg(topk_idx + i));
if (idx >= expert_begin_idx and idx < expert_end_idx)
expert_count[idx - expert_begin_idx] ++;
}
// Warp reduce
#pragma unroll
for (int i = expert_begin_idx; i < expert_end_idx; ++ i) {
auto sum = warp_reduce_sum(expert_count[i - expert_begin_idx]);
if (lane_id == 0) {
shared_num_tokens_sent_per_expert[i - expert_begin_idx] = sum;
atomic_add_release_global(atomic_finish_counter_per_expert + i, FINISHED_SUM_TAG - sum);
}
}
}
__syncthreads();
// Issue count sends
if (responsible_expert_idx < num_experts and sub_warp_id == 0 and lane_id == 0) {
const auto dst_rank = responsible_expert_idx / num_local_experts;
const auto dst_expert_local_idx = responsible_expert_idx % num_local_experts;
const auto num_tokens_sent = shared_num_tokens_sent_per_expert[responsible_expert_idx - sm_id * num_warp_groups];
// Wait until the local sends are issued, then send the expert counts
while (ld_acquire_global(atomic_finish_counter_per_expert + responsible_expert_idx) != FINISHED_SUM_TAG * 2);
auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_count + dst_expert_local_idx * num_ranks + rank);
auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
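// NOTES: the count is sent as `-num_tokens_sent - 1`, so the receiver can distinguish "zero tokens" from "not yet arrived" (which reads as 0)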
if (dst_p2p_ptr == 0) {
nvshmemi_ibgda_amo_nonfetch_add(reinterpret_cast<int*>(dst_ptr), -num_tokens_sent - 1, dst_rank, dst_expert_local_idx);
} else {
st_release_sys_global(reinterpret_cast<int*>(dst_p2p_ptr), -num_tokens_sent - 1);
}
// Clean workspace for next use
atomic_counter_per_expert[responsible_expert_idx] = 0;
atomic_finish_counter_per_expert[responsible_expert_idx] = 0;
// Clean `packed_recv_count`
if (dst_rank == 0)
packed_recv_count[dst_expert_local_idx] = 0;
}
__syncwarp();
// Receiving phase
LOW_LATENCY_DISPATCH_RECV:
if ((phases & LOW_LATENCY_RECV_PHASE) == 0)
return;
// For send-and-recv kernels, we need a grid sync for making `packed_recv_count` visible
if (phases & LOW_LATENCY_SEND_PHASE)
cg::this_grid().sync();
// Receiving and packing
if (responsible_expert_idx < num_experts) {
const auto src_rank = responsible_expert_idx / num_local_experts;
const auto local_expert_idx = responsible_expert_idx % num_local_experts;
const auto rdma_recv_x_uint8 = static_cast<uint8_t*>(rdma_recv_x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
src_rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg;
const auto recv_x_int4 = static_cast<int4*>(packed_recv_x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * hidden_int4;
const auto recv_src_info = packed_recv_src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank;
const auto recv_range = packed_recv_layout_range + local_expert_idx * num_ranks;
const auto num_aligned_scales = align<int>(num_scales, sizeof(float) / sizeof(scale_t));
const auto recv_x_scales = static_cast<scale_t*>(packed_recv_x_scales) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_aligned_scales;
// Shared between sub-warps in warp groups
__shared__ int shared_num_recv_tokens[kNumMaxWarpGroups], shared_recv_token_begin_idx[kNumMaxWarpGroups];
// Wait for tokens to arrive
// NOTES: using sub-warp 1 to overlap with sub-warp 0
int num_recv_tokens, recv_token_begin_idx;
EP_DEVICE_ASSERT(num_warps_per_group > 1 and num_warp_groups < 15);
if (sub_warp_id == 1 and lane_id == 0) {
while ((num_recv_tokens = ld_acquire_sys_global(rdma_recv_count + local_expert_idx * num_ranks + src_rank)) == 0);
num_recv_tokens = -num_recv_tokens - 1;
recv_token_begin_idx = atomicAdd(packed_recv_count + local_expert_idx, num_recv_tokens);
shared_num_recv_tokens[warp_group_id] = num_recv_tokens;
shared_recv_token_begin_idx[warp_group_id] = recv_token_begin_idx;
recv_range[src_rank] = pack2<int, int64_t>(num_recv_tokens, recv_token_begin_idx);
if (cumulative_local_expert_recv_stats != nullptr)
atomicAdd(cumulative_local_expert_recv_stats + local_expert_idx, num_recv_tokens);
}
asm volatile("bar.sync %0, %1;" :: "r"(warp_group_id + 2), "r"(num_warps_per_group * 32));
num_recv_tokens = shared_num_recv_tokens[warp_group_id];
recv_token_begin_idx = shared_recv_token_begin_idx[warp_group_id];
// Copy tokens
EP_DEVICE_ASSERT(num_scales <= 64);
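// NOTES: each warp copies at most 64 scales: every lane handles positions `lane_id` and `lane_id + 32`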
for (int i = sub_warp_id; i < num_recv_tokens; i += num_warps_per_group) {
// Copy source info
const auto src_src_idx = reinterpret_cast<int*>(rdma_recv_x_uint8 + i * num_bytes_per_msg);
if (lane_id == 0)
recv_src_info[recv_token_begin_idx + i] = ld_nc_global(src_src_idx);
__syncwarp();
// Copy data
// NOTES: only 2 load iterations for 7K hidden with 7 unrolls
const auto src_data = reinterpret_cast<int4*>(reinterpret_cast<uint8_t*>(src_src_idx) + sizeof(int4));
const auto dst_data = recv_x_int4 + (recv_token_begin_idx + i) * hidden_int4;
UNROLLED_WARP_COPY(7, lane_id, hidden_int4, dst_data, src_data, ld_nc_global, st_na_global);
// Copy scales
if constexpr (kUseFP8) {
// Equivalent CuTe layout:
// (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1))
const auto src_scales = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(src_data) + hidden_bytes);
const auto num_elems_per_pack = static_cast<int>(sizeof(packed_t) / sizeof(scale_t));
const auto token_idx = recv_token_begin_idx + i;
const auto token_stride = num_elems_per_pack;
const auto pack_stride = num_ranks * num_max_dispatch_tokens_per_rank * num_elems_per_pack;
if (lane_id < num_scales) {
const auto pack_idx = lane_id / num_elems_per_pack;
const auto elem_idx = lane_id % num_elems_per_pack;
auto scale = extract_required_scale_format<kUseUE8M0>(ld_nc_global(src_scales + lane_id));
recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
}
if (lane_id + 32 < num_scales) {
const auto pack_idx = (lane_id + 32) / num_elems_per_pack;
const auto elem_idx = (lane_id + 32) % num_elems_per_pack;
auto scale = extract_required_scale_format<kUseUE8M0>(ld_nc_global(src_scales + lane_id + 32));
recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
}
}
}
}
}
void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
int* packed_recv_src_info, int64_t* packed_recv_layout_range,
int* packed_recv_count,
int* cumulative_local_expert_recv_stats,
void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
const void* x, const int64_t* topk_idx,
int* next_clean, int num_next_clean_int,
int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
bool use_fp8, bool round_scale, bool use_ue8m0,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases) {
constexpr int kNumMaxTopK = 9;
const int num_warp_groups = ceil_div(num_experts, num_device_sms);
const int num_warps_per_group = 32 / num_warp_groups;
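// NOTES: each block runs `num_warp_groups * num_warps_per_group` warps (at most 32 in total);
// every warp group serves one expert, so `num_sms` blocks cover all experts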
EP_HOST_ASSERT(num_warp_groups > 0 and num_warps_per_group > 0);
EP_HOST_ASSERT(kNumMaxTopK + 1 <= num_warp_groups * num_warps_per_group);
const auto num_warps = num_warp_groups * num_warps_per_group;
const auto num_sms = ceil_div(num_experts, num_warp_groups);
EP_HOST_ASSERT(num_topk <= kNumMaxTopK);
// Workspace checks
auto atomic_counter_per_expert = static_cast<int*>(workspace);
auto atomic_finish_counter_per_expert = atomic_counter_per_expert + num_experts;
EP_HOST_ASSERT(num_experts * sizeof(int) * 2 <= NUM_WORKSPACE_BYTES);
// FP8 checks
if (use_ue8m0)
EP_HOST_ASSERT(round_scale and "UE8M0 SF requires `round_scale=True`");
#define DISPATCH_LAUNCH_CASE(hidden) { \
auto dispatch_func = dispatch<false, false, hidden>; \
if (use_fp8 and not use_ue8m0) \
dispatch_func = dispatch<true, false, hidden>; \
if (use_fp8 and use_ue8m0) \
dispatch_func = dispatch<true, true, hidden>; \
LAUNCH_KERNEL(&cfg, dispatch_func, \
packed_recv_x, packed_recv_x_scales, \
packed_recv_src_info, packed_recv_layout_range, \
packed_recv_count, \
cumulative_local_expert_recv_stats, \
rdma_recv_x, rdma_recv_count, rdma_x, \
x, topk_idx, \
atomic_counter_per_expert, atomic_finish_counter_per_expert, \
next_clean, num_next_clean_int, \
num_tokens, num_max_dispatch_tokens_per_rank, \
num_topk, num_experts, rank, num_ranks, \
num_warp_groups, num_warps_per_group, \
round_scale, phases); } break
SETUP_LAUNCH_CONFIG(num_sms, num_warps * 32, stream);
SWITCH_HIDDEN(DISPATCH_LAUNCH_CASE);
#undef DISPATCH_LAUNCH_CASE
}
template <int kHidden, int kNumMaxTopk>
__global__ __launch_bounds__(1024, 1) void
combine(void* combined_x,
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
const void* x, const int64_t* topk_idx, const float* topk_weights,
const int* src_info, const int64_t* layout_range,
int* next_clean, int num_next_clean_int,
int* atomic_clean_flag,
int num_combined_tokens, int hidden, int num_topk,
int num_max_dispatch_tokens_per_rank,
int num_experts, int rank, int num_ranks,
int num_warp_groups, int num_warps_per_group,
int phases, bool zero_copy) {
const auto sm_id = static_cast<int>(blockIdx.x);
const auto num_sms = static_cast<int>(gridDim.x);
const auto thread_id = static_cast<int>(threadIdx.x);
const auto num_threads = static_cast<int>(blockDim.x);
const auto warp_id = thread_id / 32, lane_id = get_lane_id();
const auto num_local_experts = num_experts / num_ranks;
const auto warp_group_id = warp_id / num_warps_per_group;
const auto sub_warp_id = warp_id % num_warps_per_group;
const auto responsible_expert_idx = sm_id * num_warp_groups + warp_group_id;
// Data type stuff
constexpr int kNumElemsPerInt4 = sizeof(int4) / sizeof(nv_bfloat16);
const size_t hidden_bf16_int4 = kHidden / kNumElemsPerInt4;
// Message package
constexpr size_t num_bytes_per_slot = kHidden * sizeof(nv_bfloat16);
EP_STATIC_ASSERT(num_bytes_per_slot % sizeof(int4) == 0, "Invalid vectorization");
// Sending phase
if ((phases & LOW_LATENCY_SEND_PHASE) == 0)
goto LOW_LATENCY_COMBINE_RECV;
// Clean up next buffer
if (sm_id == 0 and warp_group_id == 0 and sub_warp_id == 0) {
#pragma unroll
for (int i = lane_id; i < num_next_clean_int; i += 32)
next_clean[i] = 0;
// Notify before executing `int_p`
__syncwarp();
if (lane_id == 0)
atomic_add_release_global(atomic_clean_flag, num_experts);
}
// Issue IBGDA sends
if (responsible_expert_idx < num_experts) {
const auto dst_rank = responsible_expert_idx / num_local_experts;
const auto local_expert_idx = responsible_expert_idx % num_local_experts;
const auto global_expert_idx = rank * num_local_experts + local_expert_idx;
const auto layout = __ldg(layout_range + local_expert_idx * num_ranks + dst_rank);
const auto local_x = static_cast<const int4*>(x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * hidden_bf16_int4;
const auto local_src_info = src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank;
const auto rdma_send_x_vec = static_cast<uint8_t*>(rdma_send_x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_slot;
// Unpack layout
int offset, num_tokens_to_send;
unpack2(layout, num_tokens_to_send, offset);
// Issue IBGDA send
for (int token_idx = offset + sub_warp_id; token_idx < offset + num_tokens_to_send; token_idx += num_warps_per_group) {
const auto x_int4 = local_x + token_idx * hidden_bf16_int4;
const auto rdma_send_type_row = reinterpret_cast<int*>(rdma_send_x_vec + token_idx * num_bytes_per_slot);
const auto rdma_send_x_vec_row = reinterpret_cast<uint8_t*>(rdma_send_type_row);
// Copy directly to local rank, or copy to buffer and issue RDMA
auto src_idx = __ldg(local_src_info + token_idx);
const auto buf_ptr = reinterpret_cast<int64_t>(rdma_send_x_vec_row);
const auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_x) + (global_expert_idx * num_max_dispatch_tokens_per_rank + src_idx) * num_bytes_per_slot;
const auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
if (dst_p2p_ptr == 0) {
const auto buf_int4_ptr = reinterpret_cast<int4*>(buf_ptr);
if (not zero_copy)
UNROLLED_WARP_COPY(7, lane_id, hidden_bf16_int4, buf_int4_ptr, x_int4, ld_nc_global, st_na_global);
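// NOTES: `token_idx - offset` serves as the message index, so `ibgda_submit_requests` can batch the doorbell posts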
nvshmemi_ibgda_put_nbi_warp(dst_ptr, buf_ptr, hidden * sizeof(nv_bfloat16), dst_rank, local_expert_idx, lane_id, token_idx - offset);
} else {
const auto dst_int4_ptr = reinterpret_cast<int4*>(dst_p2p_ptr);
UNROLLED_WARP_COPY(7, lane_id, hidden_bf16_int4, dst_int4_ptr, x_int4, ld_nc_global, st_na_global);
}
}
// Put the finishing flag
EP_DEVICE_ASSERT(num_warps_per_group > 1 and num_warp_groups < 16);
asm volatile("bar.sync %0, %1;" :: "r"(warp_group_id + 1), "r"(num_warps_per_group * 32));
if (sub_warp_id == 1 and lane_id == 0) {
while (ld_acquire_global(atomic_clean_flag) == 0);
auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_flag + global_expert_idx);
auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
if (dst_p2p_ptr == 0) {
nvshmemi_ibgda_amo_nonfetch_add(reinterpret_cast<int*>(dst_ptr), 1, dst_rank, local_expert_idx);
} else {
st_release_sys_global(reinterpret_cast<int*>(dst_p2p_ptr), 1);
}
atomic_add_release_global(atomic_clean_flag, -1);
}
__syncwarp();
}
// Receiving phase
LOW_LATENCY_COMBINE_RECV:
if ((phases & LOW_LATENCY_RECV_PHASE) == 0)
return;
// Wait for all ranks to arrive
if (responsible_expert_idx < num_experts) {
EP_DEVICE_ASSERT(num_warps_per_group > 1);
if (sub_warp_id == 0 and lane_id == 0) {
while (ld_acquire_sys_global(rdma_recv_flag + responsible_expert_idx) == 0);
}
}
cg::this_grid().sync();
// Reduce tokens
EP_DEVICE_ASSERT(num_topk <= 32 and hidden_bf16_int4 <= num_threads);
EP_STATIC_ASSERT(kHidden % (32 * kNumElemsPerInt4) == 0, "Invalid vectorization");
if (thread_id < hidden_bf16_int4) {
for (int token_idx = sm_id; token_idx < num_combined_tokens; token_idx += num_sms) {
// Read top-k indices and weights
int reg_topk_idx[kNumMaxTopk];
float reg_topk_weights[kNumMaxTopk];
#pragma unroll
for (int i = 0; i < num_topk; ++ i) {
reg_topk_idx[i] = static_cast<int>(__ldg(topk_idx + token_idx * num_topk + i));
reg_topk_weights[i] = __ldg(topk_weights + token_idx * num_topk + i);
}
float combined_values[kNumElemsPerInt4] = {0.0f};
#pragma unroll
for (int i = 0; i < num_topk; ++ i) if (reg_topk_idx[i] >= 0) {
// Read from sources
auto rdma_buffer_type = reinterpret_cast<const int*>(static_cast<uint8_t*>(rdma_recv_x) + (reg_topk_idx[i] * num_max_dispatch_tokens_per_rank + token_idx) * num_bytes_per_slot);
auto rdma_buffer_row = reinterpret_cast<const uint8_t*>(rdma_buffer_type);
// Reduce
auto x_vec = ld_nc_global(reinterpret_cast<const int4*>(rdma_buffer_row) + thread_id);
const auto x_bf16 = reinterpret_cast<nv_bfloat16*>(&x_vec);
#pragma unroll
for (int j = 0; j < kNumElemsPerInt4; ++ j)
combined_values[j] += static_cast<float>(x_bf16[j]) * reg_topk_weights[i];
}
// Write results
int4& combined_int4 = *reinterpret_cast<int4*>(combined_values);
auto combined_bf16 = reinterpret_cast<nv_bfloat16*>(&combined_values);
#pragma unroll
for (int j = 0; j < kNumElemsPerInt4; ++ j)
combined_bf16[j] = static_cast<nv_bfloat16>(combined_values[j]);
(static_cast<int4*>(combined_x) + token_idx * hidden_bf16_int4)[thread_id] = combined_int4;
}
}
}
void combine(void* combined_x,
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
const void* x, const int64_t* topk_idx, const float* topk_weights,
const int* src_info, const int64_t* layout_range,
int* next_clean, int num_next_clean_int,
int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases, bool zero_copy) {
constexpr int kNumMaxTopk = 9;
const int num_warp_groups = ceil_div(num_experts, num_device_sms);
const int num_warps_per_group = 32 / num_warp_groups;
EP_HOST_ASSERT(num_warp_groups > 0 and num_warps_per_group > 0);
const auto num_warps = num_warp_groups * num_warps_per_group;
const auto num_sms = ceil_div(num_experts, num_warp_groups);
// Check workspace
auto atomic_clean_flag = static_cast<int*>(workspace);
EP_HOST_ASSERT(sizeof(int) <= NUM_WORKSPACE_BYTES);
EP_HOST_ASSERT(num_topk <= kNumMaxTopk);
#define COMBINE_LAUNCH_CASE(hidden) { \
auto combine_func = combine<hidden, kNumMaxTopk>; \
LAUNCH_KERNEL(&cfg, combine_func, \
combined_x, \
rdma_recv_x, rdma_recv_flag, rdma_send_x, \
x, topk_idx, topk_weights, src_info, layout_range, \
next_clean, num_next_clean_int, \
atomic_clean_flag, \
num_combined_tokens, hidden, num_topk, \
num_max_dispatch_tokens_per_rank, \
num_experts, rank, num_ranks, \
num_warp_groups, num_warps_per_group, \
phases, zero_copy); } break
SETUP_LAUNCH_CONFIG(num_sms, num_warps * 32, stream);
SWITCH_HIDDEN(COMBINE_LAUNCH_CASE);
#undef COMBINE_LAUNCH_CASE
}
} // namespace internode_ll
} // namespace deep_ep

View File

@ -0,0 +1,935 @@
#include "configs.cuh"
#include "buffer.cuh"
#include "exception.cuh"
#include "launch.cuh"
#include "utils.cuh"
namespace deep_ep {
namespace intranode {
template<int kNumRanks>
__global__ void
notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
int num_tokens, int num_channels, const bool* is_token_in_rank, int* channel_prefix_matrix,
int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank) {
auto sm_id = static_cast<int>(blockIdx.x);
auto thread_id = static_cast<int>(threadIdx.x), num_threads = static_cast<int>(blockDim.x);
auto lane_id = thread_id % 32, warp_id = thread_id / 32, num_warps = num_threads / 32;
if (sm_id == 0) {
// Barrier first
barrier_block<kNumRanks, true>(barrier_signal_ptrs, rank);
int *per_rank_buffer, *per_expert_buffer;
if (thread_id < kNumRanks) {
per_rank_buffer = static_cast<int*>(buffer_ptrs[thread_id]);
per_expert_buffer = per_rank_buffer + kNumRanks * kNumRanks;
}
// After this loop:
// - `per_rank_buffer[rank][i, j]` means the number of tokens from rank i to rank j
// - `per_expert_buffer[rank][i, j]` means the number of tokens from rank i to local expert j
int num_experts_per_rank = num_experts / kNumRanks;
if (thread_id < kNumRanks) {
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i)
per_rank_buffer[rank * kNumRanks + i] = num_tokens_per_rank[i];
#pragma unroll
for (int i = 0; i < num_experts_per_rank; ++ i)
per_expert_buffer[rank * num_experts_per_rank + i] = num_tokens_per_expert[thread_id * num_experts_per_rank + i];
}
// Wait for all ranks to be finished
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
// Sum per-rank counts and return to CPU
// Also pre-compute the prefix sum for data sending
auto local_per_rank_buffer = static_cast<int*>(buffer_ptrs[rank]);
if (thread_id < kNumRanks) {
#pragma unroll
for (int i = 1; i < kNumRanks; ++ i)
local_per_rank_buffer[i * kNumRanks + thread_id] += local_per_rank_buffer[(i - 1) * kNumRanks + thread_id];
if (thread_id == rank)
*moe_recv_counter_mapped = local_per_rank_buffer[(kNumRanks - 1) * kNumRanks + rank];
}
// Sum per-experts counts and return to CPU
auto local_per_expert_buffer = local_per_rank_buffer + kNumRanks * kNumRanks;
if (thread_id < num_experts_per_rank) {
int sum = 0;
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i)
sum += local_per_expert_buffer[i * num_experts_per_rank + thread_id];
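// Round the summed count up to a multiple of `expert_alignment`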
sum = (sum + expert_alignment - 1) / expert_alignment * expert_alignment;
moe_recv_expert_counter_mapped[thread_id] = sum;
}
__syncthreads();
// Copy rank size prefix matrix to another tensor
#pragma unroll
for (int i = thread_id; i < kNumRanks * kNumRanks; i += num_threads)
rank_prefix_matrix_copy[i] = local_per_rank_buffer[i];
// Extra memset for later communication queue
#pragma unroll
for (int i = thread_id; i < num_memset_int; i += num_threads)
local_per_expert_buffer[i] = 0;
// Barrier
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
} else {
int dst_rank = sm_id - 1;
for (int channel_id = warp_id; channel_id < num_channels; channel_id += num_warps) {
int token_start_idx, token_end_idx;
get_channel_task_range(num_tokens, num_channels, channel_id, token_start_idx, token_end_idx);
// Iterate over tokens
int count = 0;
for (int64_t i = token_start_idx + lane_id; i < token_end_idx; i += 32)
count += is_token_in_rank[i * kNumRanks + dst_rank];
count = warp_reduce_sum(count);
if (lane_id == 0)
channel_prefix_matrix[dst_rank * num_channels + channel_id] = count;
}
__syncthreads();
// Pre-compute prefix sum for all channels
if (thread_id == 0) {
#pragma unroll
for (int i = 1; i < num_channels; ++ i)
channel_prefix_matrix[dst_rank * num_channels + i] += channel_prefix_matrix[dst_rank * num_channels + i - 1];
}
}
}
void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
int num_tokens, const bool* is_token_in_rank, int* channel_prefix_matrix,
int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank,
cudaStream_t stream, int num_channels) {
#define NOTIFY_DISPATCH_LAUNCH_CASE(ranks) \
LAUNCH_KERNEL(&cfg, notify_dispatch<ranks>, \
num_tokens_per_rank, moe_recv_counter_mapped, \
num_tokens_per_expert, moe_recv_expert_counter_mapped, num_experts, \
num_tokens, num_channels, is_token_in_rank, channel_prefix_matrix, \
rank_prefix_matrix_copy, num_memset_int, expert_alignment, \
buffer_ptrs, barrier_signal_ptrs, rank); \
break
constexpr int kNumThreads = 128;
EP_HOST_ASSERT(num_experts % num_ranks == 0);
EP_HOST_ASSERT(num_experts / num_ranks <= kNumThreads and num_ranks <= kNumThreads);
SETUP_LAUNCH_CONFIG(1 + num_ranks, kNumThreads, stream);
SWITCH_RANKS(NOTIFY_DISPATCH_LAUNCH_CASE);
#undef NOTIFY_DISPATCH_LAUNCH_CASE
}
template<int kNumRanks>
__global__ void
cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank) {
// A simplified version for cached handles
barrier_block<kNumRanks, true>(barrier_signal_ptrs, rank);
// Copy and clean
auto thread_id = static_cast<int>(threadIdx.x), num_threads = static_cast<int>(blockDim.x);
auto ptr = static_cast<int*>(buffer_ptrs[rank]);
#pragma unroll
for (int i = thread_id; i < kNumRanks * kNumRanks; i += num_threads)
ptr[i] = rank_prefix_matrix[i];
#pragma unroll
for (int i = thread_id; i < num_memset_int; i += num_threads)
ptr[kNumRanks * kNumRanks + i] = 0;
// Barrier after cleaning
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
}
void cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
void** buffer_ptrs, int** barrier_signal_ptrs,
int rank, int num_ranks, cudaStream_t stream) {
#define CACHED_NOTIFY_DISPATCH_LAUNCH_CASE(ranks) \
LAUNCH_KERNEL(&cfg, cached_notify_dispatch<ranks>, \
rank_prefix_matrix, num_memset_int, buffer_ptrs, barrier_signal_ptrs, rank); \
break
SETUP_LAUNCH_CONFIG(1, 128, stream);
SWITCH_RANKS(CACHED_NOTIFY_DISPATCH_LAUNCH_CASE);
#undef CACHED_NOTIFY_DISPATCH_LAUNCH_CASE
}
template <int kNumRanks, int kNumThreads, int kNumTMABytesPerWarp>
__global__ void __launch_bounds__(kNumThreads, 1)
dispatch(int4* recv_x, float* recv_x_scales, int* recv_src_idx, int64_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
int* send_head, const int4* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
const bool* is_token_in_rank, const int* channel_prefix_matrix,
int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
int scale_token_stride, int scale_hidden_stride,
void** buffer_ptrs, int rank,
int num_max_send_tokens, int num_recv_buffer_tokens) {
const auto num_sms = static_cast<int>(gridDim.x), sm_id = static_cast<int>(blockIdx.x);
const auto thread_id = static_cast<int>(threadIdx.x), lane_id = get_lane_id();
const bool is_sender = sm_id % 2 == 0;
EP_DEVICE_ASSERT(num_sms % 2 == 0);
// Several warps are responsible for a single rank
const auto num_threads_per_rank = kNumThreads / kNumRanks;
const auto num_channels = num_sms / 2;
const auto responsible_rank = (static_cast<int>(thread_id)) / num_threads_per_rank;
// Even-numbered blocks for sending, odd-numbered blocks for receiving.
const auto responsible_channel = sm_id / 2;
int num_experts_per_rank = num_experts / kNumRanks;
EP_DEVICE_ASSERT(num_experts_per_rank > 0 or num_topk == 0);
EP_DEVICE_ASSERT(num_topk <= 32);
EP_DEVICE_ASSERT((topk_idx == nullptr) == (topk_weights == nullptr));
EP_DEVICE_ASSERT((recv_topk_idx == nullptr) == (recv_topk_weights == nullptr));
// Calculate pointers by the specific layout
// `rank_prefix_matrix`: kNumRanks * kNumRanks * sizeof(int)
auto ptr = reinterpret_cast<void*>(static_cast<int8_t*>(buffer_ptrs[is_sender ? responsible_rank : rank]) + kNumRanks * kNumRanks * sizeof(int));
int target_rank = is_sender ? rank : responsible_rank;
auto num_channels_total = num_channels * kNumRanks;
auto channel_rank_offset = responsible_channel * kNumRanks + target_rank;
// Channel buffer metadata
// Senders are responsible for tails, and receivers are responsible for heads
// Stored on the receiver side
// The retired signals are actually boolean flags, but to align with 16 bytes, we make it `int64_t`
// `start_offset`: kNumChannels * kNumRanks * sizeof(int)
// `end_offset`: kNumChannels * kNumRanks * sizeof(int)
// `head_idx`: kNumChannels * kNumRanks * sizeof(int)
// `tail_idx`: kNumChannels * kNumRanks * sizeof(int)
auto channel_start_offset = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_end_offset = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_head_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_tail_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
// Channel data buffers, stored on the receiver side
// `x_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * hidden_int4 * sizeof(int4)
// `src_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * sizeof(int)
// `topk_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(int64_t)
// `topk_weights_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(float)
// `x_scales_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_scales * sizeof(float)
auto channel_x_buffers = Buffer<int4>(ptr, num_channels_total * num_recv_buffer_tokens * hidden_int4, channel_rank_offset * num_recv_buffer_tokens * hidden_int4);
auto channel_src_idx_buffers = Buffer<int>(ptr, num_channels_total * num_recv_buffer_tokens, channel_rank_offset * num_recv_buffer_tokens);
auto channel_topk_idx_buffers = Buffer<int64_t>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
auto channel_topk_weights_buffers = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
auto channel_x_scales_buffers = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_scales, channel_rank_offset * num_recv_buffer_tokens * num_scales);
// TMA stuff
#ifndef DISABLE_SM90_FEATURES
extern __shared__ __align__(1024) uint8_t smem_buffer[];
auto half_hidden_int4 = hidden_int4 / 2;
auto half_hidden_bytes = half_hidden_int4 * static_cast<int>(sizeof(int4));
auto tma_buffer = smem_buffer + (thread_id / 32) * kNumTMABytesPerWarp;
auto tma_mbarrier = reinterpret_cast<uint64_t*>(tma_buffer + half_hidden_bytes);
uint32_t tma_phase = 0;
if (lane_id == 0) {
mbarrier_init(tma_mbarrier, 1);
fence_view_async_shared();
fence_barrier_init();
EP_DEVICE_ASSERT(hidden_int4 % 2 == 0 and half_hidden_bytes + sizeof(uint64_t) <= kNumTMABytesPerWarp);
}
__syncwarp();
#endif
if (is_sender) {
// Workers for sending
constexpr int num_send_warps = kNumThreads / 32;
constexpr int num_send_warps_per_rank = num_send_warps / kNumRanks;
const auto send_thread_id = thread_id;
const auto send_warp_id_in_rank = send_thread_id % num_threads_per_rank / 32;
EP_DEVICE_ASSERT(kNumRanks <= 32);
EP_DEVICE_ASSERT(num_send_warps % kNumRanks == 0);
// Send offset by `-value - 1`, e.g. 0 -> -1, 1 -> -2
// NOTES: this is for distinguishing zero tokens
if (lane_id == 0 and send_warp_id_in_rank == 0) {
int value = responsible_channel > 0 ? channel_prefix_matrix[responsible_rank * num_channels + responsible_channel - 1] : 0;
st_relaxed_sys_global(channel_start_offset.buffer(), -value - 1);
value = channel_prefix_matrix[responsible_rank * num_channels + responsible_channel];
st_relaxed_sys_global(channel_end_offset.buffer(), -value - 1);
}
__syncwarp();
// Get tasks
int token_start_idx, token_end_idx;
get_channel_task_range(num_tokens, num_channels, responsible_channel, token_start_idx, token_end_idx);
// Iterate over all tokens and send by chunks
int cached_channel_tail_idx = 0;
for (int64_t token_idx = token_start_idx; token_idx < token_end_idx; ) {
// Check destination queue emptiness, or wait for a buffer to be released (rare cases)
// NOTES: the head index received by different warps may not be the same
auto start_time = clock64();
while (lane_id == 0) {
// NOTES: we only consider the worst case, because counting the real numbers is time-consuming
int num_used_slots = cached_channel_tail_idx - ld_volatile_global(channel_head_idx.buffer());
if (num_recv_buffer_tokens - num_used_slots >= num_max_send_tokens)
break;
// Rare cases to loop again
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for dispatch senders, rank %d, responsible_channel = %d\n", rank, responsible_channel);
trap();
}
}
__syncwarp();
int chunk_token_idx = 0;
while (chunk_token_idx < num_max_send_tokens and token_idx < token_end_idx) {
// NOTES: for the same token, the warp assigned to save `send_head` may be different from the warp assigned to send the following data
if (lane_id == 0 and token_idx % num_send_warps_per_rank == send_warp_id_in_rank)
send_head[token_idx * kNumRanks + responsible_rank] = is_token_in_rank[token_idx * kNumRanks + responsible_rank] ? cached_channel_tail_idx : -1;
// Skip if not selected
if (not is_token_in_rank[token_idx * kNumRanks + responsible_rank]) {
token_idx ++;
continue;
}
// Get an empty slot
int dst_slot_idx = (cached_channel_tail_idx ++) % num_recv_buffer_tokens;
if (cached_channel_tail_idx % num_send_warps_per_rank == send_warp_id_in_rank) {
// Copy data
auto shifted_channel_x_buffers = channel_x_buffers.buffer() + dst_slot_idx * hidden_int4;
auto shifted_x = x + token_idx * hidden_int4;
UNROLLED_WARP_COPY(5, lane_id, hidden_int4, shifted_channel_x_buffers, shifted_x, __ldg, st_na_global);
// Copy source index
if (lane_id == 0)
channel_src_idx_buffers[dst_slot_idx] = static_cast<int>(token_idx);
// Copy `topk_idx` and `topk_weights` with transformed index
if (lane_id < num_topk) {
// Top-k index
int recv_expert_begin = responsible_rank * num_experts_per_rank, recv_expert_end = (responsible_rank + 1) * num_experts_per_rank;
auto idx_value = __ldg(topk_idx + token_idx * num_topk + lane_id);
idx_value = (idx_value >= recv_expert_begin and idx_value < recv_expert_end) ? idx_value - recv_expert_begin : -1;
channel_topk_idx_buffers[dst_slot_idx * num_topk + lane_id] = idx_value;
// Top-k weights
auto weight_value = __ldg(topk_weights + token_idx * num_topk + lane_id);
weight_value = (idx_value >= 0) ? weight_value : 0.0f;
channel_topk_weights_buffers[dst_slot_idx * num_topk + lane_id] = weight_value;
}
// Copy `x_scales`
#pragma unroll
for (int i = lane_id; i < num_scales; i += 32) {
auto offset = token_idx * scale_token_stride + i * scale_hidden_stride;
channel_x_scales_buffers[dst_slot_idx * num_scales + i] = __ldg(x_scales + offset);
}
}
// Move token index
chunk_token_idx ++, token_idx ++;
}
// Move tail index
// NOTES: here all warps should share the same new tail
asm volatile("bar.sync %0, %1;" :: "r"(responsible_rank), "r"(num_threads_per_rank));
if (send_warp_id_in_rank == 0 and lane_id == 0)
st_release_sys_global(channel_tail_idx.buffer(), cached_channel_tail_idx);
}
} else {
// Workers for receiving and copying into buffer
constexpr int num_recv_warps = kNumThreads / 32;
constexpr int num_recv_warps_per_rank = num_recv_warps / kNumRanks;
const auto recv_thread_id = thread_id;
const auto recv_thread_id_in_rank = recv_thread_id % num_threads_per_rank;
const auto recv_warp_id_in_rank = recv_thread_id_in_rank / 32;
EP_DEVICE_ASSERT(kNumRanks <= 32);
EP_DEVICE_ASSERT(recv_thread_id >= 0 and num_recv_warps % kNumRanks == 0);
// Calculate offset first
auto rank_prefix_matrix = static_cast<int*>(buffer_ptrs[rank]);
int rank_offset = responsible_rank > 0 ? rank_prefix_matrix[(responsible_rank - 1) * kNumRanks + rank] : 0;
// Receive channel offset
int total_offset, num_tokens_to_recv;
while (lane_id == 0 and (total_offset = ld_volatile_global(channel_start_offset.buffer())) == 0);
while (lane_id == 0 and (num_tokens_to_recv = ld_volatile_global(channel_end_offset.buffer())) == 0);
if (lane_id == 0) {
total_offset = -total_offset - 1, num_tokens_to_recv = -num_tokens_to_recv - 1;
if (recv_warp_id_in_rank == 0)
recv_channel_offset[responsible_rank * num_channels + responsible_channel] = total_offset;
num_tokens_to_recv -= total_offset;
}
total_offset = __shfl_sync(0xffffffff, total_offset, 0);
total_offset += rank_offset;
num_tokens_to_recv = __shfl_sync(0xffffffff, num_tokens_to_recv, 0);
// Shared tail indices for different warps
__shared__ volatile int shared_channel_tail_idx[kNumRanks];
auto start_time = clock64();
int cached_channel_head_idx = 0, cached_channel_tail_idx = 0;
while (num_tokens_to_recv > 0) {
// NOTES: unlike the sender, the receiver must ensure that the tail indices held by different warps are the same
while (recv_thread_id_in_rank == 0) {
cached_channel_tail_idx = ld_acquire_sys_global(channel_tail_idx.buffer());
// Ready to copy
if (cached_channel_head_idx != cached_channel_tail_idx) {
shared_channel_tail_idx[responsible_rank] = cached_channel_tail_idx;
break;
}
// Timeout check
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for dispatch receivers, rank %d, responsible_channel = %d, tokens remained: %d\n", rank, responsible_channel, num_tokens_to_recv);
trap();
}
}
// Synchronize queue tail
asm volatile("bar.sync %0, %1;" :: "r"(responsible_rank), "r"(num_threads_per_rank));
cached_channel_tail_idx = shared_channel_tail_idx[responsible_rank];
// Copy data
int num_recv_tokens = cached_channel_tail_idx - cached_channel_head_idx;
for (int chunk_idx = recv_warp_id_in_rank; chunk_idx < num_recv_tokens; chunk_idx += num_recv_warps_per_rank) {
int token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens;
auto shifted_buffer_x_int4 = channel_x_buffers.buffer() + token_idx_in_buffer * hidden_int4;
auto shifted_recv_x_int4 = recv_x + static_cast<int64_t>(total_offset + chunk_idx) * hidden_int4;
#ifndef DISABLE_SM90_FEATURES
#pragma unroll
for (int i = 0; i < 2; ++ i) if (lane_id == 0) {
tma_store_wait();
tma_load_1d(tma_buffer, shifted_buffer_x_int4 + i * half_hidden_int4, tma_mbarrier, half_hidden_bytes);
mbarrier_arrive_and_expect_tx(tma_mbarrier, half_hidden_bytes);
mbarrier_wait(tma_mbarrier, tma_phase);
tma_store_1d(tma_buffer, shifted_recv_x_int4 + i * half_hidden_int4, half_hidden_bytes, false);
}
__syncwarp();
#else
UNROLLED_WARP_COPY(5, lane_id, hidden_int4, shifted_recv_x_int4, shifted_buffer_x_int4,
ld_nc_global, st_na_global);
#endif
}
// Copy `src_idx`
#pragma unroll 4
for (int chunk_idx = cached_channel_head_idx + recv_thread_id_in_rank; chunk_idx < cached_channel_tail_idx; chunk_idx += 32 * num_recv_warps_per_rank)
recv_src_idx[total_offset + chunk_idx - cached_channel_head_idx] = ld_nc_global(channel_src_idx_buffers.buffer() + chunk_idx % num_recv_buffer_tokens);
// Copy `topk_idx` and `topk_weights`
#pragma unroll 4
for (int idx = recv_thread_id_in_rank; idx < num_recv_tokens * num_topk; idx += 32 * num_recv_warps_per_rank) {
int chunk_idx = idx / num_topk, token_topk_idx = idx % num_topk;
int token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens;
auto recv_idx = static_cast<int64_t>(total_offset + chunk_idx) * num_topk + token_topk_idx;
auto buffer_idx = token_idx_in_buffer * num_topk + token_topk_idx;
recv_topk_idx[recv_idx] = ld_nc_global(channel_topk_idx_buffers.buffer() + buffer_idx);
recv_topk_weights[recv_idx] = ld_nc_global(channel_topk_weights_buffers.buffer() + buffer_idx);
}
// Copy `x_scales`
#pragma unroll 4
for (int i = recv_thread_id_in_rank; i < num_recv_tokens * num_scales; i += 32 * num_recv_warps_per_rank) {
int chunk_idx = i / num_scales, scales_idx = i % num_scales;
int token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens;
recv_x_scales[static_cast<int64_t>(total_offset + chunk_idx) * num_scales + scales_idx] =
ld_nc_global(channel_x_scales_buffers.buffer() + token_idx_in_buffer * num_scales + scales_idx);
}
// Move queue
cached_channel_head_idx += num_recv_tokens;
total_offset += num_recv_tokens;
asm volatile("bar.sync %0, %1;" :: "r"(responsible_rank), "r"(num_threads_per_rank));
if (recv_warp_id_in_rank == num_recv_warps_per_rank - 1 and lane_id == 0)
st_relaxed_sys_global(channel_head_idx.buffer(), cached_channel_head_idx);
// Exit
num_tokens_to_recv -= num_recv_tokens;
}
// Make TMA store visible to the next kernel
#ifndef DISABLE_SM90_FEATURES
if (lane_id == 0)
tma_store_wait();
#endif
}
// Clean unused `recv_topk_idx` as -1
if (num_worst_tokens > 0) {
auto rank_prefix_matrix = static_cast<int*>(buffer_ptrs[rank]);
const auto num_recv_tokens = rank_prefix_matrix[(kNumRanks - 1) * kNumRanks + rank];
const auto clean_start = num_recv_tokens * num_topk + sm_id * kNumThreads;
const auto clean_end = num_worst_tokens * num_topk;
const auto clean_stride = num_sms * kNumThreads;
#pragma unroll
for (int i = clean_start + thread_id; i < clean_end; i += clean_stride)
recv_topk_idx[i] = -1;
}
}
void dispatch(void* recv_x, float* recv_x_scales, int* recv_src_idx, int64_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
int* send_head, const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
const bool* is_token_in_rank, const int* channel_prefix_matrix,
int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
int scale_token_stride, int scale_hidden_stride,
void** buffer_ptrs, int rank, int num_ranks,
cudaStream_t stream, int num_sms, int num_max_send_tokens, int num_recv_buffer_tokens) {
constexpr int kNumThreads = 768;
constexpr int kNumTMABytesPerWarp = 8192;
#ifndef DISABLE_SM90_FEATURES
constexpr int smem_size = kNumTMABytesPerWarp * (kNumThreads / 32);
#endif
// Make sure never OOB
EP_HOST_ASSERT(static_cast<int64_t>(num_scales) * scale_hidden_stride < std::numeric_limits<int>::max());
#define DISPATCH_LAUNCH_CASE(ranks) { \
auto kernel = dispatch<ranks, kNumThreads, kNumTMABytesPerWarp>; \
SET_SHARED_MEMORY_FOR_TMA(kernel); \
LAUNCH_KERNEL(&cfg, kernel, \
reinterpret_cast<int4*>(recv_x), recv_x_scales, recv_src_idx, recv_topk_idx, recv_topk_weights, recv_channel_offset, \
send_head, reinterpret_cast<const int4*>(x), x_scales, topk_idx, topk_weights, \
is_token_in_rank, channel_prefix_matrix, \
num_tokens, num_worst_tokens, hidden_int4, num_topk, num_experts, num_scales, \
scale_token_stride, scale_hidden_stride, \
buffer_ptrs, rank, \
num_max_send_tokens, num_recv_buffer_tokens); \
} break
// Even-numbered blocks for sending, odd-numbered blocks for receiving.
EP_HOST_ASSERT(num_sms % 2 == 0);
SETUP_LAUNCH_CONFIG(num_sms, kNumThreads, stream);
SWITCH_RANKS(DISPATCH_LAUNCH_CASE);
#undef DISPATCH_LAUNCH_CASE
}
template<int kNumRanks>
__global__ void
cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels, int num_recv_tokens, int num_memset_int,
int** barrier_signal_ptrs, int rank) {
const auto sm_id = static_cast<int>(blockIdx.x);
if (sm_id == 0) {
// Barrier before cleaning
barrier_block<kNumRanks, true>(barrier_signal_ptrs, rank);
// Clean
auto thread_id = static_cast<int>(threadIdx.x), num_threads = static_cast<int>(blockDim.x);
auto ptr = static_cast<int*>(buffer_ptrs[rank]);
#pragma unroll
for (int i = thread_id; i < num_memset_int; i += num_threads)
ptr[i] = 0;
// Barrier after cleaning
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
} else {
const auto channel_id = sm_id - 1;
const auto thread_id = static_cast<int>(threadIdx.x);
const auto rank_id = thread_id / 32;
const auto lane_id = thread_id % 32;
if (rank_id >= kNumRanks)
return;
int token_start_idx, token_end_idx;
get_channel_task_range(num_recv_tokens, num_channels, channel_id, token_start_idx, token_end_idx);
// NOTES: `1 << 25` is a heuristic large number
int last_head = 1 << 25;
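// NOTES: scanning backwards from the tail, tokens that were never dispatched to this rank (head < 0) are rewritten as `-last_head - 1`, where `last_head` is the head of the nearest later token that was sent, so the combine receiver can skip them while still advancing the queue head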
#pragma unroll
for (int token_idx_tail = token_end_idx - 1; token_idx_tail >= token_start_idx; token_idx_tail -= 32) {
int token_idx = token_idx_tail - lane_id, expected_head = 0;
auto current_head = (token_idx >= token_start_idx) ? __ldg(send_head + token_idx * kNumRanks + rank_id) : -1;
for (int i = 0; i < min(32, token_idx_tail - token_start_idx + 1); ++ i) {
const int head = __shfl_sync(0xffffffff, current_head, i);
if (head < 0) {
if (lane_id == i)
expected_head = -last_head - 1;
} else {
last_head = head;
}
}
if (current_head < 0 and token_idx >= token_start_idx)
send_head[token_idx * kNumRanks + rank_id] = expected_head;
}
}
}
void cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels,
int num_recv_tokens, int num_memset_int,
int** barrier_signal_ptrs, int rank, int num_ranks,
cudaStream_t stream) {
#define CACHED_NOTIFY_COMBINE(ranks) \
LAUNCH_KERNEL(&cfg, cached_notify_combine<ranks>, \
buffer_ptrs, send_head, num_channels, num_recv_tokens, num_memset_int, barrier_signal_ptrs, rank); \
break
const int num_threads = std::max(128, 32 * num_ranks);
EP_HOST_ASSERT(num_ranks <= num_threads);
EP_HOST_ASSERT(num_threads <= 1024);
EP_HOST_ASSERT(1 + num_channels <= num_channels * 2);
SETUP_LAUNCH_CONFIG(1 + num_channels, num_threads, stream);
SWITCH_RANKS(CACHED_NOTIFY_COMBINE);
#undef CACHED_NOTIFY_COMBINE
}
template<typename dtype_t, int kNumRanks, int kNumThreads, int kNumTMABytesPerWarp>
__global__ void __launch_bounds__(kNumThreads, 1)
combine(dtype_t* recv_x, float* recv_topk_weights,
const dtype_t* x, const float* topk_weights,
const dtype_t* bias_0, const dtype_t* bias_1,
const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
void** buffer_ptrs, int rank,
int num_max_send_tokens, int num_recv_buffer_tokens) {
const auto num_sms = static_cast<int>(gridDim.x);
const auto thread_id = static_cast<int>(threadIdx.x);
const auto sm_id = static_cast<int>(blockIdx.x), lane_id = get_lane_id();
const auto num_channels = num_sms / 2;
const bool is_sender = sm_id % 2 == 0;
const int responsible_channel = sm_id / 2;
EP_DEVICE_ASSERT(num_topk <= 32);
constexpr int kDtypePerInt4 = sizeof(int4) / sizeof(dtype_t);
int hidden_int4 = hidden * sizeof(dtype_t) / sizeof(int4);
auto x_int4 = reinterpret_cast<const int4*>(x);
auto bias_0_int4 = reinterpret_cast<const int4*>(bias_0);
auto bias_1_int4 = reinterpret_cast<const int4*>(bias_1);
auto recv_int4 = reinterpret_cast<int4*>(recv_x);
// TMA stuffs
#ifndef DISABLE_SM90_FEATURES
extern __shared__ __align__(1024) uint8_t smem_buffer[];
auto tma_buffer = smem_buffer + (thread_id / 32) * kNumTMABytesPerWarp;
#endif
if (is_sender) {
// Workers for sending
// Several warps are responsible for a single rank
constexpr int num_send_warps_per_rank = (kNumThreads / 32) / kNumRanks;
constexpr int num_send_warps = num_send_warps_per_rank * kNumRanks;
const auto num_threads_per_rank = num_send_warps_per_rank * 32;
const auto send_thread_id = thread_id;
const auto send_warp_id = send_thread_id / 32;
const auto send_rank_id = (responsible_channel + send_warp_id) % kNumRanks;
const auto send_warp_id_in_rank = send_warp_id / kNumRanks;
EP_STATIC_ASSERT(num_send_warps * 32 == kNumThreads, "Invalid warp count");
// Calculate pointers by the specific layout
auto ptr = reinterpret_cast<void*>(static_cast<int8_t*>(buffer_ptrs[send_rank_id]));
auto num_channels_total = num_channels * kNumRanks;
auto channel_rank_offset = responsible_channel * kNumRanks + rank;
// Channel metadata
// `head_idx`: kNumChannels * kNumRanks * sizeof(int)
// `tail_idx`: kNumChannels * kNumRanks * sizeof(int)
// `x_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * hidden_int4 * sizeof(int4)
// `src_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * sizeof(int)
// `topk_weights_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(float)
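// NOTES: the `Buffer` constructions below are assumed to carve these regions out of `ptr` in exactly the order listed above, each advancing the pointer past its own region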
auto channel_head_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_tail_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_x_buffers = Buffer<int4>(ptr, num_channels_total * num_recv_buffer_tokens * hidden_int4, channel_rank_offset * num_recv_buffer_tokens * hidden_int4);
auto channel_src_idx_buffers = Buffer<int>(ptr, num_channels_total * num_recv_buffer_tokens, channel_rank_offset * num_recv_buffer_tokens);
auto channel_topk_weights_buffers = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
// Get tasks
// NOTES: `channel_offset` is already shifted
int rank_offset = send_rank_id > 0 ? rank_prefix_matrix[(send_rank_id - 1) * kNumRanks + rank] : 0;
int num_rank_tokens = rank_prefix_matrix[send_rank_id * kNumRanks + rank] - rank_offset;
int channel_offset = channel_prefix_matrix[send_rank_id * num_channels + responsible_channel];
int num_channel_tokens = (responsible_channel == num_channels - 1 ? num_rank_tokens : channel_prefix_matrix[send_rank_id * num_channels + responsible_channel + 1]) - channel_offset;
int token_start_idx = rank_offset + channel_offset, token_end_idx = rank_offset + channel_offset + num_channel_tokens;
// Iterate over all tokens and send by chunks
int current_channel_tail_idx = 0;
for (int64_t token_idx = token_start_idx; token_idx < token_end_idx; ) {
// Check that the destination queue has enough free slots, or wait for buffers to be released (rare cases)
auto start_time = clock64();
int num_round_tokens = min(num_max_send_tokens, token_end_idx - static_cast<int>(token_idx));
while (lane_id == 0) {
// NOTES: we only consider the worst case, since counting the exact number of used slots is time-consuming
int num_used_slots = current_channel_tail_idx - ld_volatile_global(channel_head_idx.buffer());
if (num_recv_buffer_tokens - num_used_slots >= num_round_tokens)
break;
// Rare cases to loop again
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for combine senders, rank %d, responsible_channel = %d\n", rank, responsible_channel);
trap();
}
}
__syncwarp();
// Send by chunk
#pragma unroll
for (int i = send_warp_id_in_rank; i < num_round_tokens; i += num_send_warps_per_rank) {
// Get an empty slot
int dst_slot_idx = (current_channel_tail_idx + i) % num_recv_buffer_tokens;
// Copy data
auto shifted_x_buffers = channel_x_buffers.buffer() + dst_slot_idx * hidden_int4;
auto shifted_x = x_int4 + (token_idx + i) * hidden_int4;
UNROLLED_WARP_COPY(4, lane_id, hidden_int4, shifted_x_buffers, shifted_x, ld_nc_global, st_na_global);
// Send source index
if (lane_id == 0)
channel_src_idx_buffers[dst_slot_idx] = __ldg(src_idx + token_idx + i);
// Send `topk_weights`
if (num_topk > 0 and lane_id < num_topk)
channel_topk_weights_buffers[dst_slot_idx * num_topk + lane_id] = __ldg(topk_weights + (token_idx + i) * num_topk + lane_id);
}
token_idx += num_round_tokens;
current_channel_tail_idx += num_round_tokens;
// Move tail index
asm volatile("bar.sync %0, %1;" :: "r"(send_rank_id), "r"(num_threads_per_rank));
if (lane_id == 0 and send_warp_id_in_rank == 0)
st_release_sys_global(channel_tail_idx.buffer(), current_channel_tail_idx);
}
} else {
// Workers for receiving
// One warp for moving the queue head, others for reduction
constexpr int num_recv_warps = kNumThreads / 32;
const auto recv_warp_id = thread_id / 32;
EP_DEVICE_ASSERT(kNumRanks <= 32 and kNumThreads > 32);
EP_DEVICE_ASSERT(thread_id >= 0 and kNumThreads % 32 == 0);
// Shared head, tail and retired flags for receiver warps
__shared__ volatile int warp_channel_head_idx[num_recv_warps][kNumRanks];
__shared__ volatile int channel_tail_idx[kNumRanks];
__shared__ volatile bool warp_retired[num_recv_warps];
if (thread_id < num_recv_warps)
warp_retired[thread_id] = false;
if (lane_id < kNumRanks)
warp_channel_head_idx[recv_warp_id][lane_id] = 0;
if (thread_id < kNumRanks)
channel_tail_idx[thread_id] = 0;
asm volatile("bar.sync 0, %0;" :: "r"(kNumThreads));
if (thread_id < 32) {
int* channel_head_idx_ptr = static_cast<int*>(buffer_ptrs[rank]) + responsible_channel * kNumRanks + lane_id;
int* channel_tail_idx_ptr = channel_head_idx_ptr + num_channels * kNumRanks;
// Queue head updater
int last_head = 0;
while (lane_id < kNumRanks) {
// Check retired
bool retired = true;
#pragma unroll
for (int i = 1; i < num_recv_warps; ++ i)
retired = retired and warp_retired[i];
if (retired)
break;
// Update queue tail
channel_tail_idx[lane_id] = ld_acquire_sys_global(channel_tail_idx_ptr);
// Update minimum head
int min_head = std::numeric_limits<int>::max();
#pragma unroll
for (int i = 1; i < num_recv_warps; ++ i) if (not warp_retired[i])
min_head = min(min_head, warp_channel_head_idx[i][lane_id]);
if (min_head != std::numeric_limits<int>::max() and min_head > last_head)
st_relaxed_sys_global(channel_head_idx_ptr, last_head = min_head);
}
} else {
// Receivers
// Channel metadata
// All lanes use the data buffers, but only the lane matching each rank uses `head/tail/src_idx`
Buffer<int4> channel_x_buffers[kNumRanks];
Buffer<float> channel_topk_weights_buffers[kNumRanks];
// Calculate pointers by the specific layout
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i) {
auto channel_rank_offset = responsible_channel * kNumRanks + i;
auto num_channels_total = num_channels * kNumRanks;
// `head_idx` & `tail_idx`: kNumChannels * kNumRanks * sizeof(int)
auto ptr = reinterpret_cast<void*>(static_cast<int8_t*>(buffer_ptrs[rank]) + 2 * num_channels * kNumRanks * sizeof(int));
// `x_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * hidden_int4 * sizeof(int4)
channel_x_buffers[i] = Buffer<int4>(ptr, num_channels_total * num_recv_buffer_tokens * hidden_int4, channel_rank_offset * num_recv_buffer_tokens * hidden_int4);
// `src_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * sizeof(int)
ptr = reinterpret_cast<void*>(static_cast<int8_t*>(ptr) + num_channels_total * num_recv_buffer_tokens * sizeof(int));
// `topk_weights_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(float)
channel_topk_weights_buffers[i] = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
}
// The same tokens as the dispatch process
int token_start_idx, token_end_idx;
get_channel_task_range(num_recv_tokens, num_channels, responsible_channel, token_start_idx, token_end_idx);
// Iterate over all tokens and combine
for (int64_t token_idx = token_start_idx + recv_warp_id - 1; token_idx < token_end_idx; token_idx += num_recv_warps - 1) {
// Read expected head
int expected_head = -1;
if (lane_id < kNumRanks)
expected_head = ld_nc_global(send_head + token_idx * kNumRanks + lane_id);
auto start_time = clock64();
while (__any_sync(0xffffffff, channel_tail_idx[lane_id] <= expected_head and expected_head >= 0)) {
// Timeout check
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for combine receivers, rank %d, responsible_channel = %d, expect = %d\n", rank, responsible_channel, expected_head);
trap();
}
}
__syncwarp();
// Broadcast current heads
int num_topk_ranks = 0, topk_ranks[kNumRanks], slot_indices[kNumRanks];
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i) {
auto expected_head_i = __shfl_sync(0xffffffff, expected_head, i);
if (expected_head_i >= 0) {
slot_indices[num_topk_ranks] = expected_head_i % num_recv_buffer_tokens;
topk_ranks[num_topk_ranks ++] = i;
}
}
// Wait shared memory release
#ifndef DISABLE_SM90_FEATURES
if (lane_id == 0)
tma_store_wait();
__syncwarp();
#endif
// Reduce data with pipeline
constexpr int kNumStages = 8;
EP_STATIC_ASSERT(kNumStages * 32 * sizeof(int4) <= kNumTMABytesPerWarp, "Invalid count");
#pragma unroll
for (int i = lane_id; i < hidden_int4; i += 32) {
// Read bias
// TODO: make it as a template
int4 bias_0_value_int4 = bias_0_int4 != nullptr ? __ldg(bias_0_int4 + token_idx * hidden_int4 + i) : make_int4(0, 0, 0, 0);
int4 bias_1_value_int4 = bias_1_int4 != nullptr ? __ldg(bias_1_int4 + token_idx * hidden_int4 + i) : make_int4(0, 0, 0, 0);
// Read buffers
int4 recv_value_int4[kNumRanks];
#pragma unroll
for (int j = 0; j < num_topk_ranks; ++ j)
recv_value_int4[j] = ld_nc_global(channel_x_buffers[topk_ranks[j]].buffer() + slot_indices[j] * hidden_int4 + i);
// Reduce bias
float values[kDtypePerInt4];
auto bias_0_values = reinterpret_cast<const dtype_t*>(&bias_0_value_int4);
auto bias_1_values = reinterpret_cast<const dtype_t*>(&bias_1_value_int4);
#pragma unroll
for (int j = 0; j < kDtypePerInt4; ++ j)
values[j] = static_cast<float>(bias_0_values[j]) + static_cast<float>(bias_1_values[j]);
// Reduce all-to-all results
#pragma unroll
for (int j = 0; j < num_topk_ranks; ++ j) {
auto recv_value_dtypes = reinterpret_cast<const dtype_t*>(&recv_value_int4[j]);
#pragma unroll
for (int k = 0; k < kDtypePerInt4; ++ k)
values[k] += static_cast<float>(recv_value_dtypes[k]);
}
// Cast back to `dtype_t`
int4 out_int4;
auto out_dtypes = reinterpret_cast<dtype_t*>(&out_int4);
#pragma unroll
for (int j = 0; j < kDtypePerInt4; ++ j)
out_dtypes[j] = static_cast<dtype_t>(values[j]);
#ifndef DISABLE_SM90_FEATURES
// Wait TMA arrival
if (lane_id == 0)
tma_store_wait<kNumStages - 1>();
__syncwarp();
// Write into TMA buffer
auto tma_stage_idx = (i / 32) % kNumStages;
reinterpret_cast<int4*>(tma_buffer)[tma_stage_idx * 32 + lane_id] = out_int4;
// Issue TMA
tma_store_fence();
__syncwarp();
if (lane_id == 0) {
auto tma_bytes = min(32, hidden_int4 - i) * static_cast<int>(sizeof(int4));
tma_store_1d(reinterpret_cast<int4*>(tma_buffer) + tma_stage_idx * 32,
recv_int4 + token_idx * hidden_int4 + i, tma_bytes, false);
}
__syncwarp();
#else
recv_int4[token_idx * hidden_int4 + i] = out_int4;
#endif
}
// Reduce `topk_weights`
if (lane_id < num_topk) {
float value = 0;
#pragma unroll
for (int i = 0; i < num_topk_ranks; ++ i)
value += ld_nc_global(channel_topk_weights_buffers[topk_ranks[i]].buffer() + slot_indices[i] * num_topk + lane_id);
recv_topk_weights[token_idx * num_topk + lane_id] = value;
}
// Update head
if (lane_id < kNumRanks)
warp_channel_head_idx[recv_warp_id][lane_id] = (expected_head < 0) ? -expected_head - 1 : expected_head + 1;
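// NOTES: a negative `expected_head` is the placeholder written by `cached_notify_combine`; decoding it with `-expected_head - 1` recovers the next head this rank will reach, so the head-updater warp can advance past tokens that were never dispatched here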
}
// Retired
__syncwarp();
if (lane_id == 0)
warp_retired[recv_warp_id] = true;
// Make TMA store visible to the next kernel
#ifndef DISABLE_SM90_FEATURES
if (lane_id == 0)
tma_store_wait();
#endif
}
}
}
void combine(cudaDataType_t type,
void* recv_x, float* recv_topk_weights,
const void* x, const float* topk_weights,
const void* bias_0, const void* bias_1,
const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
void** buffer_ptrs, int rank, int num_ranks,
cudaStream_t stream, int num_sms,
int num_max_send_tokens, int num_recv_buffer_tokens) {
constexpr int kNumThreads = 768;
constexpr int kNumTMABytesPerWarp = 4096;
#ifndef DISABLE_SM90_FEATURES
constexpr int smem_size = kNumTMABytesPerWarp * (kNumThreads / 32);
#endif
#define COMBINE_LAUNCH_CASE(dtype, ranks) { \
auto kernel = combine<dtype, ranks, kNumThreads, kNumTMABytesPerWarp>; \
SET_SHARED_MEMORY_FOR_TMA(kernel); \
LAUNCH_KERNEL(&cfg, kernel, \
reinterpret_cast<dtype*>(recv_x), recv_topk_weights, \
reinterpret_cast<const dtype*>(x), topk_weights, \
reinterpret_cast<const dtype*>(bias_0), reinterpret_cast<const dtype*>(bias_1), \
src_idx, rank_prefix_matrix, channel_prefix_matrix, \
send_head, num_tokens, num_recv_tokens, hidden, num_topk, \
buffer_ptrs, rank, \
num_max_send_tokens, num_recv_buffer_tokens); } \
break
#define COMBINE_DTYPE_LAUNCH_CASE(dtype) SWITCH_RANKS_WITH_DTYPE(dtype, COMBINE_LAUNCH_CASE); break
// Even-numbered blocks for sending, odd-numbered blocks for receiving
EP_HOST_ASSERT(num_sms % 2 == 0);
EP_HOST_ASSERT(kNumThreads >= num_ranks * 32);
SETUP_LAUNCH_CONFIG(num_sms, kNumThreads, stream);
SWITCH_TYPES(COMBINE_DTYPE_LAUNCH_CASE);
#undef COMBINE_DTYPE_LAUNCH_CASE
#undef COMBINE_LAUNCH_CASE
}
} // namespace intranode
} // namespace deep_ep

View File

@ -0,0 +1,89 @@
#pragma once
#include "configs.cuh"
#include "exception.cuh"
#ifndef SETUP_LAUNCH_CONFIG
#ifndef DISABLE_SM90_FEATURES
#define SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream) \
cudaLaunchConfig_t cfg = {(num_sms), (num_threads), 0, stream, nullptr, 0}; \
cudaLaunchAttribute attr[1]; \
attr[0].id = cudaLaunchAttributeCooperative; \
attr[0].val.cooperative = 1; \
cfg.attrs = attr; \
cfg.numAttrs = 1
#else
#define SETUP_LAUNCH_CONFIG(sms, threads, stream) \
int __num_sms = (sms); \
int __num_threads = (threads); \
auto __stream = (stream)
#endif
#endif
#ifndef LAUNCH_KERNEL
#ifndef DISABLE_SM90_FEATURES
#define LAUNCH_KERNEL(config, kernel, ...) CUDA_CHECK(cudaLaunchKernelEx(config, kernel, ##__VA_ARGS__))
#else
#define LAUNCH_KERNEL(config, kernel, ...) \
do { \
kernel<<<__num_sms, __num_threads, 0, __stream>>>(__VA_ARGS__); \
cudaError_t e = cudaGetLastError(); \
if (e != cudaSuccess) { \
EPException cuda_exception("CUDA", __FILE__, __LINE__, cudaGetErrorString(e)); \
fprintf(stderr, "%s\n", cuda_exception.what()); \
throw cuda_exception; \
} \
} while (0)
#endif
#endif
#ifndef SET_SHARED_MEMORY_FOR_TMA
#ifndef DISABLE_SM90_FEATURES
#define SET_SHARED_MEMORY_FOR_TMA(kernel) \
EP_HOST_ASSERT(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size) == cudaSuccess); \
cfg.dynamicSmemBytes = smem_size;
#else
#define SET_SHARED_MEMORY_FOR_TMA(kernel) void()
#endif
#endif
#define SWITCH_RANKS(case_macro) \
switch (num_ranks) { \
case 2: case_macro(2); \
case 4: case_macro(4); \
case 8: case_macro(8); \
default: EP_HOST_ASSERT(false and "Unsupported ranks"); \
} while (false)
#define SWITCH_RDMA_RANKS(case_macro) \
switch (num_ranks / NUM_MAX_NVL_PEERS) { \
case 2: case_macro(2); \
case 4: case_macro(4); \
case 8: case_macro(8); \
case 16: case_macro(16); \
default: EP_HOST_ASSERT(false and "Unsupported RDMA ranks"); \
} while (false)
#define SWITCH_RANKS_WITH_DTYPE(dtype, case_macro) \
switch (num_ranks) { \
case 2: case_macro(dtype, 2); \
case 4: case_macro(dtype, 4); \
case 8: case_macro(dtype, 8); \
default: EP_HOST_ASSERT(false && "Unsupported ranks"); \
} while (false)
#define SWITCH_TYPES(case_macro) \
switch (type) { \
case CUDA_R_16BF: case_macro(nv_bfloat16); \
default: EP_HOST_ASSERT(false && "Unsupported type"); \
} while (false)
#define SWITCH_HIDDEN(case_macro) \
switch (hidden) { \
case 2048: case_macro(2048); \
case 2560: case_macro(2560); \
case 4096: case_macro(4096); \
case 5120: case_macro(5120); \
case 7168: case_macro(7168); \
default: EP_HOST_ASSERT(false && "Unsupported hidden"); \
} while (false)
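// Usage sketch (illustrative only, mirroring the callers in this repo; `my_kernel` and its arguments
// are placeholders): define a per-rank case macro, set up the launch config, and let the switch
// instantiate the matching template:
//   #define MY_LAUNCH_CASE(ranks) LAUNCH_KERNEL(&cfg, my_kernel<ranks>, args...); break
//   SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream);
//   SWITCH_RANKS(MY_LAUNCH_CASE);
//   #undef MY_LAUNCH_CASE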

View File

@ -0,0 +1,136 @@
#include "configs.cuh"
#include "exception.cuh"
#include "launch.cuh"
namespace deep_ep {
namespace layout {
template <int kNumThreads, int kNumExpertsPerSM, int kNumRanksPerSM>
__global__ void __launch_bounds__(kNumThreads, 1)
get_dispatch_layout(const int64_t* topk_idx,
int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
int* num_tokens_per_expert, bool* is_token_in_rank,
int num_tokens, int num_topk, int num_ranks, int num_experts) {
auto sm_id = static_cast<int>(blockIdx.x);
auto thread_id = static_cast<int>(threadIdx.x);
// Count expert statistics
__shared__ int num_tokens_per_expert_per_thread[kNumThreads][kNumExpertsPerSM];
int expert_begin_idx = sm_id * kNumExpertsPerSM, expert_end_idx = min(expert_begin_idx + kNumExpertsPerSM, num_experts);
if (expert_begin_idx < expert_end_idx) {
// Per-thread count
#pragma unroll
for (int i = 0; i < kNumExpertsPerSM; ++ i)
num_tokens_per_expert_per_thread[thread_id][i] = 0;
#pragma unroll
for (int i = thread_id; i < num_tokens; i += kNumThreads) {
auto shifted_topk_idx = topk_idx + i * num_topk;
#pragma unroll
for (int j = 0, expert_idx; j < num_topk; ++ j) {
expert_idx = static_cast<int>(shifted_topk_idx[j]);
if (expert_begin_idx <= expert_idx and expert_idx < expert_end_idx)
++ num_tokens_per_expert_per_thread[thread_id][expert_idx - expert_begin_idx];
}
}
__syncthreads();
// Sum up
EP_STATIC_ASSERT(kNumExpertsPerSM <= kNumThreads, "Too many experts per SM");
if (expert_begin_idx + thread_id < expert_end_idx) {
int sum = 0;
#pragma unroll
for (int i = 0; i < kNumThreads; ++ i)
sum += num_tokens_per_expert_per_thread[i][thread_id];
num_tokens_per_expert[expert_begin_idx + thread_id] = sum;
}
return;
}
if (num_tokens_per_rdma_rank != nullptr)
EP_DEVICE_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0 and num_ranks > NUM_MAX_NVL_PEERS);
// Count rank statistics
constexpr int kNumRDMARanksPerSM = kNumRanksPerSM / NUM_MAX_NVL_PEERS;
__shared__ int num_tokens_per_rank_per_thread[kNumThreads][kNumRanksPerSM];
__shared__ int num_tokens_per_rdma_rank_per_thread[kNumThreads][kNumRDMARanksPerSM];
auto sm_begin = (num_experts + kNumExpertsPerSM - 1) / kNumExpertsPerSM;
int rank_begin_idx = (sm_id - sm_begin) * kNumRanksPerSM, rank_end_idx = min(rank_begin_idx + kNumRanksPerSM, num_ranks);
int rdma_rank_begin_idx = rank_begin_idx / NUM_MAX_NVL_PEERS, rdma_rank_end_idx = rank_end_idx / NUM_MAX_NVL_PEERS;
if (rank_begin_idx < rank_end_idx) {
const auto num_expert_per_rank = num_experts / num_ranks;
auto expert_begin = rank_begin_idx * num_expert_per_rank;
auto expert_end = rank_end_idx * num_expert_per_rank;
// Per-thread count
#pragma unroll
for (int i = 0; i < kNumRanksPerSM; ++ i)
num_tokens_per_rank_per_thread[thread_id][i] = 0;
#pragma unroll
for (int i = 0; i < kNumRDMARanksPerSM; ++ i)
num_tokens_per_rdma_rank_per_thread[thread_id][i] = 0;
#pragma unroll
for (int i = thread_id; i < num_tokens; i += kNumThreads) {
auto shifted_topk_idx = topk_idx + i * num_topk;
int is_in_rank[kNumRanksPerSM] = {0}, is_in_rdma_rank[kNumRDMARanksPerSM] = {0};
#pragma unroll
for (int j = 0, expert_idx, rank_idx; j < num_topk; ++j) {
expert_idx = static_cast<int>(shifted_topk_idx[j]);
if (expert_begin <= expert_idx and expert_idx < expert_end) {
// Count single rank
rank_idx = expert_idx / num_expert_per_rank - rank_begin_idx;
is_in_rank[rank_idx] ++, is_in_rdma_rank[rank_idx / NUM_MAX_NVL_PEERS] ++;
}
}
auto shifted_is_token_in_rank = is_token_in_rank + i * num_ranks;
#pragma unroll
for (int j = 0; j + rank_begin_idx < rank_end_idx; ++ j) {
shifted_is_token_in_rank[j + rank_begin_idx] = (is_in_rank[j] > 0);
num_tokens_per_rank_per_thread[thread_id][j] += (is_in_rank[j] > 0);
}
#pragma unroll
for (int j = 0; j + rdma_rank_begin_idx < rdma_rank_end_idx; ++ j)
num_tokens_per_rdma_rank_per_thread[thread_id][j] += (is_in_rdma_rank[j] > 0);
}
__syncthreads();
// Sum up
EP_STATIC_ASSERT(kNumRanksPerSM <= kNumThreads, "Too many ranks per SM");
if (rank_begin_idx + thread_id < rank_end_idx) {
int sum = 0;
#pragma unroll
for (int i = 0; i < kNumThreads; ++ i)
sum += num_tokens_per_rank_per_thread[i][thread_id];
num_tokens_per_rank[rank_begin_idx + thread_id] = sum;
}
if (num_tokens_per_rdma_rank != nullptr and rdma_rank_begin_idx + thread_id < rdma_rank_end_idx) {
int sum = 0;
#pragma unroll
for (int i = 0; i < kNumThreads; ++ i)
sum += num_tokens_per_rdma_rank_per_thread[i][thread_id];
num_tokens_per_rdma_rank[rdma_rank_begin_idx + thread_id] = sum;
}
}
}
void get_dispatch_layout(const int64_t* topk_idx,
int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
int* num_tokens_per_expert, bool* is_token_in_rank,
int num_tokens, int num_topk, int num_ranks, int num_experts,
cudaStream_t stream) {
constexpr int kNumThreads = 256, kNumExpertsPerSM = 32, kNumRanksPerSM = 8;
int num_sms = ((num_experts + kNumExpertsPerSM - 1) / kNumExpertsPerSM) + (num_ranks + kNumRanksPerSM - 1) / kNumRanksPerSM;
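// NOTES: the first `ceil(num_experts / kNumExpertsPerSM)` blocks count per-expert tokens; the remaining blocks count per-rank (and per-RDMA-rank) tokens, matching `sm_begin` inside the kernel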
EP_STATIC_ASSERT(kNumExpertsPerSM % NUM_MAX_NVL_PEERS == 0, "Invalid number of experts per SM");
SETUP_LAUNCH_CONFIG(num_sms, kNumThreads, stream);
LAUNCH_KERNEL(&cfg, (get_dispatch_layout<kNumThreads, kNumExpertsPerSM, kNumRanksPerSM>),
topk_idx, num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank,
num_tokens, num_topk, num_ranks, num_experts);
}
} // namespace layout
} // namespace deep_ep

View File

@ -0,0 +1,92 @@
#include <vector>
#include <cstring>
#include "configs.cuh"
#include "exception.cuh"
#include "launch.cuh"
#include "utils.cuh"
#ifndef DISABLE_NVSHMEM
#include "ibgda_device.cuh"
#endif
namespace deep_ep {
namespace intranode {
template<int kNumRanks>
__global__ void barrier(int** barrier_signal_ptrs, int rank) {
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
}
void barrier(int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream) {
#define BARRIER_LAUNCH_CASE(ranks) \
LAUNCH_KERNEL(&cfg, barrier<ranks>, barrier_signal_ptrs, rank); \
break
SETUP_LAUNCH_CONFIG(1, 32, stream);
SWITCH_RANKS(BARRIER_LAUNCH_CASE);
#undef BARRIER_LAUNCH_CASE
}
} // namespace intranode
namespace internode {
#ifndef DISABLE_NVSHMEM
nvshmem_team_t cpu_rdma_team = NVSHMEM_TEAM_INVALID;
nvshmem_team_config_t cpu_rdma_team_config;
std::vector<uint8_t> get_unique_id() {
nvshmemx_uniqueid_t unique_id;
nvshmemx_get_uniqueid(&unique_id);
std::vector<uint8_t> result(sizeof(nvshmemx_uniqueid_t));
std::memcpy(result.data(), &unique_id, sizeof(nvshmemx_uniqueid_t));
return result;
}
int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks, bool low_latency_mode) {
nvshmemx_uniqueid_t root_unique_id;
nvshmemx_init_attr_t attr;
std::memcpy(&root_unique_id, root_unique_id_val.data(), sizeof(nvshmemx_uniqueid_t));
nvshmemx_set_attr_uniqueid_args(rank, num_ranks, &root_unique_id, &attr);
nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr);
// Create sub-RDMA teams
// NOTES: if `num_ranks <= NUM_MAX_NVL_PEERS` then only low-latency kernels are used
if (low_latency_mode and num_ranks > NUM_MAX_NVL_PEERS) {
EP_HOST_ASSERT(cpu_rdma_team == NVSHMEM_TEAM_INVALID);
EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(nvshmem_team_split_strided(NVSHMEM_TEAM_WORLD, rank % NUM_MAX_NVL_PEERS, NUM_MAX_NVL_PEERS,
num_ranks / NUM_MAX_NVL_PEERS, &cpu_rdma_team_config, 0, &cpu_rdma_team) == 0);
EP_HOST_ASSERT(cpu_rdma_team != NVSHMEM_TEAM_INVALID);
}
nvshmem_barrier_all();
return nvshmem_my_pe();
}
void* alloc(size_t size, size_t alignment) {
return nvshmem_align(alignment, size);
}
void free(void* ptr) {
nvshmem_free(ptr);
}
void barrier() {
nvshmem_barrier_all();
}
void finalize() {
if (cpu_rdma_team != NVSHMEM_TEAM_INVALID) {
nvshmem_team_destroy(cpu_rdma_team);
cpu_rdma_team = NVSHMEM_TEAM_INVALID;
}
nvshmem_finalize();
}
#endif
} // namespace internode
} // namespace deep_ep

View File

@ -0,0 +1,496 @@
#pragma once
#include "exception.cuh"
#define UNROLLED_WARP_COPY(UNROLL_FACTOR, LANE_ID, N, DST, SRC, LD_FUNC, ST_FUNC) \
{ \
constexpr int kLoopStride = 32 * (UNROLL_FACTOR); \
typename std::remove_reference<decltype(LD_FUNC((SRC) + 0))>::type unrolled_values[(UNROLL_FACTOR)]; \
auto __src = (SRC); \
auto __dst = (DST); \
for (int __i = (LANE_ID); __i < ((N) / kLoopStride) * kLoopStride; __i += kLoopStride) { \
_Pragma("unroll") \
for (int __j = 0; __j < (UNROLL_FACTOR); ++ __j) \
unrolled_values[__j] = LD_FUNC(__src + __i + __j * 32); \
_Pragma("unroll") \
for (int __j = 0; __j < (UNROLL_FACTOR); ++ __j) \
ST_FUNC(__dst + __i + __j * 32, unrolled_values[__j]); \
} \
for (int __i = ((N) / kLoopStride) * kLoopStride + (LANE_ID); __i < (N); __i += 32) \
ST_FUNC(__dst + __i, LD_FUNC(__src + __i)); \
}
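// Usage sketch (illustrative): a full warp copies `n` elements from `src` to `dst`, issuing
// `UNROLL_FACTOR` loads and stores per lane per iteration, e.g.
//   UNROLLED_WARP_COPY(5, lane_id, n, dst, src, ld_nc_global, st_na_global);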
namespace deep_ep {
template <int kBytes>
struct VecInt {};
template<> struct VecInt<1> { using vec_t = int8_t; };
template<> struct VecInt<2> { using vec_t = int16_t; };
template<> struct VecInt<4> { using vec_t = int; };
template<> struct VecInt<8> { using vec_t = int64_t; };
template<> struct VecInt<16> { using vec_t = int4; };
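// NOTES: `VecInt<kBytes>::vec_t` maps a byte width to the integer type that the generic `ld_nc_global` / `st_na_global` wrappers below use to reinterpret arbitrary dtypes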
__device__ __forceinline__ void trap() {
asm("trap;");
}
__device__ __forceinline__ void memory_fence() {
asm volatile("fence.acq_rel.sys;":: : "memory");
}
__device__ __forceinline__ void memory_fence_gpu() {
asm volatile("fence.acq_rel.gpu;":: : "memory");
}
__device__ __forceinline__ void memory_fence_cta() {
asm volatile("fence.acq_rel.cta;":: : "memory");
}
__device__ __forceinline__ void st_relaxed_sys_global(const int *ptr, int val) {
asm volatile("st.relaxed.sys.global.s32 [%0], %1;"::"l"(ptr), "r"(val) : "memory");
}
__device__ __forceinline__ void st_release_sys_global(const int *ptr, int val) {
asm volatile("st.release.sys.global.s32 [%0], %1;"::"l"(ptr), "r"(val) : "memory");
}
__device__ __forceinline__ void st_release_cta(const int *ptr, int val) {
asm volatile("st.release.cta.s32 [%0], %1;"::"l"(ptr), "r"(val) : "memory");
}
__device__ __forceinline__ int ld_acquire_sys_global(const int *ptr) {
int ret;
asm volatile("ld.acquire.sys.global.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(const uint64_t *ptr) {
uint64_t ret;
asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int ld_acquire_global(const int *ptr) {
int ret;
asm volatile("ld.acquire.gpu.global.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int atomic_add_release_sys_global(const int* ptr, int value) {
int ret;
asm volatile("atom.add.release.sys.global.s32 %0, [%1], %2;" : "=r"(ret) : "l"(ptr), "r"(value));
return ret;
}
__device__ __forceinline__ int atomic_add_release_global(const int* ptr, int value) {
int ret;
asm volatile("atom.add.release.gpu.global.s32 %0, [%1], %2;" : "=r"(ret) : "l"(ptr), "r"(value));
return ret;
}
__device__ __forceinline__ int ld_acquire_cta(const int *ptr) {
int ret;
asm volatile("ld.acquire.cta.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ uint8_t ld_na_relaxed(const uint8_t *ptr) {
uint16_t ret;
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b8 %0, [%1];" : "=h"(ret) : "l"(ptr));
return static_cast<uint8_t>(ret);
}
__device__ __forceinline__ uint16_t ld_na_relaxed(const uint16_t *ptr) {
uint16_t ret;
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b16 %0, [%1];" : "=h"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ uint32_t ld_na_relaxed(const uint32_t *ptr) {
uint32_t ret;
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ uint64_t ld_na_relaxed(const uint64_t *ptr) {
uint64_t ret;
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int ld_volatile_global(const int *ptr) {
int ret;
asm volatile("ld.volatile.global.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ float ld_volatile_global(const float *ptr) {
float ret;
asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int64_t ld_volatile_global(const int64_t *ptr) {
int64_t ret;
asm volatile("ld.volatile.global.s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int64_t ld_volatile_global(const uint64_t *ptr) {
int64_t ret;
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
#define LD_NC_FUNC "ld.global.nc.L1::no_allocate.L2::256B"
#else
#define LD_NC_FUNC "ld.volatile.global.L2::256B"
#endif
// `ld.global.nc.L1::no_allocate` will be translated into `LDG.E.NA.[width].CONSTANT` in SASS
template <typename dtype_t>
__device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
auto ret = ld_nc_global(reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(ptr));
return *reinterpret_cast<dtype_t*>(&ret);
}
template <>
__device__ __forceinline__ uint8_t ld_nc_global(const uint8_t *ptr) {
uint16_t ret;
// NOTES: we must use `uint16_t` as inline ASM does not support 8-bit constraint letter (`h` below means unsigned 16-bit)
asm volatile(LD_NC_FUNC ".u8 %0, [%1];" : "=h"(ret) : "l"(ptr));
return static_cast<uint8_t>(ret);
}
template <>
__device__ __forceinline__ int ld_nc_global(const int *ptr) {
int ret;
asm volatile(LD_NC_FUNC ".s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
template <>
__device__ __forceinline__ int64_t ld_nc_global(const int64_t *ptr) {
int64_t ret;
asm volatile(LD_NC_FUNC ".s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
template <>
__device__ __forceinline__ float ld_nc_global(const float *ptr) {
float ret;
asm volatile(LD_NC_FUNC ".f32 %0, [%1];" : "=f"(ret) : "l"(ptr));
return ret;
}
template <>
__device__ __forceinline__ int2 ld_nc_global(const int2 *ptr) {
int2 ret;
asm volatile(LD_NC_FUNC ".v2.s32 {%0, %1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : "l"(ptr));
return ret;
}
template <>
__device__ __forceinline__ int4 ld_nc_global(const int4 *ptr) {
int4 ret;
asm volatile(LD_NC_FUNC ".v4.s32 {%0, %1, %2, %3}, [%4];"
: "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : "l"(ptr));
return ret;
}
__device__ __forceinline__ void st_na_relaxed(const uint8_t *ptr, uint8_t val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b8 [%0], %1;" : : "l"(ptr), "h"(static_cast<uint16_t>(val)));
}
__device__ __forceinline__ void st_na_relaxed(const uint16_t *ptr, uint16_t val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b16 [%0], %1;" : : "l"(ptr), "h"(val));
}
__device__ __forceinline__ void st_na_relaxed(const uint32_t *ptr, uint32_t val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
}
__device__ __forceinline__ void st_na_relaxed(const int *ptr, int val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
}
__device__ __forceinline__ void st_na_relaxed(const int4 *ptr, int4 val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.v4.s32 [%0], {%1, %2, %3, %4};"
: : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
}
__device__ __forceinline__ void st_na_release(const int *ptr, int val) {
asm volatile("st.release.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
}
__device__ __forceinline__ void st_na_release(const uint32_t *ptr, uint32_t val) {
asm volatile("st.release.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
}
__device__ __forceinline__ void st_na_release(const uint64_t *ptr, uint64_t val) {
asm volatile("st.release.gpu.global.L1::no_allocate.b64 [%0], %1;" : : "l"(ptr), "l"(val));
}
// `st.global.L1::no_allocate` will be translated into `ST.E.NA.[width]` in SASS
#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
#define ST_NA_FUNC "st.global.L1::no_allocate"
#else
#define ST_NA_FUNC "st.global"
#endif
template <typename dtype_t>
__device__ __forceinline__ void st_na_global(const dtype_t *ptr, const dtype_t& value) {
st_na_global(reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(ptr),
*reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(&value));
}
template <>
__device__ __forceinline__ void st_na_global(const int *ptr, const int& value) {
asm volatile(ST_NA_FUNC ".s32 [%0], %1;" ::"l"(ptr), "r"(value));
}
template <>
__device__ __forceinline__ void st_na_global(const int64_t *ptr, const int64_t& value) {
asm volatile(ST_NA_FUNC ".s64 [%0], %1;" ::"l"(ptr), "l"(value));
}
template <>
__device__ __forceinline__ void st_na_global(const float *ptr, const float& value) {
asm volatile(ST_NA_FUNC ".f32 [%0], %1;" ::"l"(ptr), "f"(value));
}
template <>
__device__ __forceinline__ void st_na_global(const int4 *ptr, const int4& value) {
asm volatile(ST_NA_FUNC ".v4.s32 [%0], {%1, %2, %3, %4};"
::"l"(ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w));
}
// TMA PTX instructions
#ifndef DISABLE_SM90_FEATURES
__device__ __forceinline__ void fence_view_async_shared() {
asm volatile("fence.proxy.async.shared::cta; \n" :: );
}
__device__ __forceinline__ void fence_barrier_init() {
asm volatile("fence.mbarrier_init.release.cluster; \n" :: );
}
__device__ __forceinline__ void mbarrier_init(uint64_t* mbar_ptr, uint32_t arrive_count) {
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
asm volatile("mbarrier.init.shared::cta.b64 [%1], %0;" :: "r"(arrive_count), "r"(mbar_int_ptr));
}
__device__ __forceinline__ void mbarrier_wait(uint64_t* mbar_ptr, uint32_t& phase) {
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
asm volatile("{\n\t"
".reg .pred P1; \n\t"
"LAB_WAIT: \n\t"
"mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1, %2; \n\t"
"@P1 bra DONE; \n\t"
"bra LAB_WAIT; \n\t"
"DONE: \n\t"
"}" :: "r"(mbar_int_ptr), "r"(phase), "r"(0x989680));
phase ^= 1;
}
__device__ __forceinline__ void mbarrier_arrive_and_expect_tx(uint64_t* mbar_ptr, int num_bytes) {
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
asm volatile("mbarrier.arrive.expect_tx.shared::cta.b64 _, [%1], %0; \n\t" :: "r"(num_bytes), "r"(mbar_int_ptr));
}
__device__ __forceinline__ void tma_store_fence() {
asm volatile ("fence.proxy.async.shared::cta;");
}
constexpr uint64_t kEvictFirst = 0x12f0000000000000;
constexpr uint64_t kEvictNormal = 0x1000000000000000;
__device__ __forceinline__ void tma_load_1d(const void* smem_ptr, const void* gmem_ptr, uint64_t* mbar_ptr, int num_bytes,
bool evict_first = true) {
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
asm volatile("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%0], [%1], %2, [%3], %4;\n"
:: "r"(smem_int_ptr), "l"(gmem_ptr), "r"(num_bytes), "r"(mbar_int_ptr), "l"(cache_hint) : "memory");
}
__device__ __forceinline__ void tma_store_1d(const void* smem_ptr, const void* gmem_ptr, int num_bytes,
bool evict_first = true) {
auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
asm volatile("cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%0], [%1], %2, %3;\n"
:: "l"(gmem_ptr), "r"(smem_int_ptr), "r"(num_bytes), "l"(cache_hint) : "memory");
asm volatile("cp.async.bulk.commit_group;");
}
template <int N = 0>
__device__ __forceinline__ void tma_store_wait() {
asm volatile("cp.async.bulk.wait_group.read %0;" :: "n"(N) : "memory");
}
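// Typical global-to-global round trip through shared memory (illustrative, see the dispatch receiver):
//   tma_store_wait();                             // wait for previously issued bulk stores
//   tma_load_1d(smem, gmem_src, mbar, bytes);     // bulk load into shared memory
//   mbarrier_arrive_and_expect_tx(mbar, bytes);   // arm the mbarrier with the expected byte count
//   mbarrier_wait(mbar, phase);                   // wait until the load completes
//   tma_store_1d(smem, gmem_dst, bytes, false);   // bulk store back to global memory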
#endif
template <typename dtype_t>
__host__ __device__ dtype_t ceil_div(dtype_t a, dtype_t b) {
return (a + b - 1) / b;
}
template <typename dtype_t>
__host__ __device__ dtype_t align(dtype_t a, dtype_t b) {
return ceil_div<dtype_t>(a, b) * b;
}
__forceinline__ __device__ void get_channel_task_range(int num_tokens, int num_sms, int sm_id,
int& token_start_idx, int& token_end_idx) {
int num_tokens_per_sm = ceil_div(num_tokens, num_sms);
token_start_idx = min(num_tokens_per_sm * sm_id, num_tokens);
token_end_idx = min(token_start_idx + num_tokens_per_sm, num_tokens);
}
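// e.g. num_tokens = 10 and num_sms = 4 give per-SM chunks of 3: [0, 3), [3, 6), [6, 9), [9, 10)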
template <typename dtype_a_t, typename dtype_b_t>
__device__ __forceinline__ dtype_b_t pack2(const dtype_a_t& x, const dtype_a_t& y) {
EP_STATIC_ASSERT(sizeof(dtype_a_t) * 2 == sizeof(dtype_b_t), "Invalid dtypes");
dtype_b_t packed;
auto unpacked_ptr = reinterpret_cast<dtype_a_t*>(&packed);
unpacked_ptr[0] = x, unpacked_ptr[1] = y;
return packed;
}
template <typename dtype_a_t, typename dtype_b_t>
__device__ __forceinline__ void unpack2(const dtype_b_t& packed, dtype_a_t& x, dtype_a_t& y) {
EP_STATIC_ASSERT(sizeof(dtype_a_t) * 2 == sizeof(dtype_b_t), "Invalid dtypes");
auto unpacked_ptr = reinterpret_cast<const dtype_a_t*>(&packed);
x = unpacked_ptr[0], y = unpacked_ptr[1];
}
template <typename dtype_t>
__device__ __forceinline__ dtype_t broadcast(dtype_t& ptr, int src_lane_idx) {
EP_STATIC_ASSERT(sizeof(dtype_t) % sizeof(int) == 0, "");
auto send_int_values = reinterpret_cast<int*>(&ptr);
int recv_int_values[sizeof(dtype_t) / sizeof(int)];
#pragma unroll
for (int i = 0; i < sizeof(dtype_t) / sizeof(int); ++ i)
recv_int_values[i] = __shfl_sync(0xffffffff, send_int_values[i], src_lane_idx);
return *reinterpret_cast<dtype_t*>(recv_int_values);
}
__forceinline__ __device__ int warp_reduce_sum(int value) {
value += __shfl_xor_sync(0xffffffff, value, 16);
value += __shfl_xor_sync(0xffffffff, value, 8);
value += __shfl_xor_sync(0xffffffff, value, 4);
value += __shfl_xor_sync(0xffffffff, value, 2);
value += __shfl_xor_sync(0xffffffff, value, 1);
return value;
}
__forceinline__ __device__ float half_warp_reduce_max(float value) {
auto mask = __activemask();
// The mask is expected to be either `0xffffffff` (full warp) or `0xffff` (half warp)
value = max(value, __shfl_xor_sync(mask, value, 8));
value = max(value, __shfl_xor_sync(mask, value, 4));
value = max(value, __shfl_xor_sync(mask, value, 2));
value = max(value, __shfl_xor_sync(mask, value, 1));
return value;
}
__forceinline__ __device__ int get_lane_id() {
int lane_id;
asm("mov.s32 %0, %laneid;" : "=r"(lane_id));
return lane_id;
}
constexpr float kFP8Margin = 1e-4;
constexpr float kFinfoAmaxE4M3 = 448.0f;
constexpr float kFinfoAmaxInvE4M3 = 1 / 448.0f;
__forceinline__ __device__ float fast_pow2(int x) {
// We can ensure `-126 <= x and x <= 127`
uint32_t bits_x = (x + 127) << 23;
return *reinterpret_cast<float*>(&bits_x);
}
__forceinline__ __device__ int fast_log2_ceil(float x) {
auto bits_x = *reinterpret_cast<uint32_t*>(&x);
auto exp_x = (bits_x >> 23) & 0xff;
auto man_bits = bits_x & ((1 << 23) - 1);
return exp_x - 127 + (man_bits != 0);
}
__forceinline__ __device__ void calculate_fp8_scales(float amax, float& scale, float& scale_inv, bool round_scale) {
if (round_scale) {
auto exp_scale_inv = fast_log2_ceil(amax * kFinfoAmaxInvE4M3);
scale = fast_pow2(-exp_scale_inv);
scale_inv = fast_pow2(exp_scale_inv);
} else {
scale_inv = amax * kFinfoAmaxInvE4M3;
scale = kFinfoAmaxE4M3 / amax;
}
}
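// e.g. with `round_scale` and amax = 1000: 1000 / 448 ≈ 2.23, `fast_log2_ceil` gives 2, so scale = 2^-2 = 0.25 and scale_inv = 4, a power-of-two pair that `extract_required_scale_format<true>` can store losslessly as UE8M0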
template <bool kIsUE8M0, typename out_dtype_t = std::conditional_t<kIsUE8M0, uint8_t, float>>
__forceinline__ __device__ out_dtype_t extract_required_scale_format(float value) {
if constexpr (kIsUE8M0) {
return static_cast<uint8_t>((*reinterpret_cast<uint32_t*>(&value)) >> 23);
} else {
return value;
}
}
template <int kNumRanks, bool kSyncOnly = false>
__forceinline__ __device__ void
barrier_block(int** barrier_signal_ptrs, int rank) {
auto thread_id = static_cast<int>(threadIdx.x);
// For non-sync-only cases, the memory operations by other threads in the block must be visible to the `sys` scope
if constexpr (not kSyncOnly) {
memory_fence();
__syncthreads();
}
// Add to this rank's own signals, subtract from the other ranks' signals
if (thread_id < kNumRanks) {
atomicAdd_system(barrier_signal_ptrs[rank] + thread_id, FINISHED_SUM_TAG);
atomicSub_system(barrier_signal_ptrs[thread_id] + rank, FINISHED_SUM_TAG);
}
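// NOTES: slot `t` of this rank's signal array drops back to <= 0 only after peer rank `t` has issued its matching subtract, so waiting below for all slots to be non-positive implements a full inter-rank barrier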
EP_DEVICE_ASSERT(kNumRanks <= blockDim.x);
// Check timeout
auto start_time = clock64();
while (true) {
auto value = thread_id < kNumRanks ? ld_volatile_global(barrier_signal_ptrs[rank] + thread_id) : 0;
if (__all_sync(0xffffffff, value <= 0))
break;
if (clock64() - start_time > NUM_TIMEOUT_CYCLES and get_lane_id() == 0) {
printf("DeepEP timeout check failed: rank = %d, thread = %d)\n", rank, thread_id);
trap();
}
}
__syncthreads();
}
__forceinline__ __device__ int atomic_cas_cta_acquire(int* addr, int x, int y) {
int ret;
asm volatile("atom.acquire.cta.shared::cta.cas.b32 %0, [%1], %2, %3;" : "=r"(ret) : "l"(addr), "r"(x), "r"(y) : "memory");
return ret;
}
__forceinline__ __device__ int atomic_exch_cta_release(int* addr, int x) {
int ret;
asm volatile("atom.release.cta.shared::cta.exch.b32 %0, [%1], %2;" : "=r"(ret) : "l"(addr), "r"(x) : "memory");
return ret;
}
__forceinline__ __device__ void acquire_lock(int* mutex) {
// To make later memory operations valid, we must use `acquire` for memory semantics
while (atomic_cas_cta_acquire(mutex, 0, 1) != 0);
}
__forceinline__ __device__ void release_lock(int* mutex) {
// To make previous memory operations visible to other threads, we must use `release` for memory semantics
atomic_exch_cta_release(mutex, 0);
}
} // namespace deep_ep

View File

@ -0,0 +1,7 @@
import torch
from .utils import EventOverlap
from .buffer import Buffer
# noinspection PyUnresolvedReferences
from deep_ep_cpp import Config

617
DeepEP/deep_ep/buffer.py Normal file
View File

@ -0,0 +1,617 @@
import os
import torch
import torch.distributed as dist
from typing import Callable, List, Tuple, Optional, Union
# noinspection PyUnresolvedReferences
import deep_ep_cpp
# noinspection PyUnresolvedReferences
from deep_ep_cpp import Config, EventHandle
from .utils import EventOverlap, check_nvlink_connections
class Buffer:
"""
The core expert-parallel (EP) communication buffer for Mixture-of-Experts (MoE) models, which supports:
- high-throughput intranode all-to-all (dispatch and combine, using NVLink)
- high-throughput internode all-to-all (dispatch and combine, using RDMA and NVLink)
- low-latency all-to-all (dispatch and combine, using RDMA)
Attributes:
num_sms: the number of SMs used in high-throughput kernels.
rank: the local rank number.
group_size: the number of ranks in the group.
group: the communication group.
num_nvl_bytes: the buffer size for intranode NVLink communication.
num_rdma_bytes: the buffer size for internode (also for intranode with low-latency mode) RDMA communication.
runtime: the C++ runtime.
"""
num_sms: int = 20
def __init__(self, group: dist.ProcessGroup,
num_nvl_bytes: int = 0, num_rdma_bytes: int = 0,
low_latency_mode: bool = False, num_qps_per_rank: int = 24,
allow_nvlink_for_low_latency_mode: bool = True,
allow_mnnvl: bool = False) -> None:
"""
Initialize the communication buffer.
Arguments:
group: the communication group.
num_nvl_bytes: the buffer size for intranode NVLink communication.
num_rdma_bytes: the buffer size for internode (also for intranode with low-latency mode) RDMA communication.
low_latency_mode: whether to enable low-latency mode.
num_qps_per_rank: the number of QPs for RDMA; low-latency mode requires this to equal
the number of local experts.
allow_nvlink_for_low_latency_mode: whether to allow NVLink traffic in low-latency mode; note that
this is somewhat incompatible with hook-based overlapping.
Warning: PCIe connections may lead to errors due to memory-ordering issues;
please make sure all connections are via NVLink.
allow_mnnvl: whether to allow multi-node NVLink (MNNVL).
"""
check_nvlink_connections(group)
# Initialize the CPP runtime
self.rank = group.rank()
self.group_size = group.size()
self.group = group
self.num_nvl_bytes = num_nvl_bytes
self.num_rdma_bytes = num_rdma_bytes
self.low_latency_mode = low_latency_mode
self.runtime = deep_ep_cpp.Buffer(self.rank, self.group_size, num_nvl_bytes, num_rdma_bytes, low_latency_mode)
# Synchronize device IDs
device_ids = [None, ] * self.group_size
local_device_id = self.runtime.get_local_device_id()
dist.all_gather_object(device_ids, local_device_id, group)
# Synchronize IPC handles
ipc_handles = [None, ] * self.group_size
local_ipc_handle = self.runtime.get_local_ipc_handle()
dist.all_gather_object(ipc_handles, local_ipc_handle, group)
# Synchronize NVSHMEM unique IDs
root_unique_id = None
if self.runtime.get_num_rdma_ranks() > 1 or low_latency_mode:
# Enable IBGDA
assert num_qps_per_rank > 0
os.environ['NVSHMEM_DISABLE_P2P'] = '0' if allow_nvlink_for_low_latency_mode else '1'
os.environ['NVSHMEM_IB_ENABLE_IBGDA'] = '1'
os.environ['NVSHMEM_IBGDA_NIC_HANDLER'] = 'gpu'
os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}'
# Make sure the QP depth is always larger than the number of in-flight WRs, so that we can skip the WQ slot check
os.environ['NVSHMEM_QP_DEPTH'] = '1024'
# Reduce GPU memory usage
# 6 default teams + 1 extra team
os.environ['NVSHMEM_MAX_TEAMS'] = '7'
# Disable NVLink SHArP
os.environ['NVSHMEM_DISABLE_NVLS'] = '1'
# NOTES: NVSHMEM initialization requires at least 256 MiB
os.environ['NVSHMEM_CUMEM_GRANULARITY'] = f'{2 ** 29}'
if not allow_mnnvl:
# Disable multi-node NVLink detection
os.environ['NVSHMEM_DISABLE_MNNVL'] = '1'
# Synchronize using the root ID
nvshmem_unique_ids = [None, ] * self.group_size
if (low_latency_mode and self.rank == 0) or (not low_latency_mode and self.runtime.get_rdma_rank() == 0):
root_unique_id = self.runtime.get_local_nvshmem_unique_id()
dist.all_gather_object(nvshmem_unique_ids, root_unique_id, group)
root_unique_id = nvshmem_unique_ids[0 if low_latency_mode else self.runtime.get_root_rdma_rank(True)]
# Make CPP runtime available
self.runtime.sync(device_ids, ipc_handles, root_unique_id)
assert self.runtime.is_available()
@staticmethod
def is_sm90_compiled():
return deep_ep_cpp.is_sm90_compiled()
@staticmethod
def set_num_sms(new_num_sms: int) -> None:
"""
Set the number of SMs to use in high-throughput kernels.
Arguments:
new_num_sms: the new number to be set.
"""
assert new_num_sms % 2 == 0, 'The SM count must be even'
Buffer.num_sms = new_num_sms
@staticmethod
def capture() -> EventOverlap:
"""
Capture a CUDA event on the current stream, i.e. `torch.cuda.current_stream()`.
Returns:
event: the captured event.
"""
return EventOverlap(EventHandle())
@staticmethod
def get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank: int, hidden: int, num_ranks: int, num_experts: int) -> int:
"""
Get a minimum size requirement for the RDMA buffer. The size calculation will be done with BF16.
Arguments:
num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
hidden: the hidden dimension of each token.
num_ranks: the number of EP group ranks.
num_experts: the number of all experts.
Returns:
size: the RDMA buffer size recommended.
"""
return deep_ep_cpp.get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts)
def get_comm_stream(self) -> torch.Stream:
"""
Get the communication stream.
Returns:
stream: the communication stream.
"""
ts: torch.Stream = self.runtime.get_comm_stream()
return torch.cuda.Stream(stream_id=ts.stream_id, device_index=ts.device_index, device_type=ts.device_type)
def get_local_buffer_tensor(self, dtype: torch.dtype, size: Optional[torch.Size] = None,
offset: int = 0, use_rdma_buffer: bool = False) -> torch.Tensor:
"""
Get the raw buffer (slice supported) as a PyTorch tensor.
Arguments:
dtype: the data type (PyTorch `dtype`) for the tensor.
size: the slice size (by elements) to get from the buffer.
offset: the offset of the beginning element.
use_rdma_buffer: whether to return the RDMA buffer.
"""
tensor = self.runtime.get_local_buffer_tensor(dtype, offset, use_rdma_buffer)
if size is None:
return tensor
assert tensor.numel() >= size.numel()
return tensor[:size.numel()].view(size)
@staticmethod
def _unpack_bias(bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]):
bias_0, bias_1 = None, None
if isinstance(bias, torch.Tensor):
bias_0 = bias
elif isinstance(bias, tuple):
assert len(bias) == 2
bias_0, bias_1 = bias
return bias_0, bias_1
@staticmethod
def get_dispatch_config(num_ranks: int) -> Config:
"""
Get a recommended dispatch config.
Argument:
num_ranks: the number of ranks.
Returns:
config: the recommended config.
"""
# TODO: automatically tune
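# NOTES: judging from the tuning loops in the tests, the positional `Config` arguments appear to be
# (num_sms, nvl_chunk_size, nvl_buffer_size, rdma_chunk_size, rdma_buffer_size)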
config_map = {
2: Config(Buffer.num_sms, 24, 256, 6, 128),
4: Config(Buffer.num_sms, 6, 256, 6, 128),
8: Config(Buffer.num_sms, 6, 256, 6, 128),
16: Config(Buffer.num_sms, 16, 288, 20, 128),
24: Config(Buffer.num_sms, 8, 288, 32, 128),
32: Config(Buffer.num_sms, 8, 288, 32, 128),
64: Config(Buffer.num_sms, 20, 288, 28, 128),
128: Config(Buffer.num_sms, 20, 560, 32, 128),
144: Config(Buffer.num_sms, 32, 720, 12, 128),
160: Config(Buffer.num_sms, 28, 720, 12, 128),
}
assert num_ranks in config_map, f'Unsupported number of EP ranks: {num_ranks}'
return config_map[num_ranks]
@staticmethod
def get_combine_config(num_ranks: int) -> Config:
"""
Get a recommended combine config.
Argument:
num_ranks: the number of ranks.
Returns:
config: the recommended config.
"""
# TODO: automatically tune
config_map = {
2: Config(Buffer.num_sms, 10, 256, 6, 128),
4: Config(Buffer.num_sms, 9, 256, 6, 128),
8: Config(Buffer.num_sms, 4, 256, 6, 128),
16: Config(Buffer.num_sms, 2, 288, 28, 128),
24: Config(Buffer.num_sms, 1, 288, 20, 128),
32: Config(Buffer.num_sms, 1, 288, 20, 128),
64: Config(Buffer.num_sms, 1, 288, 20, 128),
128: Config(Buffer.num_sms, 1, 560, 12, 128),
144: Config(Buffer.num_sms, 2, 720, 8, 128),
160: Config(Buffer.num_sms, 2, 720, 8, 128),
}
assert num_ranks in config_map, f'Unsupported number of EP ranks: {num_ranks}'
return config_map[num_ranks]
# noinspection PyTypeChecker
def get_dispatch_layout(self, topk_idx: torch.Tensor, num_experts: int,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, EventOverlap]:
"""
Calculate the layout required for later communication.
Arguments:
topk_idx: `[num_tokens, num_topk]`, dtype must be `torch.int64`, the expert indices selected by each token,
`-1` means no selections.
num_experts: the number of experts.
previous_event: the event to wait before actually executing the kernel.
async_finish: the current stream will not wait for the communication kernels to be finished if set.
allocate_on_comm_stream: whether to put the allocated tensors' ownership on the communication stream.
Returns:
num_tokens_per_rank: `[num_ranks]` with `torch.int`, the number of tokens to be sent to each rank.
num_tokens_per_rdma_rank: `[num_rdma_ranks]` with `torch.int`, the number of tokens to be sent to each RDMA
rank (i.e. the ranks with the same GPU index); `None` is returned for intranode settings.
num_tokens_per_expert: `[num_experts]` with `torch.int`, the number of tokens to be sent to each expert.
is_token_in_rank: `[num_tokens, num_ranks]` with `torch.bool`, whether a token is sent to a rank.
event: the event after executing the kernel (valid only if `async_finish` is set).
"""
num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, event = \
self.runtime.get_dispatch_layout(topk_idx, num_experts, getattr(previous_event, 'event', None),
async_finish, allocate_on_comm_stream)
return num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, EventOverlap(event)
# noinspection PyTypeChecker
def dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
handle: Optional[Tuple] = None,
num_tokens_per_rank: Optional[torch.Tensor] = None, num_tokens_per_rdma_rank: Optional[torch.Tensor] = None,
is_token_in_rank: Optional[torch.Tensor] = None, num_tokens_per_expert: Optional[torch.Tensor] = None,
topk_idx: Optional[torch.Tensor] = None, topk_weights: Optional[torch.Tensor] = None,
expert_alignment: int = 1, num_worst_tokens: int = 0,
config: Optional[Config] = None,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], Optional[torch.Tensor],
Optional[torch.Tensor], List[int], Tuple, EventOverlap]:
"""
Dispatch tokens to different ranks; both intranode and internode settings are supported.
Intranode kernels require all ranks to be visible via NVLink.
Internode kernels require the ranks within a node to be visible via NVLink, and the ranks with the same GPU
index across nodes to be visible via RDMA.
Arguments:
x: `torch.Tensor` or tuple of `torch.Tensor`, for the first type, the shape must be `[num_tokens, hidden]`,
and type must be `torch.bfloat16`; for the second type, the first element of the tuple must be shaped as
`[num_tokens, hidden]` with type `torch.float8_e4m3fn`, the second must be `[num_tokens, hidden // 128]`
(requiring `hidden` to be divisible by 128) with type `torch.float`.
handle: an optional communication handle, if set, the CPU will reuse the layout information to save some time.
num_tokens_per_rank: `[num_ranks]` with `torch.int`, the number of tokens to be sent to each rank.
num_tokens_per_rdma_rank: `[num_rdma_ranks]` with `torch.int`, the number of tokens to be sent to each RDMA
rank (i.e. the ranks with the same GPU index); may be `None` for intranode settings.
is_token_in_rank: `[num_tokens, num_ranks]` with `torch.bool`, whether a token is sent to a rank.
num_tokens_per_expert: `[num_experts]` with `torch.int`, the number of tokens to be sent to each expert.
topk_idx: `[num_tokens, num_topk]` with `torch.int64`, the expert indices selected by each token,
`-1` means no selections.
topk_weights: `[num_tokens, num_topk]` with `torch.float`, the expert weights of each token to dispatch.
expert_alignment: align the number of tokens received by each local expert to this variable.
num_worst_tokens: the worst-case number of tokens to receive; if specified, there will be no CPU sync, and
the kernel will be CUDA-graph compatible. Please also notice that this flag is for intranode dispatch only.
config: the performance tuning config.
previous_event: the event to wait before actually executing the kernel.
async_finish: the current stream will not wait for the communication kernels to be finished if set.
allocate_on_comm_stream: whether to put the allocated tensors' ownership on the communication stream.
Returns:
recv_x: received tokens, with the same type (and tuple structure) as the input `x`, but with the number of
tokens equal to the received token count.
recv_topk_idx: received expert indices.
recv_topk_weights: received expert weights.
num_recv_tokens_per_expert_list: Python list shaped `[num_local_experts]`, the received token count by
each local expert, aligned to the input `expert_alignment`. If `num_worst_tokens` is specified, the list
will be empty.
handle: the returned communication handle.
event: the event after executing the kernel (valid only if `async_finish` is set).
"""
# Default config
config = self.get_dispatch_config(self.group_size) if config is None else config
# Internode
if self.runtime.get_num_rdma_ranks() > 1:
assert num_worst_tokens == 0, 'Internode dispatch does not support `num_worst_tokens > 0`'
return self.internode_dispatch(x, handle, num_tokens_per_rank, num_tokens_per_rdma_rank, is_token_in_rank, num_tokens_per_expert,
topk_idx, topk_weights, expert_alignment, config, previous_event, async_finish, allocate_on_comm_stream)
# Launch the kernel with cached or non-cached mode
x, x_scales = x if isinstance(x, tuple) else (x, None)
if handle is not None:
assert topk_idx is None and topk_weights is None
rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head = handle
num_recv_tokens = recv_src_idx.size(0)
recv_x, recv_x_scales, _, _, _, _, _, _, _, _, event = self.runtime.intranode_dispatch(
x, x_scales, None, None,
None, is_token_in_rank, None, num_recv_tokens, rank_prefix_matrix, channel_prefix_matrix,
expert_alignment, num_worst_tokens, config,
getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, None, None, None, None, EventOverlap(event)
else:
assert num_tokens_per_rank is not None and is_token_in_rank is not None and num_tokens_per_expert is not None
recv_x, recv_x_scales, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, send_head, event = \
self.runtime.intranode_dispatch(x, x_scales, topk_idx, topk_weights,
num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, 0, None, None,
expert_alignment, num_worst_tokens, config,
getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
handle = (rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head)
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, EventOverlap(event)
# noinspection PyTypeChecker
def combine(self, x: torch.Tensor, handle: Tuple,
topk_weights: Optional[torch.Tensor] = None,
bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] = None,
config: Optional[Config] = None,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[torch.Tensor, Optional[torch.Tensor], EventOverlap]:
"""
Combine (reduce) tokens (addition **without** weights) from different ranks; both intranode and internode
settings are supported.
Intranode kernels require all ranks to be visible via NVLink.
Internode kernels require the ranks within a node to be visible via NVLink, and the ranks with the same GPU
index across nodes to be visible via RDMA.
Arguments:
x: `[num_tokens, hidden]` with `torch.bfloat16`, the tokens to send back to their original ranks for reduction.
handle: a required communication handle, obtained from the `dispatch` function.
topk_weights: `[num_tokens, num_topk]` with `torch.float`, the tokens' top-k weights to reduce onto their original ranks.
config: the performance tuning config.
previous_event: the event to wait before actually executing the kernel.
async_finish: the current stream will not wait for the communication kernels to be finished if set.
allocate_on_comm_stream: whether to put the allocated tensors' ownership on the communication stream.
Returns:
recv_x: the reduced tokens from their dispatched ranks.
recv_topk_weights: the reduced top-k weights from their dispatched ranks.
event: the event after executing the kernel (valid only if `async_finish` is set).
"""
# Default config
config = self.get_combine_config(self.group_size) if config is None else config
# Internode
if self.runtime.get_num_rdma_ranks() > 1:
return self.internode_combine(x, handle, topk_weights, bias, config, previous_event, async_finish, allocate_on_comm_stream)
# NOTES: the second `_` is for the sending side, so we should use the third one
rank_prefix_matrix, _, channel_prefix_matrix, src_idx, is_recv_token_in_rank, send_head = handle
bias_0, bias_1 = Buffer._unpack_bias(bias)
# Launch the kernel
recv_x, recv_topk_weights, event = self.runtime.intranode_combine(
x, topk_weights, bias_0, bias_1,
src_idx, rank_prefix_matrix, channel_prefix_matrix, send_head, config,
getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
return recv_x, recv_topk_weights, EventOverlap(event)
# noinspection PyTypeChecker
def internode_dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
handle: Optional[Tuple] = None,
num_tokens_per_rank: Optional[torch.Tensor] = None, num_tokens_per_rdma_rank: Optional[torch.Tensor] = None,
is_token_in_rank: Optional[torch.Tensor] = None, num_tokens_per_expert: Optional[torch.Tensor] = None,
topk_idx: Optional[torch.Tensor] = None, topk_weights: Optional[torch.Tensor] = None, expert_alignment: int = 1,
config: Optional[Config] = None,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], Optional[torch.Tensor],
Optional[torch.Tensor], List[int], Tuple, EventOverlap]:
"""
Internode dispatch implementation; for more details, please refer to the `dispatch` docs.
Normally, you should not call this function directly.
"""
assert config is not None
# Launch the kernel with cached or non-cached mode
x, x_scales = x if isinstance(x, tuple) else (x, None)
if handle is not None:
assert topk_idx is None and topk_weights is None
is_token_in_rank, \
rdma_channel_prefix_matrix, gbl_channel_prefix_matrix, \
recv_rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, recv_gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum, \
recv_src_meta, send_rdma_head, send_nvl_head = handle
num_recv_tokens = recv_src_meta.size(0)
num_rdma_recv_tokens = send_nvl_head.size(0)
recv_x, recv_x_scales, _, _, _, _, _, _, _, _, _, _, _, _, event = self.runtime.internode_dispatch(
x, x_scales, topk_idx, topk_weights,
None, None, is_token_in_rank, None,
num_recv_tokens, num_rdma_recv_tokens,
rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum,
expert_alignment, config, getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, None, None, None, None, EventOverlap(event)
else:
assert num_tokens_per_rank is not None and is_token_in_rank is not None and num_tokens_per_expert is not None
recv_x, recv_x_scales, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, \
rdma_channel_prefix_matrix, gbl_channel_prefix_matrix, \
recv_rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, \
recv_gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum, \
recv_src_meta, send_rdma_head, send_nvl_head, event = self.runtime.internode_dispatch(
x, x_scales, topk_idx, topk_weights,
num_tokens_per_rank, num_tokens_per_rdma_rank, is_token_in_rank, num_tokens_per_expert,
0, 0, None, None, None, None,
expert_alignment, config, getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
handle = (is_token_in_rank,
rdma_channel_prefix_matrix, gbl_channel_prefix_matrix,
recv_rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, recv_gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum,
recv_src_meta, send_rdma_head, send_nvl_head)
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, EventOverlap(event)
# noinspection PyTypeChecker
def internode_combine(self, x: torch.Tensor, handle: Union[tuple, list],
topk_weights: Optional[torch.Tensor] = None,
bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] = None,
config: Optional[Config] = None,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[torch.Tensor, Optional[torch.Tensor], EventOverlap]:
"""
Internode combine implementation; for more details, please refer to the `combine` docs.
Normally, you should not call this function directly.
"""
assert config is not None
# Unpack handle and bias
is_combined_token_in_rank, \
_, _, \
rdma_channel_prefix_matrix, rdma_rank_prefix_sum, gbl_channel_prefix_matrix, gbl_rank_prefix_sum, \
src_meta, send_rdma_head, send_nvl_head = handle
bias_0, bias_1 = Buffer._unpack_bias(bias)
# Launch the kernel
combined_x, combined_topk_weights, event = self.runtime.internode_combine(
x, topk_weights, bias_0, bias_1,
src_meta, is_combined_token_in_rank,
rdma_channel_prefix_matrix, rdma_rank_prefix_sum, gbl_channel_prefix_matrix,
send_rdma_head, send_nvl_head, config, getattr(previous_event, 'event', None),
async_finish, allocate_on_comm_stream)
return combined_x, combined_topk_weights, EventOverlap(event)
def clean_low_latency_buffer(self, num_max_dispatch_tokens_per_rank: int, hidden: int, num_experts: int) -> None:
"""
Low-latency kernels require part of the buffer to be zero-initialized, so it is vital to clean the buffer
if it has become dirty.
For example, after running the normal dispatch/combine, you must call this function before executing any
low-latency kernel.
Arguments:
num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
hidden: the hidden dimension of each token.
num_experts: the number of all experts.
"""
self.runtime.clean_low_latency_buffer(num_max_dispatch_tokens_per_rank, hidden, num_experts)
# noinspection PyTypeChecker
def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
num_max_dispatch_tokens_per_rank: int, num_experts: int,
cumulative_local_expert_recv_stats: Optional[torch.Tensor] = None,
use_fp8: bool = True, round_scale: bool = False, use_ue8m0: bool = False,
async_finish: bool = False, return_recv_hook: bool = False) -> \
Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor, Tuple, EventOverlap, Callable]:
"""
A low-latency implementation for dispatching with IBGDA.
This kernel requires all ranks (whether intranode or internode) to be visible via RDMA
(specifically, IBGDA must be enabled).
Warning: as there are only two buffers and the returned tensors reuse them, you cannot hold more than two
low-latency kernels' result tensors at a single moment.
Arguments:
x: `torch.Tensor` with `torch.bfloat16`, shaped as `[num_tokens, hidden]`, only several hidden shapes are
supported. The number of tokens to be dispatched must be less than `num_max_dispatch_tokens_per_rank`.
topk_idx: `torch.Tensor` with `torch.int64`, shaped as `[num_tokens, num_topk]`, only several top-k shapes
are supported. `-1` indices (not selecting any expert) are supported.
num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
num_experts: the number of all experts.
cumulative_local_expert_recv_stats: a cumulative expert count tensor for statistics, which should have shape
`[num_local_experts]` and dtype `torch.int`. This is useful for EP load-balance monitoring in online serving.
use_fp8: whether to enable FP8 casting; if enabled, the received data will be a tuple of an FP8 tensor and its scaling factors.
round_scale: whether to round the scaling factors to powers of 2.
use_ue8m0: whether to use UE8M0 as the scaling-factor format (available only with `round_scale=True`).
async_finish: the current stream will not wait for the communication kernels to be finished if set.
return_recv_hook: return a receiving hook if set. If set, the kernel only issues the RDMA requests,
**without actually receiving the data**; you must call the returned hook to make sure the data has arrived.
If you do not set this flag, the kernel itself ensures the data's arrival.
Returns:
recv_x: a tensor or tuple with received tokens for each expert.
With `use_fp8=True`: the first element is a `torch.Tensor` shaped as
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.float8_e4m3fn`.
The second tensor is the corresponding scales for the first element with shape
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden // 128]` with `torch.float`,
if `use_ue8m0=False`. With `use_ue8m0=True`, the second one is packed and shaped as
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden // 512]` with type `torch.int`.
Notice that the last two dimensions of the scaling tensors are column-major for TMA compatibility.
With `use_fp8=False`, the result would be a tensor shaped as
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.bfloat16`.
Moreover, not all tokens are valid; only some of the `num_max_dispatch_tokens_per_rank * num_ranks` slots are,
as we do not synchronize the CPU-side received count with the GPU (syncing would also break CUDA-graph compatibility).
recv_count: a tensor shaped `[num_local_experts]` with type `torch.int`, indicating how many tokens each
expert receives. As mentioned before, not all tokens are valid in `recv_x`.
handle: the communication handle to be used in the `low_latency_combine` function.
event: the event after executing the kernel (valid only if `async_finish` is set).
hook: the receiving hook function (valid only if `return_recv_hook` is set).
"""
packed_recv_x, packed_recv_x_scales, packed_recv_count, packed_recv_src_info, packed_recv_layout_range, event, hook = \
self.runtime.low_latency_dispatch(x, topk_idx,
cumulative_local_expert_recv_stats,
num_max_dispatch_tokens_per_rank, num_experts,
use_fp8, round_scale, use_ue8m0,
async_finish, return_recv_hook)
handle = (packed_recv_src_info, packed_recv_layout_range, num_max_dispatch_tokens_per_rank, x.size(1), num_experts)
tensors_to_record = (x, topk_idx,
packed_recv_x, packed_recv_x_scales, packed_recv_count,
packed_recv_src_info, packed_recv_layout_range,
cumulative_local_expert_recv_stats)
return (packed_recv_x, packed_recv_x_scales) if use_fp8 else packed_recv_x, packed_recv_count, handle, \
EventOverlap(event, tensors_to_record if async_finish else None), hook
# noinspection PyTypeChecker
def low_latency_combine(self, x: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor,
handle: tuple, zero_copy: bool = False, async_finish: bool = False,
return_recv_hook: bool = False, out: Optional[torch.Tensor] = None) -> \
Tuple[torch.Tensor, EventOverlap, Callable]:
"""
A low-latency implementation for combining tokens (reduce **with weights**) with IBGDA.
This kernel requires all ranks (whether intranode or internode) to be visible via RDMA
(specifically, IBGDA must be enabled).
Warning: as there are only two buffers and the returned tensors reuse them, you cannot hold more than two
low-latency kernels' result tensors at a single moment.
Arguments:
x: `[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.bfloat16`,
the locally computed tokens to be sent back to their original ranks and reduced.
topk_idx: `[num_combined_tokens, num_topk]` with `torch.int64`, the expert indices selected by the dispatched
tokens. `-1` indices (not selecting any expert) are supported. Note that `num_combined_tokens` equals
the number of dispatched tokens.
topk_weights: `[num_combined_tokens, num_topk]` with `torch.float`, the expert weights selected by the dispatched
tokens. The received tokens will be reduced with the weights in this tensor.
handle: the communication handle given by the `dispatch` function.
zero_copy: whether the tensor has already been copied into the RDMA buffer; this should be used together
with `get_next_low_latency_combine_buffer`.
async_finish: the current stream will not wait for the communication kernels to be finished if set.
return_recv_hook: return a receiving hook if set. If set, the kernel only issues the RDMA requests,
**without actually receiving the data**; you must call the returned hook to make sure the data has arrived.
If you do not set this flag, the kernel itself ensures the data's arrival.
out: the in-place output tensor, if set, the kernel will write the result to this tensor and return it directly.
Returns:
combined_x: the reduced token tensor, with shape `[num_combined_tokens, hidden]` and type `torch.bfloat16`.
event: the event after executing the kernel (valid only if `async_finish` is set).
hook: the receiving hook function (valid only if `return_recv_hook` is set).
"""
src_info, layout_range, num_max_dispatch_tokens_per_rank, hidden, num_experts = handle
combined_x, event, hook = self.runtime.low_latency_combine(x, topk_idx, topk_weights, src_info, layout_range,
num_max_dispatch_tokens_per_rank, num_experts,
zero_copy, async_finish, return_recv_hook, out)
tensors_to_record = (x, topk_idx, topk_weights, src_info, layout_range, combined_x)
return combined_x, EventOverlap(event, tensors_to_record if async_finish else None), hook
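# NOTES: an illustrative sketch of the low-latency flow, based only on the signatures above; the
# expert-compute step (`run_local_experts`) and the tensor names are assumptions:
#   recv, recv_count, handle, event, hook = buffer.low_latency_dispatch(
#       x, topk_idx, num_max_dispatch_tokens_per_rank, num_experts, return_recv_hook=True)
#   hook()  # with `return_recv_hook=True`, data is only guaranteed to have arrived after this call
#   out = run_local_experts(recv, recv_count)  # hypothetical per-expert compute
#   combined_x, event, hook = buffer.low_latency_combine(out, topk_idx, topk_weights, handle)
# Remember to call `clean_low_latency_buffer` first if normal dispatch/combine kernels ran before this.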
def get_next_low_latency_combine_buffer(self, handle: object):
"""
Get the raw registered RDMA buffer tensor for the next low-latency combine, so that the next combine kernel can skip the copy.
Arguments:
handle: the communication handle given by the `dispatch` function.
Returns:
buffer: the raw RDMA low-latency buffer as a BF16 PyTorch tensor with shape
`[num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, hidden]`; you should fill this buffer
yourself.
"""
src_info, layout_range, num_max_dispatch_tokens_per_rank, hidden, num_experts = handle
return self.runtime.get_next_low_latency_combine_buffer(num_max_dispatch_tokens_per_rank, hidden, num_experts)

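The class above is easiest to digest as a single end-to-end flow. Below is a condensed usage sketch of the high-throughput path (layout, dispatch, expert compute, combine), mirroring the tests later in this commit; the buffer construction, tensor shapes, and the placeholder expert computation are illustrative assumptions rather than part of the library.

```python
import torch
import torch.distributed as dist
import deep_ep

def moe_all_to_all(buffer: deep_ep.Buffer, x: torch.Tensor, topk_idx: torch.Tensor,
                   topk_weights: torch.Tensor, num_experts: int) -> torch.Tensor:
    # 1) Compute the routing layout (token counts per rank/expert and the membership mask).
    num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, _ = \
        buffer.get_dispatch_layout(topk_idx, num_experts)
    # 2) Dispatch tokens to the ranks owning their selected experts.
    recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, _ = \
        buffer.dispatch(x, num_tokens_per_rank=num_tokens_per_rank,
                        num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
                        is_token_in_rank=is_token_in_rank,
                        num_tokens_per_expert=num_tokens_per_expert,
                        topk_idx=topk_idx, topk_weights=topk_weights)
    # 3) Run the local experts (placeholder: identity compute).
    expert_out = recv_x
    # 4) Combine (reduce) the expert outputs back onto the tokens' original ranks.
    combined_x, _, _ = buffer.combine(expert_out, handle, topk_weights=recv_topk_weights)
    return combined_x

# Typical construction, assuming `dist.init_process_group` has already been called:
#   buffer = deep_ep.Buffer(dist.group.WORLD, num_nvl_bytes=int(1e9))
```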
101
DeepEP/deep_ep/utils.py Normal file
View File

@ -0,0 +1,101 @@
import os
import subprocess
import torch
import torch.distributed as dist
from typing import Any, Optional, Tuple
# noinspection PyUnresolvedReferences
from deep_ep_cpp import Config, EventHandle
class EventOverlap:
"""
A wrapper class to manage CUDA events, and to make overlapping more convenient.
Attributes:
event: the CUDA event captured.
extra_tensors: an easier way to simulate PyTorch tensors' `record_stream`, which may be useful with CUDA graphs.
"""
def __init__(self, event: Optional[EventHandle] = None,
extra_tensors: Optional[Tuple[torch.Tensor]] = None) -> None:
"""
Initialize the class.
Arguments:
event: the CUDA event captured.
extra_tensors: an easier way to simulate PyTorch tensors' `record_stream`, which may be useful with CUDA graphs.
"""
self.event = event
# NOTES: we use extra tensors to achieve stream recording, otherwise,
# stream recording will be incompatible with CUDA graph.
self.extra_tensors = extra_tensors
def current_stream_wait(self) -> None:
"""
The current stream `torch.cuda.current_stream()` waits for the event to be finished.
"""
assert self.event is not None
self.event.current_stream_wait()
def __enter__(self) -> Any:
"""
Utility for overlapping and Python `with` syntax.
You can overlap the kernels on the current stream with the following example:
```python
event_overlap = event_after_all_to_all_kernels()
with event_overlap:
do_something_on_current_stream()
# After exiting the `with` scope, the current stream will wait for the event to finish.
```
"""
return self
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
"""
Utility for overlapping and Python `with` syntax.
Please follow the example in the `__enter__` function.
"""
if self.event is not None:
self.event.current_stream_wait()
def check_nvlink_connections(group: dist.ProcessGroup):
"""
Check NVLink connection between every pair of GPUs.
Arguments:
group: the communication group.
"""
# Check NVLink connection
# NOTES: some A100 PCIe GPUs only have pairwise NVLink connections, so only EP2 is usable
# TODO: check all cases, all local-node GPUs in the group should be connected via NVLink
if 'PCIE' in torch.cuda.get_device_name():
assert group.size() <= 2, 'PCIe GPUs only have pairwise NVLink connections'
# noinspection PyUnresolvedReferences
import pynvml
pynvml.nvmlInit()
# noinspection PyTypeChecker
devices = os.environ.get('CUDA_VISIBLE_DEVICES', '0,1,2,3,4,5,6,7').strip(',').split(',')
physical_device_idx = int(devices[torch.cuda.current_device()])
physical_device_indices = [0, ] * group.size()
dist.all_gather_object(physical_device_indices, physical_device_idx, group)
# Check whether they are all connected via NVLink
# Reference: https://github.com/vllm-project/vllm/blob/b8e809a057765c574726a6077fd124db5077ce1f/vllm/platforms/cuda.py#L438
handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_indices]
for i, handle in enumerate(handles):
for j, peer_handle in enumerate(handles):
if i >= j:
continue
status = pynvml.nvmlDeviceGetP2PStatus(handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
assert status == pynvml.NVML_P2P_STATUS_OK,\
f'GPU {physical_device_indices[i]} and GPU {physical_device_indices[j]} are not connected via NVLink'
# Close NVML
pynvml.nvmlShutdown()

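`EventOverlap` is what makes `async_finish` useful in practice. A small hedged sketch of the overlapping pattern, following the tests below, is shown here; `buffer`, `dispatch_args`, and `other_work()` are assumed to exist.

```python
# Record an event behind whatever is already queued, so the dispatch kernels wait on it.
previous_event = buffer.capture()
recv_x, _, _, _, handle, event = buffer.dispatch(**dispatch_args,
                                                 previous_event=previous_event,
                                                 async_finish=True)
# The communication runs on the communication stream; overlap compute on the current stream here.
with event:
    other_work()
# Leaving the `with` block makes the current stream wait for the communication to finish.
```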
12
DeepEP/install.sh Executable file
View File

@ -0,0 +1,12 @@
# Change current directory into project root
original_dir=$(pwd)
script_dir=$(dirname "$0")
cd "$script_dir"
# Remove old dist file, build, and install
rm -rf dist
python setup.py bdist_wheel
pip install dist/*.whl
# Return to the user's original directory
cd "$original_dir"

107
DeepEP/setup.py Normal file
View File

@ -0,0 +1,107 @@
import os
import subprocess
import setuptools
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
if __name__ == '__main__':
nvshmem_dir = os.getenv('NVSHMEM_DIR', None)
disable_nvshmem = nvshmem_dir is None
if disable_nvshmem:
print('Warning: `NVSHMEM_DIR` is not specified, all internode and low-latency features are disabled\n')
else:
assert os.path.exists(nvshmem_dir), f'Failed to find NVSHMEM: {nvshmem_dir}'
cxx_flags = ['-O3', '-Wno-deprecated-declarations', '-Wno-unused-variable',
'-Wno-sign-compare', '-Wno-reorder', '-Wno-attributes']
nvcc_flags = ['-O3', '-Xcompiler', '-O3']
sources = ['csrc/deep_ep.cpp', 'csrc/kernels/runtime.cu', 'csrc/kernels/layout.cu', 'csrc/kernels/intranode.cu']
include_dirs = ['csrc/']
library_dirs = []
nvcc_dlink = []
extra_link_args = []
# NVSHMEM flags
if disable_nvshmem:
cxx_flags.append('-DDISABLE_NVSHMEM')
nvcc_flags.append('-DDISABLE_NVSHMEM')
else:
sources.extend(['csrc/kernels/internode.cu', 'csrc/kernels/internode_ll.cu'])
include_dirs.extend([f'{nvshmem_dir}/include'])
library_dirs.extend([f'{nvshmem_dir}/lib'])
nvcc_dlink.extend(['-dlink', f'-L{nvshmem_dir}/lib', '-lnvshmem'])
extra_link_args.extend(['-l:libnvshmem.a', '-l:nvshmem_bootstrap_uid.so', f'-Wl,-rpath,{nvshmem_dir}/lib'])
if int(os.getenv('DISABLE_SM90_FEATURES', 0)):
# Prefer A100
os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '8.0')
# Disable some SM90 features: FP8, launch methods, and TMA
cxx_flags.append('-DDISABLE_SM90_FEATURES')
nvcc_flags.append('-DDISABLE_SM90_FEATURES')
# Disable internode and low-latency kernels
assert disable_nvshmem
else:
# Prefer H800 series
os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '9.0')
# CUDA 12 flags
nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])
# Disable LD/ST tricks, as some CUDA versions do not support `.L1::no_allocate`
if os.environ['TORCH_CUDA_ARCH_LIST'].strip() != '9.0':
assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1
os.environ['DISABLE_AGGRESSIVE_PTX_INSTRS'] = '1'
# Disable aggressive PTX instructions
if int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', '0')):
cxx_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
nvcc_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
# Put them together
extra_compile_args = {
'cxx': cxx_flags,
'nvcc': nvcc_flags,
}
if len(nvcc_dlink) > 0:
extra_compile_args['nvcc_dlink'] = nvcc_dlink
# Summary
print(f'Build summary:')
print(f' > Sources: {sources}')
print(f' > Includes: {include_dirs}')
print(f' > Libraries: {library_dirs}')
print(f' > Compilation flags: {extra_compile_args}')
print(f' > Link flags: {extra_link_args}')
print(f' > Arch list: {os.environ["TORCH_CUDA_ARCH_LIST"]}')
print(f' > NVSHMEM path: {nvshmem_dir}')
print()
# noinspection PyBroadException
try:
cmd = ['git', 'rev-parse', '--short', 'HEAD']
revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
except Exception as _:
revision = ''
setuptools.setup(
name='deep_ep',
version='1.1.0' + revision,
packages=setuptools.find_packages(
include=['deep_ep']
),
ext_modules=[
CUDAExtension(
name='deep_ep_cpp',
include_dirs=include_dirs,
library_dirs=library_dirs,
sources=sources,
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args
)
],
cmdclass={
'build_ext': BuildExtension
}
)

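The build is driven entirely by environment variables read in `setup.py` above. A hedged sketch of invoking it from Python is below; the NVSHMEM path is an illustrative assumption.

```python
import os
import subprocess

env = dict(os.environ)
env.setdefault('NVSHMEM_DIR', '/opt/nvshmem')      # enables the internode and low-latency kernels
env.setdefault('TORCH_CUDA_ARCH_LIST', '9.0')      # the H800-series default used by setup.py
# env['DISABLE_SM90_FEATURES'] = '1'               # A100 builds; requires NVSHMEM to stay disabled
subprocess.check_call(['python', 'setup.py', 'bdist_wheel'], env=env)
```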
254
DeepEP/tests/test_internode.py Normal file
View File

@ -0,0 +1,254 @@
import os
import time
import torch
import torch.distributed as dist
# noinspection PyUnresolvedReferences
import deep_ep
from utils import init_dist, bench, calc_diff, create_grouped_scores, inplace_unique, per_token_cast_to_fp8, per_token_cast_back
# Test compatibility with low latency functions
import test_low_latency
def test_main(num_sms: int, local_rank: int, num_local_ranks: int, num_ranks: int, num_nodes: int, rank: int, buffer: deep_ep.Buffer, group: dist.ProcessGroup):
# Settings
num_tokens, hidden, num_topk_groups, num_topk, num_experts = 4096, 7168, min(num_nodes, 4), 8, (256 // num_ranks) * num_ranks
assert num_experts % num_ranks == 0 and num_local_ranks == 8
if local_rank == 0:
print(f'[config] num_tokens={num_tokens}, hidden={hidden}, num_topk_groups={num_topk_groups}, num_topk={num_topk}', flush=True)
# Random data
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * rank
x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
x_e4m3 = per_token_cast_to_fp8(x)
x_e4m3 = (x_e4m3[0], x_e4m3[1].T.contiguous().T)
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
group_scores = scores.view(num_tokens, num_nodes, -1).amax(dim=-1)
group_idx = torch.topk(group_scores, k=num_topk_groups, dim=-1, sorted=False).indices
masked_scores = create_grouped_scores(scores, group_idx, num_nodes)
topk_idx = torch.topk(masked_scores, num_topk, dim=-1, largest=True, sorted=False)[1]
topk_weights = torch.ones((num_tokens, num_topk), dtype=torch.float32, device='cuda') * rank
topk_weights_pure_rand = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda')
rank_idx = topk_idx // (num_experts // num_ranks)
rank_idx.masked_fill_(topk_idx == -1, -1)
inplace_unique(rank_idx, num_ranks)
rdma_rank_idx = rank_idx // num_local_ranks
rdma_rank_idx.masked_fill_(rank_idx == -1, -1)
inplace_unique(rdma_rank_idx, num_nodes)
# RDMA dispatch counts
rdma_idx = topk_idx // (num_experts // num_nodes)
rdma_idx.masked_fill_(topk_idx == -1, -1)
inplace_unique(rdma_idx, num_nodes)
num_rdma_token_sent = rdma_idx.ne(-1).sum().item()
# Expert meta
num_tokens_per_expert = torch.zeros((num_experts, ), dtype=torch.int, device='cuda')
for i in range(num_experts):
num_tokens_per_expert[i] = (topk_idx == i).sum()
gbl_num_tokens_per_expert = num_tokens_per_expert.clone()
dist.all_reduce(gbl_num_tokens_per_expert, group=group)
# Rank layout meta
num_tokens_per_rank = torch.empty((num_ranks, ), dtype=torch.int, device='cuda')
num_tokens_per_rdma_rank = torch.empty((num_nodes, ), dtype=torch.int, device='cuda')
token_idx_in_rank = torch.full((num_ranks, num_tokens), -1, dtype=torch.long, device='cuda')
for i in range(num_ranks):
num_tokens_per_rank[i] = (rank_idx == i).sum()
token_sel = (rank_idx == i).max(dim=-1)[0]
count = token_sel.sum().item()
tokens = torch.sort(token_sel.to(torch.int), descending=True)[1]
tokens[:count] = torch.sort(tokens[:count])[0]
token_idx_in_rank[i][tokens[:count]] = torch.arange(count, dtype=torch.long, device='cuda')
for i in range(num_nodes):
num_tokens_per_rdma_rank[i] = (rdma_rank_idx == i).sum()
token_idx_in_rank = token_idx_in_rank.T.contiguous().to(torch.int)
is_token_in_rank = token_idx_in_rank >= 0
gbl_num_tokens_per_rank = num_tokens_per_rank.clone()
dist.all_reduce(gbl_num_tokens_per_rank, group=group)
ref_num_tokens_per_rank, ref_num_tokens_per_rdma_rank, ref_num_tokens_per_expert, ref_is_token_in_rank, _ = \
buffer.get_dispatch_layout(topk_idx, num_experts)
assert torch.allclose(ref_num_tokens_per_rank, num_tokens_per_rank)
assert torch.allclose(ref_num_tokens_per_rdma_rank, num_tokens_per_rdma_rank)
assert torch.allclose(ref_num_tokens_per_expert, num_tokens_per_expert)
assert torch.allclose(ref_is_token_in_rank, is_token_in_rank)
t = bench(lambda: buffer.get_dispatch_layout(topk_idx, num_experts))[0]
if local_rank == 0:
print(f'[layout] Kernel performance: {t * 1000:.3f} ms', flush=True)
print('', flush=True)
group.barrier()
time.sleep(1)
# Config
rdma_buffer_size, nvl_buffer_size = 128, (720 if num_ranks in (144, 160) else 512)
config = deep_ep.Config(num_sms, 8, nvl_buffer_size, 16, rdma_buffer_size)
# Test dispatch
# noinspection PyShadowingNames
def check_data(check_x, recv_gbl_rank_prefix_sum):
assert torch.allclose(check_x.amin(dim=1), check_x.amax(dim=1))
check_start = 0
for i in range(num_ranks):
check_end = recv_gbl_rank_prefix_sum[i].item()
assert (check_x[check_start:check_end, :].int() - i).sum().item() == 0
check_start = check_end
for previous_mode in (False, True):
for async_mode in (False, True):
for current_x in (x_pure_rand, x, x_e4m3):
for with_topk in (False, True):
if local_rank == 0:
print(f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, {"with" if with_topk else "without"} top-k (async={async_mode}, previous={previous_mode}) ...', flush=True, end='')
dispatch_args = {'x': current_x, 'num_tokens_per_rank': num_tokens_per_rank, 'num_tokens_per_rdma_rank': num_tokens_per_rdma_rank, 'is_token_in_rank': is_token_in_rank,
'num_tokens_per_expert': num_tokens_per_expert, 'config': config, 'async_finish': async_mode}
if with_topk:
dispatch_args.update({'topk_idx': topk_idx, 'topk_weights': topk_weights_pure_rand if current_x is x_pure_rand else topk_weights})
if previous_mode:
dispatch_args.update({'previous_event': buffer.capture()})
recv_x, recv_topk_idx, recv_topk_weights, recv_num_tokens_per_expert_list, handle, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
# Checks
recv_gbl_rank_prefix_sum = handle[-4]
assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(0), f'{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}'
assert gbl_num_tokens_per_expert.view(num_ranks, -1)[rank].tolist() == recv_num_tokens_per_expert_list
if current_x is not x_pure_rand:
check_data(recv_x, recv_gbl_rank_prefix_sum)
if with_topk:
# Check `topk_idx`
assert (recv_topk_idx.eq(-1) | ((recv_topk_idx >= 0) & (recv_topk_idx < (num_experts // num_ranks)))).sum().item() == recv_topk_idx.numel()
for i, count in enumerate(recv_num_tokens_per_expert_list):
assert recv_topk_idx.eq(i).sum().item() == count
# Check `topk_weights`
if current_x is not x_pure_rand:
recv_topk_weights[recv_topk_idx.eq(-1)] = recv_topk_weights.amax(dim=1, keepdim=True).expand_as(recv_topk_weights)[recv_topk_idx.eq(-1)]
check_data(recv_topk_weights, recv_gbl_rank_prefix_sum)
# Test cached dispatch (must be without the top-k stuff)
if not with_topk:
dispatch_args = {'x': current_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
if previous_mode:
dispatch_args.update({'previous_event': buffer.capture()})
recv_x, _, _, _, _, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
if current_x is not x_pure_rand:
check_data(recv_x, recv_gbl_rank_prefix_sum)
# Test combine
bias_0 = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
bias_1 = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
combine_args = {'x': recv_x, 'bias': (bias_0, bias_1), 'handle': handle, 'config': config, 'async_finish': async_mode}
if with_topk:
combine_args.update({'topk_weights': recv_topk_weights})
if previous_mode:
combine_args.update({'previous_event': buffer.capture()})
combined_x, combined_topk_weights, event = buffer.combine(**combine_args)
event.current_stream_wait() if async_mode else ()
check_x = (combined_x.float() - bias_0.float() - bias_1.float()) / is_token_in_rank.sum(dim=1).unsqueeze(1)
ref_x = x_pure_rand if current_x is x_pure_rand else x
assert calc_diff(check_x, ref_x) < 5e-6
if with_topk:
check_topk_weights = combined_topk_weights if (current_x is x_pure_rand) else (combined_topk_weights / is_token_in_rank.sum(dim=1).unsqueeze(1))
ref_topk_weights = topk_weights_pure_rand if current_x is x_pure_rand else topk_weights
assert calc_diff(check_topk_weights, ref_topk_weights) < 1e-9
# For later tuning
dispatch_bf16_rdma_send_bytes = num_rdma_token_sent * hidden * 2
dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2
combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes
combine_bf16_rdma_recv_bytes = dispatch_bf16_rdma_send_bytes
if local_rank == 0:
print(' passed', flush=True)
if local_rank == 0:
print('', flush=True)
# Tune dispatch performance
best_dispatch_results = None
fp8_factor = (1 + 4 / 128) / 2
for current_x in (x_e4m3, x):
best_time, best_results = 1e10, None
rdma_send_bytes = (dispatch_bf16_rdma_send_bytes * fp8_factor) if isinstance(current_x, tuple) else dispatch_bf16_rdma_send_bytes
nvl_recv_bytes = (dispatch_bf16_nvl_recv_bytes * fp8_factor) if isinstance(current_x, tuple) else dispatch_bf16_nvl_recv_bytes
for nvl_chunk_size in range(4, 33, 4):
for rdma_chunk_size in range(4, 33, 4):
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size, rdma_chunk_size, rdma_buffer_size)
tune_args = {'x': current_x, 'handle': handle, 'config': config}
t = bench(lambda: buffer.dispatch(**tune_args))[0]
if t < best_time:
best_time, best_results = t, (num_sms, nvl_chunk_size, rdma_chunk_size)
if local_rank == 0:
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {rdma_send_bytes / 1e9 / t:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL) ', flush=True)
if local_rank == 0:
print(f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {rdma_send_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL)', flush=True)
print('', flush=True)
if isinstance(current_x, tuple):
# Gather the best FP8 config from rank 0
best_dispatch_results = torch.tensor([best_results[0], best_results[1], best_results[2]], dtype=torch.int32, device='cuda')
all_best_fp8_results_list = [torch.zeros_like(best_dispatch_results) for _ in range(torch.distributed.get_world_size())]
dist.all_gather(all_best_fp8_results_list, best_dispatch_results, group=group)
best_dispatch_results = all_best_fp8_results_list[0].tolist()
dispatch_config = deep_ep.Config(best_dispatch_results[0], best_dispatch_results[1], nvl_buffer_size, best_dispatch_results[2], rdma_buffer_size)
dispatch_args = {'x': x, 'num_tokens_per_rank': num_tokens_per_rank, 'num_tokens_per_rdma_rank': num_tokens_per_rdma_rank,
'is_token_in_rank': is_token_in_rank, 'num_tokens_per_expert': num_tokens_per_expert,
'config': dispatch_config if dispatch_config is not None else config}
recv_x, _, _, _, handle, _ = buffer.dispatch(**dispatch_args)
# Tune combine performance
best_time, best_results = 1e10, None
for nvl_chunk_size in range(1, 5, 1):
for rdma_chunk_size in range(8, 33, 4):
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size, rdma_chunk_size, rdma_buffer_size)
tune_args = {'x': recv_x, 'handle': handle, 'config': config}
t = bench(lambda: buffer.combine(**tune_args))[0]
if local_rank == 0:
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {combine_bf16_rdma_recv_bytes / 1e9 / t:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL) ', flush=True)
if t < best_time:
best_time, best_results = t, (num_sms, nvl_chunk_size, rdma_chunk_size)
if local_rank == 0:
print(f'[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {combine_bf16_rdma_recv_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL)', flush=True)
print('', flush=True)
# noinspection PyUnboundLocalVariable
def test_loop(local_rank: int, num_local_ranks: int):
num_nodes = int(os.getenv('WORLD_SIZE', 1))
rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
test_ll_compatibility = bool(int(os.getenv('EP_TEST_LL_COMPATIBILITY', '0')))
if test_ll_compatibility:
ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
num_sms = 24
num_qps_per_rank = max(num_sms, ll_num_experts // num_ranks if test_ll_compatibility else 0)
buffer = deep_ep.Buffer(group, int(1e9), int(1e9), low_latency_mode=test_ll_compatibility,
num_qps_per_rank=num_qps_per_rank)
assert num_local_ranks == 8 and num_ranks > 8
torch.manual_seed(rank)
for i in (num_sms, ):
test_main(i, local_rank, num_local_ranks, num_ranks, num_nodes, rank, buffer, group)
if local_rank == 0:
print('', flush=True)
# Test compatibility with low latency functions
if test_ll_compatibility:
buffer.clean_low_latency_buffer(ll_num_tokens, ll_hidden, ll_num_experts)
test_low_latency.test_main(ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk, rank, num_ranks, group, buffer, seed=1)
# Destroy the communication group
dist.barrier()
dist.destroy_process_group()
if __name__ == '__main__':
num_processes = 8
torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)

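The routing metadata that the test builds by hand (and cross-checks against `get_dispatch_layout`) is easier to see on a toy case; the sizes below are made up for readability and are not the test's settings.

```python
import torch

num_experts, num_ranks = 8, 4                        # toy sizes; experts 0-1 live on rank 0, 2-3 on rank 1, ...
topk_idx = torch.tensor([[0, 3, 5, -1]])             # one token, four top-k slots, one slot unused
rank_idx = topk_idx // (num_experts // num_ranks)    # map each expert index to its owning rank
rank_idx.masked_fill_(topk_idx == -1, -1)            # keep the "no selection" marker
is_token_in_rank = torch.zeros(1, num_ranks, dtype=torch.bool)
is_token_in_rank[0, rank_idx[rank_idx >= 0]] = True  # the token is routed to ranks 0, 1 and 2
print(rank_idx.tolist(), is_token_in_rank.tolist())  # [[0, 1, 2, -1]] [[True, True, True, False]]
```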
256
DeepEP/tests/test_intranode.py Normal file
View File

@ -0,0 +1,256 @@
import time
import torch
import torch.distributed as dist
# noinspection PyUnresolvedReferences
import deep_ep
from utils import init_dist, bench, calc_diff, inplace_unique, per_token_cast_to_fp8, per_token_cast_back
# Test compatibility with low latency functions
import test_low_latency
def test_main(num_sms: int, local_rank: int, num_ranks: int, rank: int, buffer: deep_ep.Buffer, group: dist.ProcessGroup):
# Settings
num_tokens, hidden, num_topk, num_experts = 4096, 7168, 8, (256 // num_ranks) * num_ranks
assert num_experts % num_ranks == 0
if local_rank == 0:
print(f'[config] num_tokens={num_tokens}, hidden={hidden}, num_topk={num_topk}', flush=True)
# Random data
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * rank
x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
x_e4m3 = per_token_cast_to_fp8(x) if deep_ep.Buffer.is_sm90_compiled() else None
x_e4m3 = (x_e4m3[0], x_e4m3[1].T.contiguous().T) if x_e4m3 is not None else None
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)[1]
topk_weights = torch.ones((num_tokens, num_topk), dtype=torch.float32, device='cuda') * rank
topk_weights_pure_rand = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda')
rank_idx = topk_idx // (num_experts // num_ranks)
rank_idx.masked_fill_(topk_idx == -1, -1)
inplace_unique(rank_idx, num_ranks)
# Expert meta
num_tokens_per_expert = torch.zeros((num_experts, ), dtype=torch.int, device='cuda')
for i in range(num_experts):
num_tokens_per_expert[i] = (topk_idx == i).sum()
gbl_num_tokens_per_expert = num_tokens_per_expert.clone()
dist.all_reduce(gbl_num_tokens_per_expert, group=group)
# Rank layout meta
num_tokens_per_rank = torch.empty((num_ranks, ), dtype=torch.int, device='cuda')
token_idx_in_rank = torch.full((num_ranks, num_tokens), -1, dtype=torch.long, device='cuda')
for i in range(num_ranks):
num_tokens_per_rank[i] = (rank_idx == i).sum()
token_sel = (rank_idx == i).max(dim=-1)[0]
count = token_sel.sum().item()
tokens = torch.sort(token_sel.to(torch.int), descending=True)[1]
tokens[:count] = torch.sort(tokens[:count])[0]
token_idx_in_rank[i][tokens[:count]] = torch.arange(count, dtype=torch.long, device='cuda')
token_idx_in_rank = token_idx_in_rank.T.contiguous().to(torch.int)
is_token_in_rank = token_idx_in_rank >= 0
gbl_num_tokens_per_rank = num_tokens_per_rank.clone()
dist.all_reduce(gbl_num_tokens_per_rank, group=group)
ref_num_tokens_per_rank, _, ref_num_tokens_per_expert, ref_is_token_in_rank, _ = \
buffer.get_dispatch_layout(topk_idx, num_experts)
assert torch.allclose(ref_num_tokens_per_rank, num_tokens_per_rank)
assert torch.allclose(ref_num_tokens_per_expert, num_tokens_per_expert)
assert torch.allclose(ref_is_token_in_rank, is_token_in_rank)
t = bench(lambda: buffer.get_dispatch_layout(topk_idx, num_experts))[0]
if local_rank == 0:
print(f'[layout] Kernel performance: {t * 1000:.3f} ms', flush=True)
print('', flush=True)
group.barrier()
time.sleep(1)
# Config
nvl_buffer_size = 256
config = deep_ep.Config(num_sms, 8, nvl_buffer_size)
# Test dispatch
# noinspection PyShadowingNames
def check_data(check_x, rank_prefix_matrix):
assert torch.allclose(check_x.amin(dim=1), check_x.amax(dim=1))
check_start = 0
for i in range(num_ranks):
check_end = rank_prefix_matrix[i][rank].item()
assert (check_x[check_start:check_end, :].int() - i).sum().item() == 0
check_start = check_end
for previous_mode in (False, True):
for async_mode in (False, True):
for current_x in filter(lambda elem: elem is not None, (x_pure_rand, x, x_e4m3)):
for with_topk in (False, True):
if local_rank == 0:
print(f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, {"with" if with_topk else "without"} top-k (async={async_mode}, previous={previous_mode}) ...', flush=True, end='')
dispatch_args = {'x': current_x, 'num_tokens_per_rank': num_tokens_per_rank, 'is_token_in_rank': is_token_in_rank,
'num_tokens_per_expert': num_tokens_per_expert, 'config': config, 'async_finish': async_mode}
if with_topk:
dispatch_args.update({'topk_idx': topk_idx, 'topk_weights': topk_weights_pure_rand if current_x is x_pure_rand else topk_weights})
if previous_mode:
dispatch_args.update({'previous_event': buffer.capture()})
recv_x, recv_topk_idx, recv_topk_weights, recv_num_tokens_per_expert_list, handle, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
# Checks
rank_prefix_matrix = handle[0]
assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(0), f'{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}'
assert gbl_num_tokens_per_expert.view(num_ranks, -1)[rank].tolist() == recv_num_tokens_per_expert_list
if current_x is not x_pure_rand:
check_data(recv_x, rank_prefix_matrix)
recv_topk_weights_clone = None
if with_topk:
# Check `topk_idx`
assert (recv_topk_idx.eq(-1) | ((recv_topk_idx >= 0) & (recv_topk_idx < (num_experts // num_ranks)))).sum().item() == recv_topk_idx.numel()
for i, count in enumerate(recv_num_tokens_per_expert_list):
assert recv_topk_idx.eq(i).sum().item() == count
# Check `topk_weights`
recv_topk_weights_clone = recv_topk_weights.clone()
if current_x is not x_pure_rand:
recv_topk_weights[recv_topk_idx.eq(-1)] = recv_topk_weights.amax(dim=1, keepdim=True).expand_as(recv_topk_weights)[recv_topk_idx.eq(-1)]
check_data(recv_topk_weights, rank_prefix_matrix)
# Test `num_worst_tokens != 0`
if with_topk:
num_worst_tokens = num_tokens * num_ranks
dispatch_args.update({'num_worst_tokens': num_worst_tokens})
recv_worst_x, recv_worst_topk_idx, recv_worst_topk_weights, empty_list, _, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_worst_x = per_token_cast_back(*recv_worst_x) if isinstance(recv_worst_x, tuple) else recv_worst_x
assert len(empty_list) == 0
assert num_worst_tokens == recv_worst_x.size(0)
assert num_worst_tokens == recv_worst_topk_idx.size(0)
assert num_worst_tokens == recv_worst_topk_weights.size(0)
assert torch.equal(recv_x, recv_worst_x[:recv_x.size(0)])
assert torch.equal(recv_topk_idx, recv_worst_topk_idx[:recv_x.size(0)])
assert torch.equal(recv_topk_weights_clone, recv_worst_topk_weights[:recv_x.size(0)])
assert torch.all(recv_worst_topk_idx[recv_x.size(0):] == -1).item()
# Test cached dispatch (must be without the top-k stuff)
if not with_topk:
dispatch_args = {'x': current_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
if previous_mode:
dispatch_args.update({'previous_event': buffer.capture()})
recv_x, _, _, _, _, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
if current_x is not x_pure_rand:
check_data(recv_x, rank_prefix_matrix)
# Test combine
combine_args = {'x': recv_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
if with_topk:
combine_args.update({'topk_weights': recv_topk_weights})
if previous_mode:
combine_args.update({'previous_event': buffer.capture()})
combined_x, combined_topk_weights, event = buffer.combine(**combine_args)
event.current_stream_wait() if async_mode else ()
check_x = combined_x.float() / is_token_in_rank.sum(dim=1).unsqueeze(1)
ref_x = x_pure_rand if current_x is x_pure_rand else x
assert calc_diff(check_x, ref_x) < 5e-6
if with_topk:
check_topk_weights = combined_topk_weights if (current_x is x_pure_rand) else (combined_topk_weights / is_token_in_rank.sum(dim=1).unsqueeze(1))
ref_topk_weights = topk_weights_pure_rand if current_x is x_pure_rand else topk_weights
assert calc_diff(check_topk_weights, ref_topk_weights) < 1e-9
# For later tuning
dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2
combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes
if local_rank == 0:
print(' passed', flush=True)
if local_rank == 0:
print('', flush=True)
# Tune dispatch performance
best_dispatch_results = None
fp8_factor = (1 + 4 / 128) / 2
for current_x in filter(lambda elem: elem is not None, (x_e4m3, x)):
best_time, best_results = 1e10, None
nvl_recv_bytes = (dispatch_bf16_nvl_recv_bytes * fp8_factor) if isinstance(current_x, tuple) else dispatch_bf16_nvl_recv_bytes
for nvl_chunk_size in tuple(range(4, 33, 2)) + (0, ):
if nvl_chunk_size > 0:
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size)
else:
# Test default config as well
deep_ep.Buffer.set_num_sms(num_sms)
config = deep_ep.Buffer.get_dispatch_config(num_ranks)
tune_args = {'x': current_x, 'handle': handle, 'config': config}
t = bench(lambda: buffer.dispatch(**tune_args))[0]
if t < best_time and nvl_chunk_size > 0:
best_time, best_results = t, (num_sms, nvl_chunk_size)
if local_rank == 0:
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size if nvl_chunk_size else "default"}: '
f'{nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL), avg_t: {t * 1e6:.2f} us', flush=True)
if local_rank == 0:
print(f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL), t: {best_time * 1e6:.2f} us', flush=True)
print('', flush=True)
# Gather the best config from rank 0 and the first test setting
if best_dispatch_results is None:
best_dispatch_results = torch.tensor([best_results[0], best_results[1]], dtype=torch.int32, device='cuda')
all_best_fp8_results_list = [torch.zeros_like(best_dispatch_results) for _ in range(torch.distributed.get_world_size())]
dist.all_gather(all_best_fp8_results_list, best_dispatch_results, group=group)
best_dispatch_results = all_best_fp8_results_list[0].tolist()
dispatch_config = deep_ep.Config(best_dispatch_results[0], best_dispatch_results[1], nvl_buffer_size)
dispatch_args = {'x': x, 'num_tokens_per_rank': num_tokens_per_rank,
'is_token_in_rank': is_token_in_rank, 'num_tokens_per_expert': num_tokens_per_expert,
'config': dispatch_config if dispatch_config is not None else config}
recv_x, _, _, _, handle, _ = buffer.dispatch(**dispatch_args)
# Tune combine performance
best_time, best_results = 1e10, None
for nvl_chunk_size in tuple(range(1, 17, 1)) + (0, ):
if nvl_chunk_size > 0:
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size)
else:
# Test default config as well
deep_ep.Buffer.set_num_sms(num_sms)
config = deep_ep.Buffer.get_combine_config(num_ranks)
tune_args = {'x': recv_x, 'handle': handle, 'config': config}
t = bench(lambda: buffer.combine(**tune_args))[0]
if local_rank == 0:
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size if nvl_chunk_size else "default"}: '
f'{combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL), avg_t: {t * 1e6:.2f} us', flush=True)
if t < best_time and nvl_chunk_size > 0:
best_time, best_results = t, (num_sms, nvl_chunk_size)
if local_rank == 0:
print(f'[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}: {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL), t: {best_time * 1e6:.2f} us', flush=True)
print('', flush=True)
# noinspection PyUnboundLocalVariable
def test_loop(local_rank: int, num_local_ranks: int):
rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
test_ll_compatibility, num_rdma_bytes = False, 0
if test_ll_compatibility:
ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(ll_num_tokens, ll_hidden, num_ranks, ll_num_experts)
buffer = deep_ep.Buffer(group, int(2e9), num_rdma_bytes, low_latency_mode=test_ll_compatibility,
num_qps_per_rank=(ll_num_experts // num_ranks if test_ll_compatibility else 1))
torch.manual_seed(rank)
for i in (24, ):
test_main(i, local_rank, num_ranks, rank, buffer, group)
if local_rank == 0:
print('', flush=True)
# Test compatibility with low latency functions
if test_ll_compatibility:
buffer.clean_low_latency_buffer(ll_num_tokens, ll_hidden, ll_num_experts)
test_low_latency.test_main(ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk, rank, num_ranks, group, buffer, seed=1)
# Destroy the communication group
dist.barrier()
dist.destroy_process_group()
if __name__ == '__main__':
num_processes = 8
torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)

View File

@ -0,0 +1,187 @@
import random
import torch
import torch.distributed as dist
from functools import partial
import deep_ep
from utils import init_dist, bench, bench_kineto, calc_diff, hash_tensor, per_token_cast_back
def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
rank: int, num_ranks: int, group: dist.ProcessGroup, buffer: deep_ep.Buffer, seed: int = 0):
torch.manual_seed(seed + rank)
random.seed(seed + rank)
assert num_experts % num_ranks == 0
num_local_experts = num_experts // num_ranks
# NOTES: integers greater than 256 exceed the BF16 precision limit
rank_offset = 128
assert num_ranks - rank_offset < 257, 'Too many ranks (exceeding test precision limit)'
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * (rank - rank_offset)
x[:, -128:] = torch.arange(num_tokens, device='cuda').to(torch.bfloat16).view(-1, 1)
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=True)[1]
topk_weights = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda').abs()
# Randomly mask some positions
for i in range(10):
topk_idx[random.randint(0, num_tokens - 1), random.randint(0, num_topk - 1)] = -1
# Check dispatch correctness
do_check = True
hash_value, num_times = 0, 0
for return_recv_hook in (False, True):
for dispatch_use_fp8 in (False, True):
for round_scale in (False, True) if dispatch_use_fp8 else (False, ):
for use_ue8m0 in (False, True) if round_scale else (False, ):
num_times += 1
for i in range((num_times % 2) + 1):
cumulative_local_expert_recv_stats = torch.zeros((num_local_experts, ), dtype=torch.int, device='cuda')
packed_recv_x, packed_recv_count, handle, event, hook = \
buffer.low_latency_dispatch(x, topk_idx, num_tokens, num_experts,
use_fp8=dispatch_use_fp8, round_scale=round_scale, use_ue8m0=use_ue8m0,
cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
async_finish=not return_recv_hook, return_recv_hook=return_recv_hook)
hook() if return_recv_hook else event.current_stream_wait()
packed_recv_x = (packed_recv_x[0], packed_recv_x[1].contiguous()) if dispatch_use_fp8 else packed_recv_x
simulated_gemm_x = per_token_cast_back(packed_recv_x[0].view(-1, hidden), packed_recv_x[1].view(-1, hidden // 128)).view(packed_recv_x[0].shape) \
if dispatch_use_fp8 else packed_recv_x.clone()
all_topk_idx = torch.empty((num_ranks, num_tokens, num_topk), dtype=topk_idx.dtype, device='cuda')
dist.all_gather_into_tensor(all_topk_idx, topk_idx, group=group)
for i in range(num_local_experts if do_check else 0):
expert_id = rank * num_local_experts + i
recv_x = per_token_cast_back(packed_recv_x[0][i], packed_recv_x[1][i]) if dispatch_use_fp8 else packed_recv_x[i]
recv_count, recv_src_info, recv_layout_range = packed_recv_count[i], handle[0][i], handle[1][i]
# Check expert indices
int_mask = (2 ** 32) - 1
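# `recv_layout_range` packs, for each source rank, the begin index in the high 32 bits and the token count in the low 32 bits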
num_valid_tokens = recv_count.item()
assert cumulative_local_expert_recv_stats[i].item() == num_valid_tokens, f'{cumulative_local_expert_recv_stats[i].item()} != {num_valid_tokens}'
assert num_valid_tokens == (recv_layout_range & int_mask).sum().item(), f'{num_valid_tokens} != {(recv_layout_range & int_mask).sum().item()}'
assert num_valid_tokens == (all_topk_idx == expert_id).sum().item(), f'{num_valid_tokens} != {(all_topk_idx == expert_id).sum().item()}'
# Check received data
recv_x = recv_x[:num_valid_tokens]
recv_x_amin = recv_x[:, :-128].amin(dim=-1)
recv_src_info = recv_src_info[:num_valid_tokens]
assert torch.equal(recv_x_amin, recv_x[:, :-128].amax(dim=-1))
if round_scale:
assert calc_diff(recv_x[:, -1], recv_src_info.view(-1)) < 0.007
else:
assert (recv_x[:, -128:] - recv_src_info.view(-1, 1) % num_tokens).sum().item() == 0
for j in range(num_ranks):
begin_idx, count = (recv_layout_range[j] >> 32).item(), (recv_layout_range[j] & int_mask).item()
if not round_scale:
assert (recv_x_amin == j - rank_offset).sum().item() == (all_topk_idx[j] == expert_id).sum().item()
assert (recv_x[begin_idx:begin_idx + count][:-128] - j).sum().item() == 0
if dispatch_use_fp8:
hash_value ^= hash_tensor(packed_recv_x[0][i, :num_valid_tokens])
hash_value ^= hash_tensor(packed_recv_x[1][i, :num_valid_tokens])
else:
hash_value ^= hash_tensor(packed_recv_x[i, :num_valid_tokens])
# Check combine correctness
for zero_copy in (False, True):
if zero_copy:
buffer.get_next_low_latency_combine_buffer(handle)[:, :, :] = simulated_gemm_x
out = torch.empty((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x, topk_idx, topk_weights, handle,
async_finish=not return_recv_hook, zero_copy=zero_copy,
return_recv_hook=return_recv_hook, out=out)
hook() if return_recv_hook else event.current_stream_wait()
if do_check:
diff = calc_diff(x * topk_weights.masked_fill(topk_idx == -1, 0).sum(dim=1).view(-1, 1), combined_x)
assert torch.isnan(combined_x).sum().item() == 0
assert diff < (7e-4 if round_scale else 1e-5), f'Error: {diff=}, {zero_copy=}'
hash_value ^= hash_tensor(combined_x)
def create_test_cast_with_outliers(num_outliers):
tmp = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
tmp /= tmp.abs().amax(dim=1).view(-1, 1)
assert tmp.abs().amax().item() <= 1
# Create some amax outliers
for i in range(num_outliers):
tmp[random.randint(0, num_tokens - 1)] *= 1e3
return tmp
# noinspection PyShadowingNames
def large_gemm_with_hook(hook):
mat_0 = torch.randn((8192, 8192), dtype=torch.float)
mat_1 = torch.randn((8192, 8192), dtype=torch.float)
mat_0 @ mat_1
hook()
# noinspection PyShadowingNames
def test_func(zero_copy: bool, return_recv_hook: bool):
recv_x, recv_count, handle, event, hook = \
buffer.low_latency_dispatch(x, topk_idx, num_tokens, num_experts,
cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
use_fp8=True, async_finish=False, return_recv_hook=return_recv_hook)
large_gemm_with_hook(hook) if return_recv_hook else None
if zero_copy:
buffer.get_next_low_latency_combine_buffer(handle)[:, :, :] = simulated_gemm_x
combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x, topk_idx, topk_weights, handle,
zero_copy=zero_copy, return_recv_hook=return_recv_hook)
large_gemm_with_hook(hook) if return_recv_hook else None
# Calculate bandwidth
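# Per-token FP8 payload: 1 byte per element, a 4-byte scale per 128 elements, plus 16 extra bytes (presumably per-token metadata); BF16 tokens take 2 bytes per element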
num_fp8_bytes, num_bf16_bytes = (hidden + hidden / 128 * 4 + 16), hidden * 2
num_dispatch_comm_bytes, num_combine_comm_bytes = 0, 0
for i in range(num_tokens):
num_selections = (topk_idx[i] != -1).sum().item()
num_dispatch_comm_bytes += num_fp8_bytes * num_selections
num_combine_comm_bytes += num_bf16_bytes * num_selections
# Dispatch + combine testing
avg_t, min_t, max_t = bench(partial(test_func, zero_copy=False, return_recv_hook=False))
print(f'[rank {rank}] Dispatch + combine bandwidth: {(num_dispatch_comm_bytes + num_combine_comm_bytes) / 1e9 / avg_t:.2f} GB/s, '
f'avg_t={avg_t * 1e6:.2f} us, min_t={min_t * 1e6:.2f} us, max_t={max_t * 1e6:.2f} us', flush=True)
# Separate profiling
for return_recv_hook in (False, True):
group.barrier()
dispatch_t, combine_t = bench_kineto(partial(test_func, zero_copy=True, return_recv_hook=return_recv_hook),
kernel_names=('dispatch', 'combine'), barrier_comm_profiling=True,
suppress_kineto_output=True)
if not return_recv_hook:
print(f'[rank {rank}] Dispatch bandwidth: {num_dispatch_comm_bytes / 1e9 / dispatch_t:.2f} GB/s, avg_t={dispatch_t * 1e6:.2f} us | '
f'Combine bandwidth: {num_combine_comm_bytes / 1e9 / combine_t:.2f} GB/s, avg_t={combine_t * 1e6:.2f} us', flush=True)
else:
print(f'[rank {rank}] Dispatch send/recv time: {dispatch_t * 2 * 1e6:.2f} us | '
f'Combine send/recv time: {combine_t * 2 * 1e6:.2f} us', flush=True)
return hash_value
# noinspection PyUnboundLocalVariable
def test_loop(local_rank: int, num_local_ranks: int):
rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
num_tokens, hidden, num_topk, num_experts = 128, 7168, 8, 288
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(num_tokens, hidden, num_ranks, num_experts)
if local_rank == 0:
print(f'Allocating buffer size: {num_rdma_bytes / 1e6} MB ...', flush=True)
buffer = deep_ep.Buffer(group, num_rdma_bytes=num_rdma_bytes, low_latency_mode=True,
num_qps_per_rank=num_experts // num_ranks)
test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, seed=1)
do_pressure_test = False
for seed in range(int(1e9) if do_pressure_test else 0):
if local_rank == 0:
print(f'Testing with seed {seed} ...', flush=True)
ref_hash = test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, seed=seed)
for i in range(20):
assert test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, seed=seed) == ref_hash, f'Error: seed={seed}'
# Destroy the communication group
dist.barrier()
dist.destroy_process_group()
if __name__ == '__main__':
# TODO: you may modify NUMA binding for less CPU overhead
num_processes = 8
torch.multiprocessing.spawn(test_loop, args=(num_processes,), nprocs=num_processes)

201
DeepEP/tests/utils.py Normal file
View File

@ -0,0 +1,201 @@
import inspect
import numpy as np
import os
import sys
import torch
import torch.distributed as dist
from typing import Optional
def init_dist(local_rank: int, num_local_ranks: int):
# NOTES: you may rewrite this function with your own cluster settings
ip = os.getenv('MASTER_ADDR', '127.0.0.1')
port = int(os.getenv('MASTER_PORT', '8361'))
num_nodes = int(os.getenv('WORLD_SIZE', 1))
node_rank = int(os.getenv('RANK', 0))
assert (num_local_ranks < 8 and num_nodes == 1) or num_local_ranks == 8
sig = inspect.signature(dist.init_process_group)
params = {
'backend': 'nccl',
'init_method': f'tcp://{ip}:{port}',
'world_size': num_nodes * num_local_ranks,
'rank': node_rank * num_local_ranks + local_rank,
}
if 'device_id' in sig.parameters:
# noinspection PyTypeChecker
params['device_id'] = torch.device(f'cuda:{local_rank}')
dist.init_process_group(**params)
torch.set_default_dtype(torch.bfloat16)
torch.set_default_device('cuda')
torch.cuda.set_device(local_rank)
return dist.get_rank(), dist.get_world_size(), dist.new_group(list(range(num_local_ranks * num_nodes)))
def calc_diff(x: torch.Tensor, y: torch.Tensor):
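# Symmetric relative error: 1 - 2<x, y> / (||x||^2 + ||y||^2), which is 0 when the tensors match exactly; the +1 shift below avoids an all-zero denominator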
x, y = x.double() + 1, y.double() + 1
denominator = (x * x + y * y).sum()
sim = 2 * (x * y).sum() / denominator
return (1 - sim).item()
def per_token_cast_to_fp8(x: torch.Tensor):
assert x.dim() == 2 and x.size(1) % 128 == 0
m, n = x.shape
x_view = x.view(m, -1, 128)
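# Scale each 128-element group so that its max magnitude maps to 448, the largest value representable in float8_e4m3fn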
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
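# Integer scales are treated as UE8M0 exponents: shifting them into the float32 exponent field (bit 23) rebuilds the power-of-two scale factor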
if x_scales.dtype == torch.int:
x_scales = x_scales.view(dtype=torch.int8).to(torch.int) << 23
x_scales = x_scales.view(dtype=torch.float)
x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
x_scales = x_scales.view(x_fp8.size(0), -1, 1)
return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
def inplace_unique(x: torch.Tensor, num_slots: int):
assert x.dim() == 2
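# Deduplicate the indices of each row in place: bucket-count every slot, keep each distinct index once (sorted in descending order) and pad the remainder with -1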
mask = x < 0
x_padded = x.masked_fill(mask, num_slots)
bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
bin_count = bin_count[:, :num_slots]
sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
x[:, :].fill_(-1)
valid_len = min(num_slots, x.size(1))
x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int):
num_tokens, num_experts = scores.shape
scores = scores.view(num_tokens, num_groups, -1)
mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
return (scores * mask).view(num_tokens, num_experts)
def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None):
# Flush L2 cache with 256 MB data
torch.cuda.synchronize()
cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
# Warmup
for _ in range(num_warmups):
fn()
# Flush L2
cache.zero_()
# Testing
start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
for i in range(num_tests):
# Record
start_events[i].record()
fn()
end_events[i].record()
if post_fn is not None:
post_fn()
torch.cuda.synchronize()
times = np.array([s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)])[1:]
return np.average(times), np.min(times), np.max(times)
class empty_suppress:
def __enter__(self):
return self
def __exit__(self, *_):
pass
class suppress_stdout_stderr:
def __enter__(self):
self.outnull_file = open(os.devnull, 'w')
self.errnull_file = open(os.devnull, 'w')
self.old_stdout_fileno_undup = sys.stdout.fileno()
self.old_stderr_fileno_undup = sys.stderr.fileno()
self.old_stdout_fileno = os.dup(sys.stdout.fileno())
self.old_stderr_fileno = os.dup(sys.stderr.fileno())
self.old_stdout = sys.stdout
self.old_stderr = sys.stderr
os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
sys.stdout = self.outnull_file
sys.stderr = self.errnull_file
return self
def __exit__(self, *_):
sys.stdout = self.old_stdout
sys.stderr = self.old_stderr
os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
os.close(self.old_stdout_fileno)
os.close(self.old_stderr_fileno)
self.outnull_file.close()
self.errnull_file.close()
def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output: bool = False,
trace_path: Optional[str] = None, barrier_comm_profiling: bool = False):
# Profile
suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
with suppress():
schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) as prof:
for i in range(2):
# NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
if barrier_comm_profiling:
lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
lhs @ rhs
dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
for _ in range(num_tests):
fn()
prof.step()
# Parse the profiling table
assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
is_tupled = isinstance(kernel_names, tuple)
prof_lines = prof.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
assert all([isinstance(name, str) for name in kernel_names])
for name in kernel_names:
assert sum([name in line for line in prof_lines]) == 1, f'Errors of the kernel {name} in the profiling table'
# Save chrome traces
if trace_path is not None:
prof.export_chrome_trace(trace_path)
# Return average kernel times
units = {'ms': 1e3, 'us': 1e6}
kernel_times = []
for name in kernel_names:
for line in prof_lines:
if name in line:
time_str = line.split()[-2]
for unit, scale in units.items():
if unit in time_str:
kernel_times.append(float(time_str.replace(unit, '')) / scale)
break
break
return tuple(kernel_times) if is_tupled else kernel_times[0]
def hash_tensor(t: torch.Tensor):
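# Cheap order-insensitive checksum: reinterpret the raw bytes as int64 and sum them (used for the tests' reproducibility checks)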
return t.view(torch.int64).sum().item()

89
DeepEP/third-party/README.md vendored Normal file
View File

@ -0,0 +1,89 @@
# Install NVSHMEM
## Important notices
**This project is neither sponsored nor supported by NVIDIA.**
**Use of NVIDIA NVSHMEM is governed by the terms at [NVSHMEM Software License Agreement](https://docs.nvidia.com/nvshmem/api/sla.html).**
## Prerequisites
Hardware requirements:
- GPUs inside one node need to be connected by NVLink
- GPUs across different nodes need to be connected by RDMA devices, see [GPUDirect RDMA Documentation](https://docs.nvidia.com/cuda/gpudirect-rdma/)
- InfiniBand GPUDirect Async (IBGDA) support, see [IBGDA Overview](https://developer.nvidia.com/blog/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async/)
- For more detailed requirements, see [NVSHMEM Hardware Specifications](https://docs.nvidia.com/nvshmem/release-notes-install-guide/install-guide/abstract.html#hardware-requirements)
## Installation procedure
### 1. Acquiring NVSHMEM source code
Download NVSHMEM v3.2.5 from the [NVIDIA NVSHMEM OPEN SOURCE PACKAGES](https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz).
### 2. Apply our custom patch
Navigate to your NVSHMEM source directory and apply our provided patch:
```bash
git apply /path/to/deep_ep/dir/third-party/nvshmem.patch
```
### 3. Configure the NVIDIA driver (required for inter-node communication)
Enable IBGDA by modifying `/etc/modprobe.d/nvidia.conf`:
```bash
options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"
```
Update kernel configuration:
```bash
sudo update-initramfs -u
sudo reboot
```
For more detailed configurations, please refer to the [NVSHMEM Installation Guide](https://docs.nvidia.com/nvshmem/release-notes-install-guide/install-guide/abstract.html).
### 4. Build and installation
DeepEP uses NVLink for intra-node communication and IBGDA for inter-node communication. All other features are disabled to reduce dependencies.
```bash
export CUDA_HOME=/path/to/cuda
# disable all features except IBGDA
export NVSHMEM_IBGDA_SUPPORT=1
export NVSHMEM_SHMEM_SUPPORT=0
export NVSHMEM_UCX_SUPPORT=0
export NVSHMEM_USE_NCCL=0
export NVSHMEM_PMIX_SUPPORT=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
export NVSHMEM_USE_GDRCOPY=0
export NVSHMEM_IBRC_SUPPORT=0
export NVSHMEM_BUILD_TESTS=0
export NVSHMEM_BUILD_EXAMPLES=0
export NVSHMEM_MPI_SUPPORT=0
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
export NVSHMEM_BUILD_TXZ_PACKAGE=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
cmake -G Ninja -S . -B build -DCMAKE_INSTALL_PREFIX=/path/to/your/dir/to/install
cmake --build build/ --target install
```
## Post-installation configuration
Set environment variables in your shell configuration:
```bash
export NVSHMEM_DIR=/path/to/your/dir/to/install # Use for DeepEP installation
export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"
export PATH="${NVSHMEM_DIR}/bin:$PATH"
```
## Verification
```bash
nvshmem-info -a # Should display details of nvshmem
```

474
DeepEP/third-party/nvshmem.patch vendored Normal file
View File

@ -0,0 +1,474 @@
From 9e6cc27cceb3130784e4ea7b61ea3171156365fd Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Fri, 20 Dec 2024 10:57:12 +0800
Subject: [PATCH 1/4] Change QP creating order.
---
src/modules/transport/ibgda/ibgda.cpp | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index ef325cd..286132e 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -2936,17 +2936,20 @@ int nvshmemt_ibgda_connect_endpoints(nvshmem_transport_t t, int *selected_dev_id
INFO(ibgda_state->log_level, "Creating %d RC QPs", device->rc.num_eps_per_pe);
for (int i = 0; i < num_rc_eps; ++i) {
// Do not create loopback to self
- if (i / device->rc.num_eps_per_pe == mype) {
+ int dst_pe = (i + 1 + mype) % n_pes;
+ int offset = i / n_pes;
+ int mapped_i = dst_pe * device->rc.num_eps_per_pe + offset;
+ if (dst_pe == mype) {
continue;
}
- status = ibgda_create_qp(&device->rc.eps[i], device, portid, i,
+ status = ibgda_create_qp(&device->rc.eps[mapped_i], device, portid, mapped_i,
NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
- "ibgda_create_dci failed on RC #%d.", i);
+ "ibgda_create_dci failed on RC #%d.", mapped_i);
- status = ibgda_get_rc_handle(&local_rc_handles[i], device->rc.eps[i], device);
+ status = ibgda_get_rc_handle(&local_rc_handles[mapped_i], device->rc.eps[mapped_i], device);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
- "ibgda_get_rc_handle failed on RC #%d.", i);
+ "ibgda_get_rc_handle failed on RC #%d.", mapped_i);
}
if (num_rc_eps) {
--
2.25.1
From b11d41e4f3727f2f6ccc00a8c852e59e2ee33c8a Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Fri, 10 Jan 2025 11:53:38 +0800
Subject: [PATCH 2/4] Add recv queue and recv cq for rc qps.
Let the ibgda rc qps use regular recv queue.
Add recv queue to ibgda dev qp.
IBGDA create recv cq
Setup recv cq.
fix recv queue.
Remove some useless idx.
Longer recv queue.
---
.../nvshmem_common_ibgda.h | 19 +++++-
src/modules/transport/ibgda/ibgda.cpp | 65 ++++++++++++++++---
2 files changed, 71 insertions(+), 13 deletions(-)
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index 8b8a263..1be3dec 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -168,14 +168,17 @@ typedef struct {
uint64_t get_head; // last wqe idx + 1 with a "fetch" operation (g, get, amo_fetch)
uint64_t get_tail; // last wqe idx + 1 polled with cst; get_tail > get_head is possible
} tx_wq;
+ struct {
+ uint64_t resv_head; // last reserved wqe idx + 1
+ } rx_wq;
struct {
uint64_t head;
uint64_t tail;
} ibuf;
char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
} __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 96,
- "ibgda_device_qp_management_v1 must be 96 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
+ "ibgda_device_qp_management_v1 must be 104 bytes.");
typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
@@ -199,9 +202,19 @@ typedef struct nvshmemi_ibgda_device_qp {
// May point to mvars.prod_idx or internal prod_idx
uint64_t *prod_idx;
} tx_wq;
+ struct {
+ uint16_t nwqes;
+ uint64_t tail;
+ void *wqe;
+ __be32 *dbrec;
+ void *bf;
+ nvshmemi_ibgda_device_cq_t *cq;
+ // May point to mvars.prod_idx or internal prod_idx
+ uint64_t *prod_idx;
+ } rx_wq;
nvshmemi_ibgda_device_qp_management_v1 mvars; // management variables
} nvshmemi_ibgda_device_qp_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 184, "ibgda_device_qp_v1 must be 184 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");
typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index 286132e..e0b2d5c 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -198,6 +198,7 @@ struct ibgda_ep {
off_t dbr_offset;
struct ibgda_cq *send_cq;
+ struct ibgda_cq *recv_cq;
struct ibv_ah *ah;
uint32_t user_index;
@@ -1538,7 +1539,8 @@ static int ibgda_create_cq_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
struct ibv_context *context = device->context;
- unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes;
+ // Each RC qp has one send CQ and one recv CQ.
+ unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes * 2;
assert(ibgda_qp_depth > 0);
size_t num_cqe = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
@@ -1701,7 +1703,8 @@ static int ibgda_create_qp_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
}
// Allocate and map WQ buffer for all QPs.
- wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB; // num_wqebb is always a power of 2
+ // Todo: reduce the size of wq buffer.
+ wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB * 2; // num_wqebb is always a power of 2
wq_buf_size = wq_buf_size_per_qp * num_eps;
status = ibgda_nic_control_alloc(&wq_mobject, wq_buf_size, IBGDA_GPAGE_SIZE);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "cannot allocate wq buf.\n");
@@ -1882,8 +1885,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
int cqe_version = 0;
struct ibgda_cq *send_cq = NULL;
+ struct ibgda_cq *recv_cq = NULL;
size_t num_wqebb = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
+ size_t num_recv_wqe = ibgda_qp_depth;
+ size_t recv_wqe_size = 16;
int status = 0;
@@ -1911,6 +1917,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
status = ibgda_create_cq(&send_cq, device);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
+ if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+ status = ibgda_create_cq(&recv_cq, device);
+ NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
+ }
+
ep = (struct ibgda_ep *)calloc(1, sizeof(struct ibgda_ep));
NVSHMEMI_NULL_ERROR_JMP(ep, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out,
"Unable to allocate mem for ep.\n");
@@ -1939,12 +1950,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
DEVX_SET(qpc, qp_context, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
DEVX_SET(qpc, qp_context, pd, device->qp_shared_object.pdn);
DEVX_SET(qpc, qp_context, uar_page, uar_mobject->uar->page_id); // BF register
- DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE); // Shared Receive Queue
- DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
DEVX_SET(qpc, qp_context, cqn_snd, send_cq->cqn);
- DEVX_SET(qpc, qp_context, cqn_rcv, device->qp_shared_object.rcqn);
+ DEVX_SET(qpc, qp_context, cqn_rcv, qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC ? recv_cq->cqn : device->qp_shared_object.rcqn);
DEVX_SET(qpc, qp_context, log_sq_size, IBGDA_ILOG2_OR0(num_wqebb));
- DEVX_SET(qpc, qp_context, log_rq_size, 0);
DEVX_SET(qpc, qp_context, cs_req, 0); // Disable CS Request
DEVX_SET(qpc, qp_context, cs_res, 0); // Disable CS Response
DEVX_SET(qpc, qp_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE); // Enable dbr_umem_id
@@ -1953,6 +1961,15 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
DEVX_SET(qpc, qp_context, dbr_umem_id, dbr_umem->umem_id); // DBR buffer
DEVX_SET(qpc, qp_context, user_index, qp_idx);
DEVX_SET(qpc, qp_context, page_offset, 0);
+ if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC){
+ DEVX_SET(qpc, qp_context, rq_type, 0); // Regular recv queue
+ DEVX_SET(qpc, qp_context, log_rq_size, IBGDA_ILOG2(num_recv_wqe)); // 4 wqe
+ DEVX_SET(qpc, qp_context, log_rq_stride, IBGDA_ILOG2(recv_wqe_size) - 4); // max recv wqe size = 16B
+ } else {
+ DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE); // Shared Receive Queue, DC must use this.
+ DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
+ DEVX_SET(qpc, qp_context, log_rq_size, 0);
+ }
ep->devx_qp = mlx5dv_devx_obj_create(context, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out));
NVSHMEMI_NULL_ERROR_JMP(ep->devx_qp, status, NVSHMEMX_ERROR_INTERNAL, out,
@@ -1962,9 +1979,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
ep->portid = portid;
ep->sq_cnt = num_wqebb;
- ep->sq_buf_offset = 0;
+ ep->sq_buf_offset = num_recv_wqe * recv_wqe_size;
- ep->rq_cnt = 0;
+ ep->rq_cnt = num_recv_wqe;
ep->rq_buf_offset = 0;
ep->wq_mobject = device->qp_shared_object.wq_mobject;
@@ -1978,6 +1995,7 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
ep->uar_mobject = uar_mobject;
ep->send_cq = send_cq;
+ ep->recv_cq = recv_cq;
ep->qp_type = qp_type;
@@ -1989,6 +2007,7 @@ out:
if (status) {
if (uar_mobject) ibgda_unmap_and_free_qp_uar(uar_mobject);
if (send_cq) ibgda_destroy_cq(send_cq);
+ if (recv_cq) ibgda_destroy_cq(recv_cq);
if (ep) free(ep);
}
@@ -2287,6 +2306,10 @@ static int ibgda_destroy_ep(struct ibgda_ep *ep) {
ibgda_destroy_cq(ep->send_cq);
}
+ if (ep->recv_cq) {
+ ibgda_destroy_cq(ep->recv_cq);
+ }
+
if (ep->ah) {
ftable.destroy_ah(ep->ah);
}
@@ -2318,7 +2341,7 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
dev_qp->qpn = ep->qpn;
assert(ep->wq_mobject->has_gpu_mapping);
- dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset);
+ dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->sq_buf_offset);
if (ibgda_nic_handler == IBGDA_NIC_HANDLER_GPU) {
assert(ep->dbr_mobject->has_gpu_mapping);
@@ -2330,6 +2353,12 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
}
dev_qp->tx_wq.nwqes = ep->sq_cnt;
+ if (ep->qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+ dev_qp->rx_wq.nwqes = ep->rq_cnt;
+ dev_qp->rx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->rq_buf_offset);
+ dev_qp->rx_wq.dbrec = (__be32 *)((uintptr_t)ep->dbr_mobject->aligned.gpu_ptr + ep->dbr_offset);
+ dev_qp->rx_wq.bf = (void *)ep->uar_mobject->aligned.gpu_ptr;
+ }
ibuf_dci_start = (uintptr_t)device->qp_shared_object.internal_buf.mem_object->aligned.gpu_ptr;
ibuf_rc_start = ibuf_dci_start + (size_per_dci * device->dci.num_eps);
@@ -2379,6 +2408,9 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
nvshmemi_ibgda_device_cq_t *cq_d = NULL;
nvshmemi_ibgda_device_cq_t *cq_h = NULL;
+ nvshmemi_ibgda_device_cq_t *recv_cq_d = NULL;
+ nvshmemi_ibgda_device_cq_t *recv_cq_h = NULL;
+
uint8_t *qp_group_switches_d = NULL;
const size_t mvars_offset = offsetof(nvshmemi_ibgda_device_qp_t, mvars);
@@ -2386,6 +2418,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
const size_t cons_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.cons_idx);
const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
+ const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
@@ -2421,7 +2454,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
num_dct_handles += device->dct.num_eps * n_pes;
num_dci_handles += device->dci.num_eps;
num_rc_handles += device->rc.num_eps_per_pe * n_pes;
- num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1));
+ num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1) * 2);
num_shared_dci_handles += device->dci.num_shared_eps;
}
assert(num_dci_handles - num_shared_dci_handles >= 0);
@@ -2456,6 +2489,10 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
for (int i = 0; i < num_cq_handles; i++) {
nvshmemi_init_ibgda_device_cq(cq_h[i]);
}
+
+ recv_cq_h = (nvshmemi_ibgda_device_cq_t *)calloc(1, sizeof(*recv_cq_h));
+ NVSHMEMI_NULL_ERROR_JMP(recv_cq_h, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out, "recv_cq calloc err.");
+ nvshmemi_init_ibgda_device_cq(recv_cq_h[0]);
/* allocate host memory for dct, rc, cq, dci end */
/* allocate device memory for dct, rc, cq, dci start */
@@ -2559,6 +2596,14 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
}
++cq_idx;
+
+ rc_h[arr_idx].rx_wq.cq = &cq_d[cq_idx];
+
+ ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
+ cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
+ cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
+ cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
+ ++cq_idx;
}
}
}
--
2.25.1
From af479f9f23103d4a1579fae38676d6b3022df887 Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Sat, 8 Feb 2025 18:02:39 +0800
Subject: [PATCH 3/4] Maintain recv queue's cons_idx.
---
src/include/device_host_transport/nvshmem_common_ibgda.h | 5 +++--
src/modules/transport/ibgda/ibgda.cpp | 6 ++++--
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index 1be3dec..ea1e284 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -170,6 +170,7 @@ typedef struct {
} tx_wq;
struct {
uint64_t resv_head; // last reserved wqe idx + 1
+ uint64_t cons_idx; // polled wqe idx + 1 (consumer index + 1)
} rx_wq;
struct {
uint64_t head;
@@ -177,7 +178,7 @@ typedef struct {
} ibuf;
char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
} __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
- "ibgda_device_qp_management_v1 must be 104 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 112,
+ "ibgda_device_qp_management_v1 must be 112 bytes.");
typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
@@ -214,7 +215,7 @@ typedef struct nvshmemi_ibgda_device_qp {
} rx_wq;
nvshmemi_ibgda_device_qp_management_v1 mvars; // management variables
} nvshmemi_ibgda_device_qp_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 256, "ibgda_device_qp_v1 must be 256 bytes.");
typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index e0b2d5c..bc339c5 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -1067,7 +1067,7 @@ static inline void ibgda_nic_control_free(struct ibgda_mem_object *mobject) {
ibgda_host_mem_free(mobject);
}
-static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device) {
+static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device, int cc = 1) {
int status = 0;
struct ibgda_cq *gcq = NULL;
@@ -1118,7 +1118,7 @@ static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device)
cq_context = DEVX_ADDR_OF(create_cq_in, cmd_in, cq_context);
DEVX_SET(cqc, cq_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE);
DEVX_SET(cqc, cq_context, cqe_sz, MLX5_CQE_SIZE_64B);
- DEVX_SET(cqc, cq_context, cc, 0x1); // Use collapsed CQ
+ DEVX_SET(cqc, cq_context, cc, cc); // Use collapsed CQ
DEVX_SET(cqc, cq_context, oi, 0x1); // Allow overrun
DEVX_SET(cqc, cq_context, dbr_umem_id, dbr_umem->umem_id);
DEVX_SET(cqc, cq_context, log_cq_size, IBGDA_ILOG2_OR0(num_cqe));
@@ -2419,6 +2419,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
+ const size_t rx_cons_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.cons_idx);
nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
@@ -2601,6 +2602,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
+ cq_h[cq_idx].cons_idx = (uint64_t *)(base_mvars_d_addr + rx_cons_offset);
cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
++cq_idx;
--
2.25.1
From e0ba3fa21b4b633b481c6684c3ad04f2670c8df4 Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Tue, 11 Feb 2025 11:00:57 +0800
Subject: [PATCH 4/4] Init rx_wq counters.
---
src/include/device_host_transport/nvshmem_common_ibgda.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index ea1e284..e6640d6 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -46,6 +46,8 @@
qp_man.tx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.tx_wq.get_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.tx_wq.get_tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
+ qp_man.rx_wq.resv_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
+ qp_man.rx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.ibuf.head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.ibuf.tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
} while (0);
--
2.25.1
diff --git a/src/modules/transport/common/transport_ib_common.cpp b/src/modules/transport/common/transport_ib_common.cpp
index c89f408..f99018a 100644
--- a/src/modules/transport/common/transport_ib_common.cpp
+++ b/src/modules/transport/common/transport_ib_common.cpp
@@ -26,6 +26,9 @@ int nvshmemt_ib_common_nv_peer_mem_available() {
if (access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == 0) {
return NVSHMEMX_SUCCESS;
}
+ if (access("/sys/module/nvidia_peermem/version", F_OK) == 0) {
+ return NVSHMEMX_SUCCESS;
+ }
return NVSHMEMX_ERROR_INTERNAL;
}
From 099f608fcd9a1d34c866ad75d0af5d02d2020374 Mon Sep 17 00:00:00 2001
From: Kaichao You <youkaichao@gmail.com>
Date: Tue, 10 Jun 2025 00:35:03 -0700
Subject: [PATCH] remove gdrcopy dependency
---
src/modules/transport/ibgda/ibgda.cpp | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index ef325cd..16ee09c 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -406,6 +406,7 @@ static size_t ibgda_get_host_page_size() {
return host_page_size;
}
+#ifdef NVSHMEM_USE_GDRCOPY
int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
nvshmemt_ibgda_state_t *ibgda_state = (nvshmemt_ibgda_state_t *)t->state;
int n_devs_selected = ibgda_state->n_devs_selected;
@@ -459,6 +460,11 @@ int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
}
return 0;
}
+#else
+int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
+ return NVSHMEMX_ERROR_NOT_SUPPORTED;
+}
+#endif
int nvshmemt_ibgda_show_info(struct nvshmem_transport *transport, int style) {
NVSHMEMI_ERROR_PRINT("ibgda show info not implemented");
--
2.34.1

184
Dockerfile Normal file
View File

@ -0,0 +1,184 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install torchvision / flashinfer / sglang against the self-built Torch and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang sources and package them as wheels ──────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ───────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies required by the runtime stage ──────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies needed by the gradio UI ─────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime: minimal runtime image that only installs the prebuilt wheels offline
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended: refresh the linker cache after copying the libraries
RUN ldconfig
# ---- Copy the pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first so it is not overridden by the PyPI version
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (the path can be changed) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

25
gdrcopy/.gitignore vendored Normal file
View File

@ -0,0 +1,25 @@
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
# Editor files
*~
*.swp

108
gdrcopy/CHANGELOG.md Normal file
View File

@ -0,0 +1,108 @@
# Changelog
## [2.4.4] - 2024-12-16
- Fix the use-after-free bug of mr objects in gdrdv\_vma\_close.
- Fix the resource leakage bug in gdrdrv\_release.
## [2.4.3] - 2024-12-02
- Fix NVIDIA\_IS\_OPENSOURCE detection when compile with NVIDIA driver version 545 or newer.
- Fix compile error in gdrdrv when compile on RHEL9.5.
## [2.4.2] - 2024-10-31
- Fix the size alignment bug in gdrdrv.
- Fix memory leak in gdr\_pin\_buffer.
- Add support for another flavor of BF3.
## [2.4.1] - 2023-12-18
- Add support for persistent mapping.
- Fix bug in src/gdrdrv/Makefile.
- Fix compile-time bug when check.h is not found.
## [2.4] - 2023-09-19
- Various bug fixes in the test and benchmark applications.
- Prefix all applications with "gdrcopy\_".
- Introduce more unit tests in gdrcopy\_sanity.
- Introduce gdrcopy\_pplat benchmark application.
- Remove dependency on libcheck and libsubunit
- Introduce gdr\_get\_info\_v2.
- Introduce new copy algorithm for device mappings.
- Add support for NVIDIA BLUEFIELD-3.
- Add support for Linux kernel >= 6.3.
- Add support for SLES and OpenSUSE.
- Add support for systemd service on RHEL9.
- Relicense gdrdrv to Dual MIT/GPL.
- Fix bugs in gdrdrv when pinning two small buffers back-to-back.
- Add support for coherent platforms such as Grace-Hopper.
- Add support for Confidential Computing (CC).
## [2.3.1] - 2023-05-12
- Add a workaround for the GPL-compatibility issue when compile with CONFIG\_ARCH\_HAS\_CC\_PLATFORM on Linux kernel 5.18+.
- Fix error in init.d/gdrcopy due to missing /etc/rc.d/init.d/functions.
## [2.3] - 2021-07-27
- Remove automatically-generated build id links in rpm packages.
- Remove gdrcopy-kmod from the Requires field of the gdrcopy rpm package.
- Remove gdrdrv-dkms dependency enforcement from the gdrcopy deb package.
- Add libsubunit0 to the dependency list of the gdrcopy deb package.
- Add apiperf test.
- Revamp gdrdrv to fix race-condition bugs.
- Add an option to build kmod package.
- Split the gdrcopy deb package into meta, libgdrapi, and tests packages.
- Update the package maintainer.
- Various updates in README.
## [2.2] - 2021-02-01
- Add support for ARM64.
- Update various information on README.
- Improve Makefile.
- Add multi-arch support.
- Handle removal of HAVE\_UNLOCKED\_IOCTL in Linux kernel v5.9 and later.
- Prevent dpkg package creation to unnecessarily compile gdrdrv.
- Improve gdr\_open error message.
- Fix bug that prevents sanity from correctly summarizing failure.
- Add dkms support in kmod package.
- Handle the removal of kzfree in Linux kernel v5.10 and later.
- Improve small-size copy-to-mapping.
## [2.1] - 2020-08-07
- fix build problem on RHL8 kernels
- relax checks in gdrdrv to support multi-threading use cases
- fix fd leak in gdr\_open()
- introduce new copylat test
- remove CUDA RT dependency in tests
- assorted cleanups
## [2.0] - 2019-09-16
- Harden security in gdrdrv.
- Enable cached mappings in POWER9.
- Improve copy performance with unrolling in POWERPC.
- Creates _sanity_ unit test for testing the functionality and security.
- Consolidate _basic_ and _validate_ into _sanity_ unit test.
- Introduce compile time and runtime version checking in _libgdrapi_.
- Improve rpm packaging.
- Introduce deb packaging for the userspace library and the applications.
- Introduce dkms packaging for the _gdrdrv_ driver.
- Rename gdr\_copy\_from/to\_bar to gdr\_copy\_from/to\_mapping.
- Update README
## [1.3] - 2018-07-26
- Add _gdrdrv_ driver for converting cudaMalloc'd addresses to the GPU's BAR1
addresses and exposing them to CPU-accessible virtual addresses.
- Add _libgdrapi_, a user-space library for communicating with the gdrdrv driver.
- Add _basic_ application as an minimal example on how to use gdrcopy.
- Add _copybw_ application as a complete example on how CPU could read/write to
cudaMalloc'd memory via BAR1 mappings.
- Add _validate_ unit test to ensure that gdrcopy functions as expected.
- Add a script for packaging gdrcopy in the rpm format.
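[2.4.4]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.4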
[2.4.3]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.3
[2.4.2]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.2
[2.4.1]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.1
[2.4]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4
[2.3.1]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.3.1
[2.3]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.3
[2.2]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.2
[2.1]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.1
[2.0]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.0
[1.3]: https://github.com/NVIDIA/gdrcopy/releases/tag/v1.3

19
gdrcopy/LICENSE Normal file
View File

@ -0,0 +1,19 @@
Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

94
gdrcopy/Makefile Normal file
View File

@ -0,0 +1,94 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
prefix ?= /usr/local
exec_prefix ?= $(prefix)
libdir ?= $(exec_prefix)/lib
bindir ?= $(exec_prefix)/bin
includedir ?= $(prefix)/include
DESTDIR := $(abspath $(DESTDIR))
DESTLIB = $(DESTDIR)$(libdir)
DESTBIN = $(DESTDIR)$(bindir)
DESTINC = $(DESTDIR)$(includedir)
CUDA ?= /usr/local/cuda
LIB_MAJOR_VER ?= $(shell awk '/\#define GDR_API_MAJOR_VERSION/ { print $$3 }' include/gdrapi.h | tr -d '\n')
LIB_MINOR_VER ?= $(shell awk '/\#define GDR_API_MINOR_VERSION/ { print $$3 }' include/gdrapi.h | tr -d '\n')
GDRAPI_ARCH := $(shell ./config_arch)
GDRAPI_INC := ../include
LIB_VER:=$(LIB_MAJOR_VER).$(LIB_MINOR_VER)
LIB_BASENAME:=libgdrapi.so
LIB_DYNAMIC=$(LIB_BASENAME).$(LIB_VER)
LIB_SONAME=$(LIB_BASENAME).$(LIB_MAJOR_VER)
all: config driver lib exes
version:
@ echo "$(LIB_VER)"
config:
@ echo "GDRAPI_ARCH=$(GDRAPI_ARCH)"
driver:
cd src/gdrdrv && \
$(MAKE) $(MAKE_PARAMS)
lib:
cd src && \
$(MAKE) LIB_MAJOR_VER=$(LIB_MAJOR_VER) LIB_MINOR_VER=$(LIB_MINOR_VER)
exes: lib
cd tests && \
$(MAKE) CUDA=$(CUDA)
install: lib_install exes_install
lib_install: lib
@ echo "installing in $(DESTLIB) $(DESTINC)..." && \
mkdir -p $(DESTLIB) && \
install -D -v -m u=rwx,g=rx,o=rx src/$(LIB_DYNAMIC) -t $(DESTLIB) && \
mkdir -p $(DESTINC) && \
install -D -v -m u=rw,g=rw,o=r include/* -t $(DESTINC); \
cd $(DESTLIB); \
ln -sf $(LIB_DYNAMIC) $(LIB_SONAME); \
ln -sf $(LIB_SONAME) $(LIB_BASENAME);
exes_install: exes
cd tests && $(MAKE) install DESTBIN=$(DESTBIN)
drv_install: driver
cd src/gdrdrv && \
$(MAKE) install
clean:
cd tests && \
$(MAKE) clean
cd src && \
$(MAKE) clean
cd src/gdrdrv && \
$(MAKE) clean
.PHONY: driver clean all lib exes lib_install drv_install exes_install install

495
gdrcopy/README.md Normal file
View File

@ -0,0 +1,495 @@
# GDRCopy
A low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA
technology.
## Introduction
While GPUDirect RDMA is meant for direct access to GPU memory from
third-party devices, it is possible to use these same APIs to create
perfectly valid CPU mappings of the GPU memory.
The advantage of a CPU-driven copy is the very small overhead involved,
which can be useful when low latency is required.
## What is inside
GDRCopy offers the infrastructure to create user-space mappings of GPU memory,
which can then be manipulated as if it were plain host memory (caveats apply
here); a minimal usage sketch is shown at the end of this section.
A simple by-product of it is a copy library with the following characteristics:
- Very low overhead, as it is driven by the CPU. As a reference, a
  cudaMemcpy can currently incur a 6-7us overhead.
- An initial memory *pinning* phase is required, which is potentially expensive,
10us-1ms depending on the buffer size.
- Fast H-D, because of write-combining. H-D bandwidth is 6-8GB/s on Ivy
Bridge Xeon but it is subject to NUMA effects.
- Slow D-H, because the GPU BAR, which backs the mappings, can't be
  prefetched, so burst read transactions are not generated through
  PCIe.
The library comes with a few tests, such as:
- gdrcopy_sanity, which contains unit tests for the library and the driver.
- gdrcopy_copybw, a minimal application which calculates the R/W bandwidth for a specific buffer size.
- gdrcopy_copylat, a benchmark application which calculates the R/W copy latency for a range of buffer sizes.
- gdrcopy_apiperf, an application for benchmarking the latency of each GDRCopy API call.
- gdrcopy_pplat, a benchmark application which calculates the round-trip ping-pong latency between GPU and CPU.
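For illustration, the typical call sequence looks like the following minimal
sketch. It is based on the declarations in `include/gdrapi.h`; error checking
is omitted, and the allocation is assumed to be GPU-page aligned (see
[Restrictions and known issues](#restrictions-and-known-issues)).
```c
#include <cuda.h>       // CUDA driver API
#include <stdio.h>
#include "gdrapi.h"
// Minimal GDRCopy flow: open, pin, map, copy, tear down. Error checking omitted.
int main(void)
{
    const size_t size = GPU_PAGE_SIZE;          // one GPU page (64 KiB)
    CUdevice dev;
    CUcontext ctx;
    CUdeviceptr d_buf;
    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    cuMemAlloc(&d_buf, size);                   // assumed GPU-page aligned; see Restrictions
    gdr_t g = gdr_open();                       // connect to the gdrdrv driver
    gdr_mh_t mh;
    gdr_pin_buffer(g, (unsigned long)d_buf, size, 0, 0, &mh);   // no p2p tokens
    void *map_ptr;
    gdr_map(g, mh, &map_ptr, size);             // user-space (typically write-combined) mapping
    gdr_info_t info;
    gdr_get_info(g, mh, &info);                 // info.va may be aligned down by the driver
    char *buf = (char *)map_ptr + (d_buf - info.va);
    char msg[16] = "hello, GPU";
    gdr_copy_to_mapping(mh, buf, msg, sizeof(msg));     // fast CPU -> GPU write
    gdr_copy_from_mapping(mh, msg, buf, sizeof(msg));   // slower GPU -> CPU read
    printf("read back: %s\n", msg);
    gdr_unmap(g, mh, map_ptr, size);
    gdr_unpin_buffer(g, mh);
    gdr_close(g);
    cuMemFree(d_buf);
    cuCtxDestroy(ctx);
    return 0;
}
```
The file name and build line are illustrative; such a program would be compiled
against `libgdrapi` and the CUDA driver library, e.g. `gcc example.c -lgdrapi -lcuda`.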
## Requirements
GPUDirect RDMA requires NVIDIA Tesla or Quadro class GPUs based on Kepler,
Pascal, Volta, or Turing, see [GPUDirect
RDMA](http://developer.nvidia.com/gpudirect). For more technical information,
please refer to the official GPUDirect RDMA [design
document](http://docs.nvidia.com/cuda/gpudirect-rdma).
The device driver requires GPU display driver >= 418.40 on ppc64le and >= 331.14 on other platforms. The library and tests
require CUDA >= 6.0.
DKMS is a prerequisite for installing the GDRCopy kernel module package. On RHEL
and SLE, however, users have the option of building a kmod package and installing
it instead of the DKMS package. See the [Build and installation](#build-and-installation) section for more details.
```shell
# On RHEL
# dkms can be installed from epel-release. See https://fedoraproject.org/wiki/EPEL.
$ sudo yum install dkms
# On Debian - No additional dependency
# On SLE / Leap
# On SLE dkms can be installed from PackageHub.
$ sudo zypper install dkms rpmbuild
```
CUDA and the GPU display driver must be installed before building and/or installing GDRCopy.
Installation instructions can be found at https://developer.nvidia.com/cuda-downloads.
GPU display driver header files are also required. They are installed as part
of the driver (or CUDA) installation when using the *runfile* installer. If you install the driver
via package management, we suggest:
- On RHEL, `sudo dnf module install nvidia-driver:latest-dkms`.
- On Debian, `sudo apt install nvidia-dkms-<your-nvidia-driver-version>`.
- On SLE, `sudo zypper install nvidia-gfx<your-nvidia-driver-version>-kmp`.
The supported architectures are Linux x86\_64, ppc64le, and arm64. The supported
platforms are RHEL8, RHEL9, Ubuntu20\_04, Ubuntu22\_04,
SLE-15 (any SP) and Leap 15.x.
Root privileges are necessary to load/install the kernel-mode device
driver.
## Build and installation
We provide three ways to build and install GDRCopy.
### rpm package
```shell
# For RHEL:
$ sudo yum groupinstall 'Development Tools'
$ sudo yum install dkms rpm-build make
# For SLE:
$ sudo zypper in dkms rpmbuild
$ cd packages
$ CUDA=<cuda-install-top-dir> ./build-rpm-packages.sh
$ sudo rpm -Uvh gdrcopy-kmod-<version>dkms.noarch.<platform>.rpm
$ sudo rpm -Uvh gdrcopy-<version>.<arch>.<platform>.rpm
$ sudo rpm -Uvh gdrcopy-devel-<version>.noarch.<platform>.rpm
```
The DKMS package is the default kernel module package that `build-rpm-packages.sh`
generates. To create a kmod package instead, pass the `-m` option to the script.
Unlike the DKMS package, the kmod package contains a prebuilt GDRCopy kernel
module which is specific to the NVIDIA driver version and the Linux kernel
version used to build it.
### deb package
```shell
$ sudo apt install build-essential devscripts debhelper fakeroot pkg-config dkms
$ cd packages
$ CUDA=<cuda-install-top-dir> ./build-deb-packages.sh
$ sudo dpkg -i gdrdrv-dkms_<version>_<arch>.<platform>.deb
$ sudo dpkg -i libgdrapi_<version>_<arch>.<platform>.deb
$ sudo dpkg -i gdrcopy-tests_<version>_<arch>.<platform>.deb
$ sudo dpkg -i gdrcopy_<version>_<arch>.<platform>.deb
```
### from source
```shell
$ make prefix=<install-to-this-location> CUDA=<cuda-install-top-dir> all install
$ sudo ./insmod.sh
```
### Notes
Compiling the gdrdrv driver requires the NVIDIA driver source code, which is typically installed at
`/usr/src/nvidia-<version>`. Our make file automatically detects and picks that source code. In case there are multiple
versions installed, it is possible to pass the correct path by defining the NVIDIA_SRC_DIR variable, e.g. `export
NVIDIA_SRC_DIR=/usr/src/nvidia-520.61.05/nvidia` before building the gdrdrv module.
There are two major flavors of NVIDIA driver: 1) proprietary, and 2)
[opensource](https://developer.nvidia.com/blog/nvidia-releases-open-source-gpu-kernel-modules/). We detect the flavor
when compiling gdrdrv based on the source code of the NVIDIA driver. Different flavors come with different features and
restrictions:
- gdrdrv compiled with the opensource flavor will provide functionality and high performance on all platforms. However,
you will not be able to load this gdrdrv driver when the proprietary NVIDIA driver is loaded.
- gdrdrv compiled with the proprietary flavor can always be loaded regardless of the flavor of NVIDIA driver you have
loaded. However, it may have suboptimal performance on coherent platforms such as Grace-Hopper. Functionally, it will not
work correctly on Intel CPUs with Linux kernel built with confidential compute (CC) support, i.e.
`CONFIG_ARCH_HAS_CC_PLATFORM=y`, *WHEN* CC is enabled at runtime.
## Tests
Execute provided tests:
```shell
$ gdrcopy_sanity
Total: 28, Passed: 28, Failed: 0, Waived: 0
List of passed tests:
basic_child_thread_pins_buffer_cumemalloc
basic_child_thread_pins_buffer_vmmalloc
basic_cumemalloc
basic_small_buffers_mapping
basic_unaligned_mapping
basic_vmmalloc
basic_with_tokens
data_validation_cumemalloc
data_validation_vmmalloc
invalidation_access_after_free_cumemalloc
invalidation_access_after_free_vmmalloc
invalidation_access_after_gdr_close_cumemalloc
invalidation_access_after_gdr_close_vmmalloc
invalidation_fork_access_after_free_cumemalloc
invalidation_fork_access_after_free_vmmalloc
invalidation_fork_after_gdr_map_cumemalloc
invalidation_fork_after_gdr_map_vmmalloc
invalidation_fork_child_gdr_map_parent_cumemalloc
invalidation_fork_child_gdr_map_parent_vmmalloc
invalidation_fork_child_gdr_pin_parent_with_tokens
invalidation_fork_map_and_free_cumemalloc
invalidation_fork_map_and_free_vmmalloc
invalidation_two_mappings_cumemalloc
invalidation_two_mappings_vmmalloc
invalidation_unix_sock_shared_fd_gdr_map_cumemalloc
invalidation_unix_sock_shared_fd_gdr_map_vmmalloc
invalidation_unix_sock_shared_fd_gdr_pin_buffer_cumemalloc
invalidation_unix_sock_shared_fd_gdr_pin_buffer_vmmalloc
$ gdrcopy_copybw
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
testing size: 131072
rounded size: 131072
gpu alloc fn: cuMemAlloc
device ptr: 7f1153a00000
map_d_ptr: 0x7f1172257000
info.va: 7f1153a00000
info.mapped_size: 131072
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer:0x7f1172257000
writing test, size=131072 offset=0 num_iters=10000
write BW: 9638.54MB/s
reading test, size=131072 offset=0 num_iters=100
read BW: 530.135MB/s
unmapping buffer
unpinning buffer
closing gdrdrv
$ gdrcopy_copylat
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
device ptr: 0x7fa2c6000000
allocated size: 16777216
gpu alloc fn: cuMemAlloc
map_d_ptr: 0x7fa2f9af9000
info.va: 7fa2c6000000
info.mapped_size: 16777216
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer: 0x7fa2f9af9000
gdr_copy_to_mapping num iters for each size: 10000
WARNING: Measuring the API invocation overhead as observed by the CPU. Data
might not be ordered all the way to the GPU internal visibility.
Test Size(B) Avg.Time(us)
gdr_copy_to_mapping 1 0.0889
gdr_copy_to_mapping 2 0.0884
gdr_copy_to_mapping 4 0.0884
gdr_copy_to_mapping 8 0.0884
gdr_copy_to_mapping 16 0.0905
gdr_copy_to_mapping 32 0.0902
gdr_copy_to_mapping 64 0.0902
gdr_copy_to_mapping 128 0.0952
gdr_copy_to_mapping 256 0.0983
gdr_copy_to_mapping 512 0.1176
gdr_copy_to_mapping 1024 0.1825
gdr_copy_to_mapping 2048 0.2549
gdr_copy_to_mapping 4096 0.4366
gdr_copy_to_mapping 8192 0.8141
gdr_copy_to_mapping 16384 1.6155
gdr_copy_to_mapping 32768 3.2284
gdr_copy_to_mapping 65536 6.4906
gdr_copy_to_mapping 131072 12.9761
gdr_copy_to_mapping 262144 25.9459
gdr_copy_to_mapping 524288 51.9100
gdr_copy_to_mapping 1048576 103.8028
gdr_copy_to_mapping 2097152 207.5990
gdr_copy_to_mapping 4194304 415.2856
gdr_copy_to_mapping 8388608 830.6355
gdr_copy_to_mapping 16777216 1661.3285
gdr_copy_from_mapping num iters for each size: 100
Test Size(B) Avg.Time(us)
gdr_copy_from_mapping 1 0.9069
gdr_copy_from_mapping 2 1.7170
gdr_copy_from_mapping 4 1.7169
gdr_copy_from_mapping 8 1.7164
gdr_copy_from_mapping 16 0.8601
gdr_copy_from_mapping 32 1.7024
gdr_copy_from_mapping 64 3.1016
gdr_copy_from_mapping 128 3.4944
gdr_copy_from_mapping 256 3.6400
gdr_copy_from_mapping 512 2.4394
gdr_copy_from_mapping 1024 2.8022
gdr_copy_from_mapping 2048 4.6615
gdr_copy_from_mapping 4096 7.9783
gdr_copy_from_mapping 8192 14.9209
gdr_copy_from_mapping 16384 28.9571
gdr_copy_from_mapping 32768 56.9373
gdr_copy_from_mapping 65536 114.1008
gdr_copy_from_mapping 131072 234.9382
gdr_copy_from_mapping 262144 496.4011
gdr_copy_from_mapping 524288 985.5196
gdr_copy_from_mapping 1048576 1970.7057
gdr_copy_from_mapping 2097152 3942.5611
gdr_copy_from_mapping 4194304 7888.9468
gdr_copy_from_mapping 8388608 18361.5673
gdr_copy_from_mapping 16777216 36758.8342
unmapping buffer
unpinning buffer
closing gdrdrv
$ gdrcopy_apiperf -s 8
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
device ptr: 0x7f1563a00000
allocated size: 65536
Size(B) pin.Time(us) map.Time(us) get_info.Time(us) unmap.Time(us) unpin.Time(us)
65536 1346.034060 3.603800 0.340270 4.700930 676.612800
Histogram of gdr_pin_buffer latency for 65536 bytes
[1303.852000 - 2607.704000] 93
[2607.704000 - 3911.556000] 0
[3911.556000 - 5215.408000] 0
[5215.408000 - 6519.260000] 0
[6519.260000 - 7823.112000] 0
[7823.112000 - 9126.964000] 0
[9126.964000 - 10430.816000] 0
[10430.816000 - 11734.668000] 0
[11734.668000 - 13038.520000] 0
[13038.520000 - 14342.372000] 2
closing gdrdrv
$ numactl -N 1 -l gdrcopy_pplat
GPU id:0; name: NVIDIA A40; Bus id: 0000:09:00
selecting device 0
device ptr: 0x7f99d2600000
gpu alloc fn: cuMemAlloc
map_d_ptr: 0x7f9a054fb000
info.va: 7f99d2600000
info.mapped_size: 4
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer: 0x7f9a054fb000
CPU does gdr_copy_to_mapping and GPU writes back via cuMemHostAlloc'd buffer.
Running 1000 iterations with data size 4 bytes.
Round-trip latency per iteration is 1.08762 us
unmapping buffer
unpinning buffer
closing gdrdrv
```
## NUMA effects
Depending on the platform architecture, like where the GPU are placed in
the PCIe topology, performance may suffer if the processor which is driving
the copy is not the one which is hosting the GPU, for example in a
multi-socket server.
In the example below, GPU ID 0 is hosted by
CPU socket 0. By explicitly playing with the OS process and memory
affinity, it is possible to run the test onto the optimal processor:
```shell
$ numactl -N 0 -l gdrcopy_copybw -d 0 -s $((64 * 1024)) -o $((0 * 1024)) -c $((64 * 1024))
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
testing size: 65536
rounded size: 65536
gpu alloc fn: cuMemAlloc
device ptr: 7f5817a00000
map_d_ptr: 0x7f583b186000
info.va: 7f5817a00000
info.mapped_size: 65536
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer:0x7f583b186000
writing test, size=65536 offset=0 num_iters=1000
write BW: 9768.3MB/s
reading test, size=65536 offset=0 num_iters=1000
read BW: 548.423MB/s
unmapping buffer
unpinning buffer
closing gdrdrv
```
or on the other socket:
```shell
$ numactl -N 1 -l gdrcopy_copybw -d 0 -s $((64 * 1024)) -o $((0 * 1024)) -c $((64 * 1024))
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
testing size: 65536
rounded size: 65536
gpu alloc fn: cuMemAlloc
device ptr: 7fbb63a00000
map_d_ptr: 0x7fbb82ab0000
info.va: 7fbb63a00000
info.mapped_size: 65536
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer:0x7fbb82ab0000
writing test, size=65536 offset=0 num_iters=1000
write BW: 9224.36MB/s
reading test, size=65536 offset=0 num_iters=1000
read BW: 521.262MB/s
unmapping buffer
unpinning buffer
closing gdrdrv
```
## Restrictions and known issues
GDRCopy works with regular CUDA device memory only, as returned by cudaMalloc.
In particular, it does not work with CUDA managed memory.
`gdr_pin_buffer()` accepts any address returned by cudaMalloc and its family.
In contrast, `gdr_map()` requires that the pinned address be aligned to the GPU page size.
Neither the CUDA Runtime nor the Driver API guarantees that GPU memory allocation
functions return aligned addresses. Users are responsible for properly aligning the
addresses passed to the library; one possible approach is sketched below.
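As a hedged illustration (not part of the library), one common approach is to
over-allocate by one GPU page and round the returned address up to the next
page boundary using the `GPU_PAGE_SIZE`, `GPU_PAGE_OFFSET`, and `GPU_PAGE_MASK`
macros from `gdrapi.h`; the helper name below is hypothetical.
```c
#include <cuda.h>
#include "gdrapi.h"
// Illustrative helper (not part of GDRCopy): over-allocate by one GPU page and
// round the returned address up to the next GPU_PAGE_SIZE boundary.
// *raw keeps the original pointer so it can later be released with cuMemFree().
static CUdeviceptr alloc_gpu_page_aligned(size_t size, CUdeviceptr *raw)
{
    cuMemAlloc(raw, size + GPU_PAGE_SIZE);            // error checking omitted
    return (*raw + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;  // aligned address to pin and map
}
```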
Two cudaMalloc'd memory regions may be contiguous. Users may call
`gdr_pin_buffer` and `gdr_map` with an address and size that extend across these
two regions. This use case is not well supported in GDRCopy. On rare occasions,
users may experience 1.) an error in `gdr_map`, or 2.) low copy performance
because `gdr_map` cannot provide a write-combined mapping.
In some GPU driver versions, pinning the same GPU address multiple times
consumes additional BAR1 space because the space is not properly reused. If you
encounter this issue, we suggest trying the latest version of the NVIDIA GPU
driver.
On POWER9, where the CPU and GPU are connected via NVLink, CUDA 9.2 and GPU driver
v396.37 are the minimum requirements for achieving full performance.
GDRCopy works with earlier CUDA and GPU driver versions, but the achievable
bandwidth is substantially lower.
If gdrdrv is compiled with the proprietary flavor of the NVIDIA driver, GDRCopy does not fully support Linux with the
confidential computing (CC) configuration on Intel CPUs. In particular, it does not function if
`CONFIG_ARCH_HAS_CC_PLATFORM=y` and CC is enabled at runtime. However, it works if CC is disabled or
`CONFIG_ARCH_HAS_CC_PLATFORM=n`. This issue does not apply to AMD CPUs. To avoid it, please compile and load
gdrdrv with the opensource flavor of the NVIDIA driver.
To allow the loading of unsupported 3rd-party modules in SLE, set `allow_unsupported_modules 1` in
/etc/modprobe.d/unsupported-modules. After making this change, modules missing the "supported" flag will be allowed to
load.
## Bug filing
For reporting issues you may be having using any of NVIDIA software or
reporting suspected bugs we would recommend you use the bug filing system
which is available to NVIDIA registered developers on the developer site.
If you are not a member you can [sign
up](https://developer.nvidia.com/accelerated-computing-developer).
Once a member you can submit issues using [this
form](https://developer.nvidia.com/nvbugs/cuda/add). Be sure to select
GPUDirect in the "Relevant Area" field.
You can later track their progress using the __My Bugs__ link on the left of
this [view](https://developer.nvidia.com/user).
## Acknowledgment
If you find this software useful in your work, please cite:
R. Shi et al., "Designing efficient small message transfer mechanism for inter-node MPI communication on InfiniBand GPU clusters," 2014 21st International Conference on High Performance Computing (HiPC), Dona Paula, 2014, pp. 1-10, doi: 10.1109/HiPC.2014.7116873.

46
gdrcopy/config_arch Executable file
View File

@ -0,0 +1,46 @@
#!/bin/bash
# Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
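# Detect the host CPU architecture by compiling and running a small probe
# program against gdrconfig.h; prints X86, POWER, or ARM64.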
topdir="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
dir=$(mktemp -d)
src=$dir/arch.c
exe=$dir/arch
cat <<EOF >$src
#include <stdio.h>
#include "gdrconfig.h"
int main(int argc, char *argv[])
{
#ifdef GDRAPI_X86
printf("X86\n");
#elif defined(GDRAPI_POWER)
printf("POWER\n");
#elif defined(GDRAPI_ARM64)
printf("ARM64\n");
#else
printf("ERROR\n");
#endif
return 0;
}
EOF
gcc -I ${topdir}/include -I ${topdir}/src $src -o $exe
$exe
rm -rf $dir

154
gdrcopy/include/gdrapi.h Normal file
View File

@ -0,0 +1,154 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __GDRAPI_H__
#define __GDRAPI_H__
#include <stdint.h> // for standard [u]intX_t types
#include <stddef.h>
#define MAJOR_VERSION_SHIFT 16
#define MINOR_VERSION_MASK (((uint32_t)1 << MAJOR_VERSION_SHIFT) - 1)
#define GDR_API_MAJOR_VERSION 2
#define GDR_API_MINOR_VERSION 4
#define GDR_API_VERSION ((GDR_API_MAJOR_VERSION << MAJOR_VERSION_SHIFT) | GDR_API_MINOR_VERSION)
#define MINIMUM_GDRDRV_MAJOR_VERSION 2
#define MINIMUM_GDRDRV_MINOR_VERSION 0
#define MINIMUM_GDRDRV_VERSION ((MINIMUM_GDRDRV_MAJOR_VERSION << MAJOR_VERSION_SHIFT) | MINIMUM_GDRDRV_MINOR_VERSION)
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
/*
* GDRCopy, a low-latency GPU memory copy library (and a kernel-mode
* driver) based on NVIDIA GPUDirect RDMA technology.
*
* supported environment variables:
*
* - GDRCOPY_ENABLE_LOGGING, if defined logging is enabled, default is
* disabled.
*
* - GDRCOPY_LOG_LEVEL, overrides log threshold, default is to print errors
* only.
*/
#ifdef __cplusplus
extern "C" {
#endif
struct gdr;
typedef struct gdr *gdr_t;
// Initialize the library, e.g. by opening a connection to the kernel-mode
// driver. Returns a handle to the library state object.
gdr_t gdr_open(void);
// Destroy the library state object, e.g. close the connection to the kernel-mode
// driver.
int gdr_close(gdr_t g);
// The handle to a user-space GPU memory mapping
typedef struct gdr_mh_s {
unsigned long h;
} gdr_mh_t;
// Create a peer-to-peer mapping of the device memory buffer, returning an opaque handle.
// Note that at this point the mapping is still not accessible to user-space.
int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
// Destroys the peer-to-peer mapping and frees the handle.
//
// If there exists a corresponding user-space mapping, gdr_unmap should be
// called before this one.
int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
// flag is set when the kernel callback (registered via
// nvidia_p2p_get_pages) has been invoked, e.g. because cuMemFree() was called
// before gdr_unpin_buffer.
int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag);
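// Type of the CPU mapping backing a pinned buffer, as reported in
// gdr_info_v2.mapping_type (e.g. GDR_MAPPING_TYPE_WC for a write-combined mapping).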
typedef enum gdr_mapping_type {
GDR_MAPPING_TYPE_NONE = 0,
GDR_MAPPING_TYPE_WC = 1,
GDR_MAPPING_TYPE_CACHING = 2,
GDR_MAPPING_TYPE_DEVICE = 3
} gdr_mapping_type_t;
// After pinning, info struct contains details of the mapped area.
//
// Note that both info->va and info->mapped_size might be different from
// the original address and size passed to gdr_pin_buffer due to the alignment
// performed in the kernel-mode driver.
typedef struct gdr_info_v2 {
uint64_t va;
uint64_t mapped_size;
uint32_t page_size;
// tm_cycles and cycles_per_ms are deprecated and will be removed in the future.
uint64_t tm_cycles;
uint32_t cycles_per_ms;
unsigned mapped:1;
unsigned wc_mapping:1;
gdr_mapping_type_t mapping_type;
} gdr_info_v2_t;
typedef gdr_info_v2_t gdr_info_t;
int gdr_get_info_v2(gdr_t g, gdr_mh_t handle, gdr_info_v2_t *info);
#define gdr_get_info gdr_get_info_v2
// Create a user-space mapping of the memory handle.
//
// WARNING: the address could potentially be aligned to the page-size boundary
// before being mapped in user-space, so the pointer returned might be
// affected by an offset. gdr_get_info can be used to calculate that
// offset.
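// Typically: offset = original_pinned_address - info.va; usable pointer = (char *)(*va) + offset.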
int gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
// Destroy a user-space mapping.
// Invoke gdr_unmap() first, then gdr_unpin_buffer().
int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
// map_d_ptr is the user-space virtual address belonging to a mapping of a device memory buffer,
// i.e. one returned by gdr_map()
//
// WARNING: Both integrity and ordering of data as observed by pre-launched GPU
// work is not guaranteed by this API. For more information, see
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#sync-behavior
int gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
int gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
// Query the version of libgdrapi
void gdr_runtime_get_version(int *major, int *minor);
// Query the version of gdrdrv driver
int gdr_driver_get_version(gdr_t g, int *major, int *minor);
#ifdef __cplusplus
}
#endif
#endif // __GDRAPI_H__

View File

@ -0,0 +1,15 @@
#pragma once
#if defined __GNUC__
#if defined(__powerpc__)
#define GDRAPI_POWER
#elif defined(__aarch64__)
#define GDRAPI_ARM64
#elif defined(__i386__) || defined(__x86_64__) || defined(__X86__)
#define GDRAPI_X86
#else
#error "architecture is not supported"
#endif // arch
#else
#error "compiler not supported"
#endif // __GNUC__

41
gdrcopy/insmod.sh Executable file
View File

@ -0,0 +1,41 @@
#!/bin/bash
# Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
THIS_DIR=$(dirname $0)
# remove driver
grep gdrdrv /proc/devices >/dev/null && sudo /sbin/rmmod gdrdrv
# insert driver
sudo /sbin/insmod src/gdrdrv/gdrdrv.ko dbg_enabled=0 info_enabled=0 use_persistent_mapping=0
# create device inodes
major=`fgrep gdrdrv /proc/devices | cut -b 1-4`
echo "INFO: driver major is $major"
# remove old inodes just in case
if [ -e /dev/gdrdrv ]; then
sudo rm /dev/gdrdrv
fi
echo "INFO: creating /dev/gdrdrv inode"
sudo mknod /dev/gdrdrv c $major 0
sudo chmod a+w+r /dev/gdrdrv

View File

@ -0,0 +1,247 @@
#!/bin/bash
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# Restart this number at 1 if MAJOR_VERSION or MINOR_VERSION changes
# See https://www.debian.org/doc/debian-policy/ch-controlfields.html#version
DEBIAN_VERSION=1
SCRIPT_DIR_PATH="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
TOP_DIR_PATH="${SCRIPT_DIR_PATH}/.."
CWD=$(pwd)
skip_dep_check=0
build_test_package=1
build_driver_package=1
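# Echo the given command, run it, and exit with its status if it fails.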
ex()
{
local rc
echo "+ $@"
$@
rc=$?
if [[ $rc -ne 0 ]]; then
echo "Failed with error $rc to execute: $@" >&2
exit $rc
fi
}
function show_help
{
echo "Usage: [CUDA=<path>] $0 [-d] [-t] [-k] [-h]"
echo ""
echo " CUDA=<path> Set your installed CUDA path (ex. /usr/local/cuda)."
echo " -d Don't check build dependencies. Use my environment variables such as C_INCLUDE_PATH instead."
echo " -t Skip building gdrcopy-tests package."
echo " -k Skip building gdrdrv-dkms package."
echo " -h Show this help text."
echo ""
}
OPTIND=1 # Reset in case getopts has been used previously in the shell.
while getopts "hdtk" opt; do
case "${opt}" in
h)
show_help
exit 0
;;
d) skip_dep_check=1
;;
t) build_test_package=0
;;
k) build_driver_package=0
;;
esac
done
shift $((OPTIND-1))
if [[ ${build_test_package} == 1 ]] && [ "X$CUDA" == "X" ]; then
echo "CUDA environment variable is not defined"; exit 1
fi
NVCC=${CUDA}/bin/nvcc
CUDA_VERSION=`$NVCC --version | grep release | sed 's/^.*release \([0-9]\+\.[0-9]\+\).*/\1/'`
CUDA_MAJOR=`echo ${CUDA_VERSION} | cut -d "." -f 1`
CUDA_MINOR=`echo ${CUDA_VERSION} | cut -d "." -f 2`
echo "Building debian package for the gdrcopy library ..."
ex cd ${SCRIPT_DIR_PATH}
MODULE_SUBDIR=$(awk '/MODULE_SUBDIR \?=/ { print $3 }' ${TOP_DIR_PATH}/src/gdrdrv/Makefile | tr -d '\n')
MAJOR_VERSION=$(awk '/#define GDR_API_MAJOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
MINOR_VERSION=$(awk '/#define GDR_API_MINOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
VERSION="${MAJOR_VERSION}.${MINOR_VERSION}.4"
if [ "X$VERSION" == "X" ]; then
echo "Failed to get version numbers!" >&2
exit 1
fi
#FULL_VERSION="${VERSION}-${DEBIAN_VERSION}"
FULL_VERSION="${VERSION}"
tmpdir=`mktemp -d /tmp/gdr.XXXXXX`
if [ ! -d "${tmpdir}" ]; then
echo "Failed to create a temp directory!" >&2
exit 1
fi
echo "Building gdrcopy debian packages version ${FULL_VERSION} ..."
echo "Working in ${tmpdir} ..."
ex cd ${TOP_DIR_PATH}
ex mkdir -p ${tmpdir}/gdrcopy
ex rm -rf ${tmpdir}/gdrcopy/*
ex cp -r Makefile README.md include src tests LICENSE config_arch ${tmpdir}/gdrcopy/
ex cp -r packages/debian-lib ${tmpdir}/gdrcopy/
ex cp -r packages/debian-tests ${tmpdir}/gdrcopy/
ex cp README.md ${tmpdir}/gdrcopy/debian-lib/README.Debian
ex cp README.md ${tmpdir}/gdrcopy/debian-lib/README.source
ex cp README.md ${tmpdir}/gdrcopy/debian-tests/README.Debian
ex cp README.md ${tmpdir}/gdrcopy/debian-tests/README.source
ex cd ${tmpdir}/gdrcopy
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
ex rm -f ${tmpdir}/libgdrapi_${VERSION}.orig.tar.gz
ex rm -f ${tmpdir}/gdrcopy-tests_${VERSION}.orig.tar.gz
ex cd ${tmpdir}
ex cp -r gdrcopy libgdrapi-${VERSION}
ex cd ${tmpdir}/libgdrapi-${VERSION}
ex mv debian-lib debian
ex rm -rf debian-*
ex cd ${tmpdir}
ex cp -r gdrcopy gdrcopy-tests-${VERSION}
ex cd ${tmpdir}/gdrcopy-tests-${VERSION}
ex mv debian-tests debian
ex rm -rf debian-*
ex cd ${tmpdir}
ex tar czvf libgdrapi_${VERSION}.orig.tar.gz libgdrapi-${VERSION}
ex tar czvf gdrcopy-tests_${VERSION}.orig.tar.gz gdrcopy-tests-${VERSION}
echo "Building libgdrapi package ..."
ex cd ${tmpdir}/libgdrapi-${VERSION}
debuild_params="--set-envvar=PKG_CONFIG_PATH=${PKG_CONFIG_PATH}"
if [ "${skip_dep_check}" -eq 1 ]; then
debuild_params+=" --preserve-env -d"
echo "Skip build dependency check. Use the environment variables instead ..."
fi
# --set-envvar needs to be placed before -us -uc
debuild_params+=" -us -uc"
ex debuild ${debuild_params}
if [[ ${build_test_package} == 1 ]]; then
echo
echo "Building gdrcopy-tests package ..."
ex cd ${tmpdir}/gdrcopy-tests-${VERSION}
debuild_params="--set-envvar=CUDA=${CUDA} --set-envvar=PKG_CONFIG_PATH=${PKG_CONFIG_PATH}"
if [ "${skip_dep_check}" -eq 1 ]; then
debuild_params+=" --preserve-env -d"
echo "Skip build dependency check. Use the environment variables instead ..."
fi
# --set-envvar needs to be placed before -us -uc
debuild_params+=" -us -uc"
ex debuild ${debuild_params}
fi
if [[ ${build_driver_package} == 1 ]]; then
echo
echo "Building gdrdrv-dkms package ..."
ex cd ${tmpdir}/gdrcopy/src/gdrdrv
ex make clean
dkmsdir="${tmpdir}/gdrdrv-dkms-${VERSION}"
ex mkdir -p ${dkmsdir}
ex cp -r ${tmpdir}/gdrcopy/src/gdrdrv ${dkmsdir}/gdrdrv-${VERSION}
ex rm -rf ${dkmsdir}/gdrdrv-${VERSION}/debian-*
ex cp ${SCRIPT_DIR_PATH}/dkms.conf ${dkmsdir}/gdrdrv-${VERSION}/
ex cp -r ${TOP_DIR_PATH}/scripts ${dkmsdir}/gdrdrv-${VERSION}
ex cd ${dkmsdir}
ex cp -r ${SCRIPT_DIR_PATH}/dkms/* .
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@MODULE_LOCATION@/${MODULE_SUBDIR//\//\\/}/g" {} +
ex cd ${tmpdir}
ex tar czvf gdrdrv-dkms_${VERSION}.orig.tar.gz gdrdrv-dkms-${VERSION}
ex cd ${dkmsdir}
ex dpkg-buildpackage -rfakeroot -d -F -us -uc
fi
echo
echo "Building gdrcopy package ..."
metadir=${tmpdir}/gdrcopy-${VERSION}
ex mkdir -p ${metadir}
ex cd ${TOP_DIR_PATH}
ex cp -r packages/debian-meta ${metadir}/debian
ex cp README.md ${metadir}/debian/README.Debian
ex cp README.md ${metadir}/debian/README.source
ex cd ${metadir}
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@MODULE_LOCATION@/${MODULE_SUBDIR//\//\\/}/g" {} +
ex cd ${tmpdir}
ex tar czvf gdrcopy_${VERSION}.orig.tar.gz gdrcopy-${VERSION}
cd ${metadir}
ex debuild -us -uc
echo
echo "Copying *.deb and supplementary files to the current working directory ..."
if $(hash lsb_release 2>/dev/null); then
release=`lsb_release -rs | sed -e "s/\./_/g"`
id=`lsb_release -is | sed -e "s/ /_/g"`
release=".${id}${release}"
else
release=""
fi
ex cd ${CWD}
for item in `ls ${tmpdir}/*.deb`; do
item_name=`basename $item`
item_name=`echo $item_name | sed -e "s/\.deb//g"`
if echo "$item_name" | grep -q "tests"; then
item_name="${item_name}${release}+cuda${CUDA_MAJOR}.${CUDA_MINOR}.deb"
else
item_name="${item_name}${release}.deb"
fi
ex cp $item ./${item_name}
done
ex cp ${tmpdir}/*.tar.* .
ex cp ${tmpdir}/*.dsc .
echo
echo "Cleaning up ..."
ex rm -rf ${tmpdir}

View File

@ -0,0 +1,185 @@
#!/bin/bash
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# Restart this number at 1 if MAJOR_VERSION or MINOR_VERSION changes
# See https://rpm-packaging-guide.github.io/#preamble-items
RPM_VERSION=1
SCRIPT_DIR_PATH="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
TOP_DIR_PATH="${SCRIPT_DIR_PATH}/.."
CWD=$(pwd)
ex()
{
local rc
echo "+ $@"
$@
rc=$?
if [[ $rc -ne 0 ]]; then
echo "Failed with error $rc to execute: $@" >&2
exit $rc
fi
}
function show_help
{
echo "This script is for generating GDRCopy RPM packages."
echo
echo "Usage: CUDA=<path> $0 [-m]"
echo
echo "Optional arguments:"
echo " -m Generate kmod package (default: no)."
echo
echo "Environment variables:"
echo " CUDA=<path> [Required] CUDA installation path (usually /usr/local/cuda)."
echo " NVIDIA_SRC_DIR=<path> [Optional] NVIDIA driver source directory (usually /usr/src/nvidia-<version>/nvidia)."
}
OPTIND=1 # Reset in case getopts has been used previously in the shell.
generate_kmod=0
while getopts "h?m" opt; do
case "$opt" in
h|\?)
show_help
exit 0
;;
m) generate_kmod=1
;;
esac
done
shift $((OPTIND-1))
NVCC=${CUDA}/bin/nvcc
CUDA_VERSION=`$NVCC --version | grep release | sed 's/^.*release \([0-9]\+\.[0-9]\+\).*/\1/'`
CUDA_MAJOR=`echo ${CUDA_VERSION} | cut -d "." -f 1`
CUDA_MINOR=`echo ${CUDA_VERSION} | cut -d "." -f 2`
if [ "X$CUDA" == "X" ]; then
echo "CUDA environment variable is not defined"
exit 1
fi
echo "Building rpm package ..."
ex cd ${SCRIPT_DIR_PATH}
MODULE_SUBDIR=$(awk '/MODULE_SUBDIR \?=/ { print $3 }' ${TOP_DIR_PATH}/src/gdrdrv/Makefile | tr -d '\n')
MAJOR_VERSION=$(awk '/#define GDR_API_MAJOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
MINOR_VERSION=$(awk '/#define GDR_API_MINOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
VERSION="${MAJOR_VERSION}.${MINOR_VERSION}.4"
if [ "X$VERSION" == "X" ]; then
echo "Failed to get version numbers!" >&2
exit 1
fi
FULL_VERSION="${VERSION}"
if [[ ${generate_kmod} == 1 ]]; then
if [ -z "${NVIDIA_SRC_DIR}" ]; then
NVIDIA_SRC_DIR=$(find /usr/src/kernel-modules/nvidia-* /usr/src/nvidia-* -name "nv-p2p.c" -print -quit 2>/dev/null)
if [ ${#NVIDIA_SRC_DIR} -gt 0 ]; then
NVIDIA_SRC_DIR=$(dirname ${NVIDIA_SRC_DIR})
fi
fi
if [ -d ${NVIDIA_SRC_DIR} ]; then
NVIDIA_DRIVER_VERSION=$(basename $(dirname ${NVIDIA_SRC_DIR}))
else
echo "NVIDIA_SRC_DIR=${NVIDIA_SRC_DIR}" >&2
echo "Failed to find NVIDIA driver!" >&2
exit 1
fi
fi
tmpdir=`mktemp -d /tmp/gdr.XXXXXX`
if [ ! -d "$tmpdir" ]; then
echo "Failed to create a temp directory!" >&2
exit 1
fi
echo "Building gdrcopy rpm packages version ${VERSION} ..."
echo "Working in $tmpdir ..."
ex cd ${TOP_DIR_PATH}
ex mkdir -p $tmpdir/gdrcopy
ex rm -rf $tmpdir/gdrcopy/*
ex cp -r packages/dkms.conf packages/rhel/init.d packages/rhel/gdrcopy.service scripts/ insmod.sh Makefile README.md include src tests config_arch LICENSE packages/gdrcopy.spec $tmpdir/gdrcopy/
ex rm -f $tmpdir/gdrcopy-$VERSION.tar.gz
ex cd $tmpdir/gdrcopy
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@MODULE_LOCATION@/${MODULE_SUBDIR//\//\\/}/g" {} +
ex cd $tmpdir
ex mv gdrcopy gdrcopy-$VERSION
ex tar czvf gdrcopy-$VERSION.tar.gz gdrcopy-$VERSION
ex mkdir -p $tmpdir/topdir/{SRPMS,RPMS,SPECS,BUILD,SOURCES}
ex cp gdrcopy-$VERSION/gdrcopy.spec $tmpdir/topdir/SPECS/
ex cp gdrcopy-$VERSION.tar.gz $tmpdir/topdir/SOURCES/
rpmbuild_params="-ba --nodeps --define '_build_id_links none' --define \"_topdir $tmpdir/topdir\" --define \"_release ${RPM_VERSION}\" --define 'dist %{nil}' --define \"CUDA $CUDA\" --define \"GDR_VERSION ${VERSION}\" --define \"KVERSION $(uname -r)\" --define \"MODULE_LOCATION ${MODULE_SUBDIR}\""
if [[ ${generate_kmod} == 1 ]]; then
rpmbuild_params="${rpmbuild_params} --define \"NVIDIA_DRIVER_VERSION ${NVIDIA_DRIVER_VERSION}\" --define \"NVIDIA_SRC_DIR ${NVIDIA_SRC_DIR}\" --define \"BUILD_KMOD 1\""
fi
rpmbuild_params="${rpmbuild_params} $tmpdir/topdir/SPECS/gdrcopy.spec"
eval "rpmbuild ${rpmbuild_params}"
rpms=`ls -1 $tmpdir/topdir/RPMS/*/*.rpm`
srpm=`ls -1 $tmpdir/topdir/SRPMS/`
if [ -f "/etc/redhat-release" ]; then
release_version=".el$(cat /etc/redhat-release | grep -o -E '[0-9]+' | head -1)"
elif [ -f "/etc/centos-release" ]; then
release_version=".el$(cat /etc/centos-release | grep -o -E '[0-9]+' | head -1)"
elif [ -f "/etc/os-release" ]; then
release_version=$(source /etc/os-release && echo ".$ID-$VERSION_ID")
else
release_version="unknown_distro"
fi
echo $srpm $rpms
ex cd ${CWD}
for item in `ls $tmpdir/topdir/SRPMS/*.rpm $tmpdir/topdir/RPMS/*/*.rpm`; do
item_name=`basename $item .rpm`
arch=$(sed -ne 's/.*\(\.[^\.]\+\)$/\1/p' <<< $item_name)
item_name=`basename $item_name $arch`
if [ "$item_name" == "gdrcopy-${FULL_VERSION}-${RPM_VERSION}.`uname -m`" ]; then
item_name="${item_name}${release_version}+cuda${CUDA_MAJOR}.${CUDA_MINOR}.${arch}.rpm"
else
item_name="${item_name}${release_version}${arch}.rpm"
fi
ex cp $item ./${item_name}
done
echo
echo "Cleaning up ..."
ex rm -rf ${tmpdir}

View File

@ -0,0 +1,44 @@
libgdrapi (2.4.4) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
libgdrapi (2.4.3) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
libgdrapi (2.4.2) stable; urgency=low
* Fix memory leak in gdr_pin_buffer.
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
libgdrapi (2.4.1) stable; urgency=low
* No change
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
libgdrapi (2.4) stable; urgency=low
* Introduce gdr_get_info_v2.
* Introduce new copy algorithm for device mappings.
* Add support for NVIDIA BLUEFIELD-3.
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
libgdrapi (2.3.1) stable; urgency=low
* No change
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
libgdrapi (2.3) stable; urgency=low
* Initial version of libgdrapi package -- was a part of gdrcopy package.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700

View File

@ -0,0 +1 @@
9

View File

@ -0,0 +1,19 @@
Source: libgdrapi
Priority: optional
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Build-Depends: debhelper (>= 9)
Standards-Version: @FULL_VERSION@
Section: libs
Homepage: https://github.com/NVIDIA/gdrcopy
#Vcs-Git: https://anonscm.debian.org/git/collab-maint/gdrcopy.git
#Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/gdrcopy.git
Package: libgdrapi
Architecture: any
Multi-Arch: same
Depends: ${shlibs:Depends}, ${misc:Depends}
Replaces: gdrcopy (<= 2.2-1)
Conflicts: gdrcopy (<= 2.2-1)
Description: A low-latency GPU memory copy library
A low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA technology.

View File

@ -0,0 +1,25 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: gdrcopy
Source: https://github.com/NVIDIA/gdrcopy
Files: *
Copyright: 2019-2021 NVIDIA CORPORATION. All rights reserved.
License: MIT
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,2 @@
README.Debian
README.source

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,27 @@
#!/usr/bin/make -f
# See debhelper(7) (uncomment to enable)
# output every command that modifies files on the build system.
#export DH_VERBOSE = 1
# see FEATURE AREAS in dpkg-buildflags(1)
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
# see ENVIRONMENT in dpkg-buildflags(1)
# package maintainers to append CFLAGS
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
# package maintainers to append LDFLAGS
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
%:
dh $@
# dh_make generated override targets
# This is example for Cmake (See https://bugs.debian.org/641051 )
override_dh_auto_build:
dh_auto_build -- lib
override_dh_auto_install:
$(MAKE) DESTDIR=$(CURDIR)/debian/libgdrapi prefix=/usr libdir=/usr/lib/$(DEB_HOST_MULTIARCH) lib_install

View File

@ -0,0 +1 @@
3.0 (quilt)

View File

@ -0,0 +1,105 @@
gdrcopy (2.4.4) stable; urgency=low
* Fix the use-after-free bug of mr objects in gdrdv_vma_close.
* Fix the resource leakage bug in gdrdrv_release.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
gdrcopy (2.4.3) stable; urgency=low
* Fix NVIDIA_IS_OPENSOURCE detection when compile with NVIDIA driver version 545 or newer.
* Fix compile error in gdrdrv when compile on RHEL9.5.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
gdrcopy (2.4.2) stable; urgency=low
* Fix the size alignment bug in gdrdrv.
* Fix memory leak in gdr_pin_buffer.
* Add support for another flavor of BF3.
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
gdrcopy (2.4.1) stable; urgency=low
* Add support for persistent mapping.
* Fix bug in src/gdrdrv/Makefile.
* Fix compile-time bug when check.h is not found.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
gdrcopy (2.4) stable; urgency=low
* Various bug fixes in the test and benchmark applications.
* Prefix all applications with "gdrcopy_".
* Introduce more unit tests in gdrcopy_sanity.
* Introduce gdrcopy_pplat benchmark application.
* Remove dependency on libcheck and libsubunit
* Introduce gdr_get_info_v2.
* Introduce new copy algorithm for device mappings.
* Add support for NVIDIA BLUEFIELD-3.
* Add support for Linux kernel >= 6.3.
* Relicense gdrdrv to Dual MIT/GPL.
* Fix bugs in gdrdrv when pinning two small buffers back-to-back.
* Add support for coherent platforms such as Grace-Hopper.
* Add support for Confidential Computing (CC).
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
gdrcopy (2.3.1) stable; urgency=low
* Add a workaround for the GPL-compatibility issue when compile with CONFIG_ARCH_HAS_CC_PLATFORM on Linux kernel 5.18+.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
gdrcopy (2.3) stable; urgency=low
* Convert to meta package.
* Declare dependency with gdrdrv-dkms, libgdrapi, and gdrcopy-tests.
* Update the package maintainer.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700
gdrcopy (2.2) stable; urgency=low
* Add support for ARM64.
* Update various information on README.
* Improve Makefile.
* Add multi-arch support.
* Handle removal of HAVE_UNLOCKED_IOCTL in Linux kernel v5.9 and later.
* Prevent dpkg package creation to unnecessarily compile gdrdrv.
* Improve gdr_open error message.
* Fix bug that prevents sanity from correctly summarizing failure.
* Add dkms support in kmod package.
* Handle the removal of kzfree in Linux kernel v5.10 and later.
* Improve small-size copy-to-mapping.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 01 Feb 2021 11:59:59 -0700
gdrcopy (2.1) stable; urgency=low
* fix build problem on RHL8 kernels
* relax checks in gdrdrv to support multi-threading use cases
* fix fd leak in gdr_open()
* Introduce copylat test application.
* Introduce basic_with_tokens and invalidation_fork_child_gdr_pin_parent_with_tokens sub-tests in sanity.
* Remove the dependency with libcudart.so.
* Clean up the code in the tests folder.
* Change the package maintainer to Davide Rossetti.
-- Davide Rossetti <drossetti@nvidia.com> Mon, 02 Mar 2020 11:59:59 -0700
gdrcopy (2.0) stable; urgency=low
* Improve copy performance with unrolling in POWERPC.
* Create sanity unit test for testing the functionality and security.
* Consolidate basic and validate into sanity unit test.
* Introduce compile time and runtime version checking in libgdrapi.
* Improve rpm packaging.
* Introduce deb packaging for the userspace library and the applications.
* Introduce dkms packaging for the gdrdrv driver.
* Rename gdr_copy_from/to_bar to gdr_copy_from/to_mapping.
* Update README
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Sep 2019 11:59:59 -0700

View File

@ -0,0 +1 @@
9

View File

@ -0,0 +1,17 @@
Source: gdrcopy
Priority: optional
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Build-Depends: debhelper (>= 9)
Standards-Version: @FULL_VERSION@
Section: misc
Homepage: https://github.com/NVIDIA/gdrcopy
Package: gdrcopy
Architecture: any
Multi-Arch: same
Depends: gdrdrv-dkms (= @FULL_VERSION@), libgdrapi (= @FULL_VERSION@), gdrcopy-tests (= @FULL_VERSION@)
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Description: GDRCopy meta-package
Meta-package for GDRCopy, a low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA technology.

View File

@ -0,0 +1,25 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: gdrcopy
Source: https://github.com/NVIDIA/gdrcopy
Files: *
Copyright: 2019-2021 NVIDIA CORPORATION. All rights reserved.
License: MIT
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,44 @@
#!/usr/bin/make -f
# See debhelper(7) (uncomment to enable)
# output every command that modifies files on the build system.
#export DH_VERBOSE = 1
# see FEATURE AREAS in dpkg-buildflags(1)
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
# see ENVIRONMENT in dpkg-buildflags(1)
# package maintainers to append CFLAGS
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
# package maintainers to append LDFLAGS
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
build build-arch build-indep:
clean:
dh_testdir
dh_clean
install: build
dh_testdir
dh_testroot
dh_prep
binary-arch: install
binary-indep: install
dh_testdir
dh_testroot
dh_install
dh_installdocs
dh_installchangelogs
dh_compress
dh_fixperms
dh_installdeb
dh_gencontrol
dh_md5sums
dh_builddeb
binary: binary-indep binary-arch
.PHONY: build clean binary-indep binary-arch binary install

View File

@ -0,0 +1 @@
3.0 (quilt)

View File

@ -0,0 +1,47 @@
gdrcopy-tests (2.4.4) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
gdrcopy-tests (2.4.3) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
gdrcopy-tests (2.4.2) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
gdrcopy-tests (2.4.1) stable; urgency=low
* Fix compile-time bug when check.h is not found.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
gdrcopy-tests (2.4) stable; urgency=low
* Various bug fixes in the test and benchmark applications.
* Prefix all applications with "gdrcopy_".
* Introduce more unit tests in gdrcopy_sanity.
* Introduce gdrcopy_pplat benchmark application.
* Remove dependency on libcheck and libsubunit
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
gdrcopy-tests (2.3.1) stable; urgency=low
* No change
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
gdrcopy-tests (2.3) stable; urgency=low
* Initial version of gdrcopy-tests package -- was a part of gdrcopy package.
* Add apiperf test.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700

View File

@ -0,0 +1 @@
9

View File

@ -0,0 +1,18 @@
Source: gdrcopy-tests
Priority: optional
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Build-Depends: debhelper (>= 9)
Standards-Version: @FULL_VERSION@
Section: utils
Homepage: https://github.com/NVIDIA/gdrcopy
#Vcs-Git: https://anonscm.debian.org/git/collab-maint/gdrcopy.git
#Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/gdrcopy.git
Package: gdrcopy-tests
Architecture: any
Multi-Arch: same
Depends: libgdrapi (>= @FULL_VERSION@), ${shlibs:Depends}, ${misc:Depends}
Replaces: gdrcopy (<= 2.2-1)
Conflicts: gdrcopy (<= 2.2-1)
Description: Test utilities for GDRCopy

View File

@ -0,0 +1,25 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: gdrcopy
Source: https://github.com/NVIDIA/gdrcopy
Files: *
Copyright: 2019-2021 NVIDIA CORPORATION. All rights reserved.
License: MIT
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,2 @@
README.Debian
README.source

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,30 @@
#!/usr/bin/make -f
# See debhelper(7) (uncomment to enable)
# output every command that modifies files on the build system.
#export DH_VERBOSE = 1
# see FEATURE AREAS in dpkg-buildflags(1)
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
# see ENVIRONMENT in dpkg-buildflags(1)
# package maintainers to append CFLAGS
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
# package maintainers to append LDFLAGS
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
%:
dh $@
# dh_make generated override targets
# This is an example for CMake (see https://bugs.debian.org/641051)
override_dh_auto_build:
dh_auto_build -- CUDA=$(CUDA) lib exes
override_dh_shlibdeps:
dh_shlibdeps -Xgdrcopy_apiperf -Xgdrcopy_copybw -Xgdrcopy_copylat -Xgdrcopy_sanity -Xgdrcopy_pplat
override_dh_auto_install:
$(MAKE) DESTDIR=$(CURDIR)/debian/gdrcopy-tests prefix=/usr exes_install
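As a rough illustration of how this rules file is usually driven (the CUDA path below is an assumption; dpkg-buildpackage passes the variable through to the dh_auto_build override above):
# Build only the binary packages, unsigned; CUDA points at a local CUDA toolkit install
CUDA=/usr/local/cuda dpkg-buildpackage -b -us -uc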

View File

@ -0,0 +1 @@
3.0 (quilt)

View File

@ -0,0 +1,6 @@
PACKAGE_NAME="gdrdrv"
PACKAGE_VERSION="@FULL_VERSION@"
BUILT_MODULE_NAME[0]="gdrdrv"
DEST_MODULE_LOCATION[0]="@MODULE_LOCATION@"
AUTOINSTALL="yes"
MAKE[0]="cd $dkms_tree/gdrdrv/@FULL_VERSION@/build && make CONF_SCRIPT_DIR=scripts KVER=$kernelver"
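For illustration, the DKMS lifecycle this configuration feeds into looks roughly like the following; the 2.4.4 version string is taken from the changelogs in this commit, and root privileges are assumed:
# Register, build, and install the module for the running kernel, then check its state
dkms add -m gdrdrv -v 2.4.4
dkms build -m gdrdrv -v 2.4.4 -k "$(uname -r)"
dkms install -m gdrdrv -v 2.4.4 -k "$(uname -r)"
dkms status -m gdrdrv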

View File

@ -0,0 +1,33 @@
#!/usr/bin/make -f
SRC = $(DESTDIR)/usr/src
SHARE = $(DESTDIR)/usr/share/$(NAME)-dkms
all:
clean:
install:
#source tree
ifeq ("$(wildcard $(NAME)-$(VERSION))", "$(NAME)-$(VERSION)")
install -d "$(SRC)"
cp -a $(NAME)-$(VERSION) $(SRC)
# sets 0755 for dirs, 0644 for files
chmod a-wx+rX,u+w -R "$(SRC)/$(NAME)-$(VERSION)"
# set u+x for all files under the scripts folder
chmod u+x -R "$(SRC)/$(NAME)-$(VERSION)/scripts"
endif
#tarball, possibly with binaries
ifeq ("$(wildcard $(NAME)-$(VERSION).dkms.tar.gz)", "$(NAME)-$(VERSION).dkms.tar.gz")
install -d "$(SHARE)"
install -m 644 $(NAME)-$(VERSION).dkms.tar.gz "$(SHARE)"
endif
#postinst, only if we are supporting legacy mode
ifeq ("$(wildcard common.postinst)", "common.postinst")
install -d "$(SHARE)"
install -m 755 $(PREFIX)/usr/lib/dkms/common.postinst $(SHARE)/postinst
endif

View File

@ -0,0 +1,293 @@
#!/bin/sh
# Copyright (C) 2002-2005 Flavio Stanchina
# Copyright (C) 2005-2006 Aric Cyr
# Copyright (C) 2007 Mario Limonciello
# Copyright (C) 2009 Alberto Milone
set -e
. /usr/share/debconf/confmodule
uname_s=$(uname -s)
_get_kernel_dir() {
KVER=$1
case ${uname_s} in
Linux) DIR="/lib/modules/$KVER/build" ;;
GNU/kFreeBSD) DIR="/usr/src/kfreebsd-headers-$KVER/sys" ;;
esac
echo $DIR
}
_check_kernel_dir() {
DIR=$(_get_kernel_dir $1)
case ${uname_s} in
Linux) test -e $DIR/include ;;
GNU/kFreeBSD) test -e $DIR/kern && test -e $DIR/conf/kmod.mk ;;
*) return 1 ;;
esac
return $?
}
# Check the existence of a kernel named as $1
_is_kernel_name_correct() {
CORRECT="no"
KERNEL_NAME=$1
for kernel in /boot/config-*; do
KERNEL=${kernel#*-}
if [ "${KERNEL}" = "${KERNEL_NAME}" ]; then
CORRECT="yes"
break
fi
done
echo $CORRECT
}
# Get the most recent kernel on Debian based systems. This keeps
# into account both the version and the ABI. If the current kernel
# is the most recent kernel then the function will print a null string.
_get_newest_kernel_debian() {
NEWEST_KERNEL=
NEWEST_VERSION=
NEWEST_ABI=
for kernel in /boot/config-*; do
[ -f "$kernel" ] || continue
KERNEL=${kernel#*-}
KERNEL_VERSION=${KERNEL%%-*}
ABI=${KERNEL#*-}
ABI=${ABI%%-*}
if [ -z "$NEWEST_KERNEL" ]; then
# The 1st time get a version which is bigger than $1
COMPARE_TO=$1
else
# Get the biggest version
COMPARE_TO="$NEWEST_VERSION-$NEWEST_ABI"
fi
# if $kernel is greater than $COMPARE_TO
if [ `dpkg --compare-versions "$KERNEL_VERSION-$ABI" gt "$COMPARE_TO" && echo "yes" || \
echo "no"` = "yes" ]; then
NEWEST_KERNEL=$KERNEL
NEWEST_VERSION=$KERNEL_VERSION
NEWEST_ABI=$ABI
fi
done
echo "$NEWEST_KERNEL"
}
# Get the most recent kernel in Rhel based systems. If the current kernel
# is the most recent kernel then the function will print a null string.
_get_newest_kernel_rhel() {
NEWEST_KERNEL=
LAST_INSTALLED_KERNEL=$(rpm -q --whatprovides kernel --last | grep kernel -m1 | cut -f1 -d' ')
LIK_FORMATTED_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{VERSION}-%{RELEASE}.%{ARCH}\n")
if echo "$LIK_FORMATTED_NAME" | grep -q 2.6; then
# Fedora and Suse
NEWEST_KERNEL=$LIK_FORMATTED_NAME
else
# Hack for Mandriva where $LIK_FORMATTED_NAME is broken
LIK_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{NAME}\n")
LIK_TYPE=${LIK_NAME#kernel-}
LIK_TYPE=${LIK_TYPE%%-*}
LIK_STRIPPED=${LIK_NAME#kernel-}
LIK_STRIPPED=${LIK_STRIPPED#$LIK_TYPE-}
LIK_STRIPPED_BASE=${LIK_STRIPPED%%-*}
LIK_STRIPPED_END=${LIK_STRIPPED#$LIK_STRIPPED_BASE-}
LIK_FINAL=$LIK_STRIPPED_BASE-$LIK_TYPE-$LIK_STRIPPED_END
NEWEST_KERNEL=$LIK_FINAL
fi
echo $NEWEST_KERNEL
}
# Get the newest kernel on Debian and Rhel based systems.
get_newest_kernel() {
NEWEST_KERNEL=
# Try Debian first as rpm can be installed in Debian based distros
if [ -e /usr/bin/dpkg ]; then
# If DEB based
CURRENT_VERSION=${CURRENT_KERNEL%%-*}
CURRENT_ABI=${CURRENT_KERNEL#*-}
CURRENT_FLAVOUR=${CURRENT_ABI#*-}
CURRENT_ABI=${CURRENT_ABI%%-*}
NEWEST_KERNEL=$(_get_newest_kernel_debian "$CURRENT_VERSION-$CURRENT_ABI")
elif which rpm >/dev/null 2>&1; then
# If RPM based
NEWEST_KERNEL=$(_get_newest_kernel_rhel)
fi
# Make sure that kernel name that we extracted corresponds to an installed
# kernel
if [ -n "$NEWEST_KERNEL" ] && [ `_is_kernel_name_correct $NEWEST_KERNEL` = "no" ]; then
NEWEST_KERNEL=
fi
echo $NEWEST_KERNEL
}
NAME=$1
VERSION=$2
TARBALL_ROOT=$3
ARCH=$4
UPGRADE=$5
if [ -z "$NAME" ] || [ -z "$VERSION" ]; then
echo "Need NAME, and VERSION defined"
echo "ARCH is optional"
exit 1
fi
# read framework configuration options
if [ -r /etc/dkms/framework.conf ]; then
. /etc/dkms/framework.conf
fi
KERNELS=$(ls /lib/modules/ 2>/dev/null || true)
CURRENT_KERNEL=$(uname -r)
#We never want to keep an older version side by side to prevent conflicts
if [ -e "/var/lib/dkms/$NAME/$VERSION" ]; then
echo "Removing old $NAME-$VERSION DKMS files..."
dkms remove -m $NAME -v $VERSION --all
fi
#Load new files, by source package and by tarball
if [ -f "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz" ]; then
if ! dkms ldtarball --archive "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz"; then
echo ""
echo ""
echo "Unable to load DKMS tarball $TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz."
echo "Common causes include: "
echo " - You must be using DKMS 2.1.0.0 or later to support binaries only"
echo " distribution specific archives."
echo " - Corrupt distribution specific archive"
echo ""
echo ""
exit 2
fi
elif [ -d "/usr/src/$NAME-$VERSION" ]; then
echo "Loading new $NAME-$VERSION DKMS files..."
dkms add -m $NAME -v $VERSION > /dev/null
fi
# On 1st installation, let us look for a directory
# in /lib/modules which matches `uname -r`. If none
# is found it is possible that buildd is being used
# and that uname -r is giving us the name of the
# kernel used by the buildd machine.
#
# If this is the case we try to build the kernel
# module for each kernel which has a directory in
# /lib/modules. Furthermore we will have to tell
# DKMS which architecture it should build the module
# for (e.g. if the buildd machine is using a
# 2.6.24-23-xen 64bit kernel).
#
# NOTE: if the headers are not installed then the
# module won't be built, as usual
# Here we look for the most recent kernel so that we can
# build the module for it (in addition to doing it for the
# current kernel).
NEWEST_KERNEL=$(get_newest_kernel)
if [ -z "$autoinstall_all_kernels" ]; then
# If the current kernel is installed on the system or chroot
if [ `_is_kernel_name_correct $CURRENT_KERNEL` = "yes" ]; then
if [ -n "$NEWEST_KERNEL" ] && [ ${CURRENT_KERNEL} != ${NEWEST_KERNEL} ]; then
KERNELS="$CURRENT_KERNEL $NEWEST_KERNEL"
else
KERNELS=$CURRENT_KERNEL
fi
# The current kernel is not useful as it's not installed
else
echo "It is likely that $CURRENT_KERNEL belongs to a chroot's host"
# Let's use only the newest kernel if this is not a first installation
# otherwise build for all kernels
if [ -n "$NEWEST_KERNEL" -a -n "$UPGRADE" ]; then
KERNELS="$NEWEST_KERNEL"
fi
fi
fi
# Take care of displaying newline separated list
echo "Building for $KERNELS" | tr '\n' ',' \
| sed -e 's/,/, /g; s/, $/\n/; s/, \([^,]\+\)$/ and \1/'
if [ -n "$ARCH" ]; then
if which lsb_release >/dev/null && [ $(lsb_release -s -i) = "Ubuntu" ]; then
case $ARCH in
amd64)
ARCH="x86_64"
;;
lpia|i?86)
ARCH="i686"
;;
esac
fi
echo "Building for architecture $ARCH"
ARCH="-a $ARCH"
fi
for KERNEL in $KERNELS; do
dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH`
if [ `echo $KERNEL | grep -c "BOOT"` -gt 0 ]; then
echo ""
echo "Module build and install for $KERNEL was skipped as "
echo "it is a BOOT variant"
continue
fi
#if the module isn't yet built, try to build it
if [ `echo $dkms_status | grep -c ": built"` -eq 0 ]; then
if [ ! -L /var/lib/dkms/$NAME/$VERSION/source ]; then
echo "This package appears to be a binaries-only package"
echo " you will not be able to build against kernel $KERNEL"
echo " since the package source was not provided"
continue
fi
if _check_kernel_dir $KERNEL; then
echo "Building initial module for $KERNEL"
set +e
dkms build -m $NAME -v $VERSION -k $KERNEL $ARCH > /dev/null
case $? in
9)
set -e
echo "Skipped."
continue
;;
0)
set -e
echo "Done."
;;
*)
exit $?
;;
esac
dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH`
else
echo "Module build for kernel $KERNEL was skipped since the"
echo "kernel headers for this kernel does not seem to be installed."
fi
fi
#if the module is built (either pre-built or just now), install it
if [ `echo $dkms_status | grep -c ": built"` -eq 1 ] &&
[ `echo $dkms_status | grep -c ": installed"` -eq 0 ]; then
dkms install -m $NAME -v $VERSION -k $KERNEL $ARCH
fi
done
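For reference, the gdrdrv-dkms postinst later in this commit calls this helper with positional arguments along these lines (the version and architecture values are illustrative):
# NAME VERSION TARBALL_ROOT [ARCH [UPGRADE]]
/usr/lib/dkms/common.postinst gdrdrv 2.4.4 /usr/share/gdrdrv-dkms amd64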

View File

@ -0,0 +1,5 @@
gdrdrv DKMS module for Debian
This package was automatically generated by the DKMS system,
for distribution on Debian based operating systems.

View File

@ -0,0 +1,72 @@
gdrdrv-dkms (2.4.4) stable; urgency=low
* Fix the use-after-free bug of mr objects in gdrdrv_vma_close.
* Fix the resource leakage bug in gdrdrv_release.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
gdrdrv-dkms (2.4.3) stable; urgency=low
* Fix NVIDIA_IS_OPENSOURCE detection when compiling with NVIDIA driver version 545 or newer.
* Fix a compile error in gdrdrv when compiling on RHEL 9.5.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
gdrdrv-dkms (2.4.2) stable; urgency=low
* Fix the size alignment bug in gdrdrv.
* Add support for another flavor of BF3.
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
gdrdrv-dkms (2.4.1) stable; urgency=low
* Add support for persistent mapping.
* Fix bug in src/gdrdrv/Makefile.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
gdrdrv-dkms (2.4) stable; urgency=low
* Add support for NVIDIA BLUEFIELD-3.
* Add support for Linux kernel >= 6.3.
* Relicense gdrdrv to Dual MIT/GPL.
* Fix bugs in gdrdrv when pinning two small buffers back-to-back.
* Add support for coherent platforms such as Grace-Hopper.
* Add support for Confidential Computing (CC).
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
gdrdrv-dkms (2.3.1) stable; urgency=low
* Add a workaround for the GPL-compatibility issue when compiling with CONFIG_ARCH_HAS_CC_PLATFORM on Linux kernel 5.18+.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
gdrdrv-dkms (2.3) stable; urgency=low
* Change the package maintainer to GPUDirect Team.
* Add Davide Rossetti and Pak Markthub as Uploaders.
* Revamp gdrdrv to fix race-condition bugs.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700
gdrdrv-dkms (2.2) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 01 Feb 2021 11:59:59 -0700
gdrdrv-dkms (2.1) stable; urgency=low
* Change the package maintainer to Davide Rossetti.
-- Davide Rossetti <drossetti@nvidia.com> Mon, 02 Mar 2020 11:59:59 -0700
gdrdrv-dkms (2.0) stable; urgency=low
* Harden security in gdrdrv.
* Enable cached mappings in POWER9.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Sep 2019 11:59:59 -0700

View File

@ -0,0 +1 @@
9

View File

@ -0,0 +1,13 @@
Source: gdrdrv-dkms
Section: misc
Priority: optional
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Build-Depends: debhelper (>= 9), dkms
Standards-Version: @FULL_VERSION@
Package: gdrdrv-dkms
Architecture: any
Multi-Arch: same
Depends: dkms (>= 1.95), ${misc:Depends}
Description: gdrdrv driver in DKMS format.

View File

@ -0,0 +1,2 @@
This copyright has not been completed by the author of this package.

View File

@ -0,0 +1 @@
usr/src

View File

@ -0,0 +1,147 @@
#!/bin/bash
#
# Startup/shutdown script for GDRcopy driver
# chkconfig: 2345 20 80
# description: Startup/shutdown script for GDRcopy kernel-mode driver
### BEGIN INIT INFO
# Provides: gdrcopy
# Required-Start:
# Required-Stop:
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Description: GDRcopy kernel-mode driver init script
### END INIT INFO
# Source function library.
. /lib/lsb/init-functions
DRIVER=gdrdrv
RETVAL=0
is_module()
{
local RC
/sbin/lsmod | grep -w "$1" > /dev/null 2>&1
RC=$?
return $RC
}
log_msg()
{
logger -i "$modname: $@"
}
function req_modules_loaded() {
local RC=0
local reqmods="nvidia"
for mod in $reqmods; do
if ! is_module $mod; then
echo "module $mod is not loaded"
RC=1
break
fi
done
return $RC
}
# Create /dev nodes for device
function createnodes() {
local module=$1
local RC
local inode=/dev/$module
major=`fgrep $module /proc/devices | cut -b 1-4`
log_msg "$module: driver major is $major"
[ -e $inode ] && rm -f $inode
mknod -m 666 $inode c $major 0
RC=$?
return $RC
}
# Remove /dev nodes for device
function removenodes() {
rm -f /dev/gdrdrv*
}
load_module()
{
local RC
local module=$1
filename=`modinfo $module | grep filename | awk '{print $NF}'`
if [ ! -n "$filename" ]; then
echo "Module $module does not exist"
log_msg "Error: Module $module does not exist"
return 1
fi
echo -n $"Loading $DRIVER kernel module: "
/sbin/modprobe $module && log_success_msg || log_failure_msg
RC=$?
return $RC
}
# Start daemon
function start() {
echo -n $"Checking required modules: "
req_modules_loaded && log_success_msg || log_failure_msg
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
if is_module $DRIVER ; then
echo "module already loaded"
else
load_module $DRIVER
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
fi
echo -n $"Initializing GDRcopy /dev entries: "
createnodes $DRIVER && log_success_msg || log_failure_msg
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
}
# Stop daemon
function stop() {
echo -n $"Unloading $DRIVER kernel module: "
/sbin/rmmod $DRIVER && log_success_msg || log_failure_msg
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
echo -n $"Removing GDRcopy /dev entries: "
removenodes $DRIVER && log_success_msg || log_failure_msg
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
}
# See how we were called
case "$1" in
start)
start
;;
stop)
stop
;;
restart)
stop
start
;;
*)
echo $"Usage: $0 {start|stop|restart}"
RETVAL=1
esac
exit $RETVAL
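When debugging outside an init system, the script can be run directly; a minimal sketch, assuming it is installed at /usr/libexec/gdrcopy/gdrcopy as referenced by the systemd unit later in this commit:
/usr/libexec/gdrcopy/gdrcopy start     # loads gdrdrv and creates /dev/gdrdrv
/usr/libexec/gdrcopy/gdrcopy restart
/usr/libexec/gdrcopy/gdrcopy stop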

View File

@ -0,0 +1,49 @@
#!/bin/sh
# Copyright (C) 2002-2005 Flavio Stanchina
# Copyright (C) 2005-2006 Aric Cyr
# Copyright (C) 2007 Mario Limonciello
# Copyright (C) 2009 Alberto Milone
set -e
NAME=gdrdrv
PACKAGE_NAME=$NAME-dkms
DEB_NAME=$(echo $PACKAGE_NAME | sed 's,_,-,')
CVERSION=`dpkg-query -W -f='${Version}' $DEB_NAME | awk -F "-" '{print $1}' | cut -d\: -f2`
ARCH=`dpkg --print-architecture`
dkms_configure () {
for POSTINST in /usr/lib/dkms/common.postinst "/usr/share/$PACKAGE_NAME/postinst"; do
if [ -f "$POSTINST" ]; then
"$POSTINST" "$NAME" "$CVERSION" "/usr/share/$PACKAGE_NAME" "$ARCH" "$2"
return $?
fi
echo "WARNING: $POSTINST does not exist." >&2
done
echo "ERROR: DKMS version is too old and $PACKAGE_NAME was not" >&2
echo "built with legacy DKMS support." >&2
echo "You must either rebuild $PACKAGE_NAME with legacy postinst" >&2
echo "support or upgrade DKMS to a more current version." >&2
return 1
}
case "$1" in
configure)
dkms_configure
;;
abort-upgrade|abort-remove|abort-deconfigure)
;;
*)
echo "postinst called with unknown argument \`$1'" >&2
exit 1
;;
esac
# dh_installdeb will replace this with shell code automatically
# generated by other debhelper scripts.
#DEBHELPER#
exit 0

View File

@ -0,0 +1,28 @@
#!/bin/sh
NAME=gdrdrv
VERSION=@VERSION@
set -e
case "$1" in
remove|upgrade|deconfigure)
if [ "`dkms status -m $NAME`" ]; then
dkms remove -m $NAME -v $VERSION --all
fi
;;
failed-upgrade)
;;
*)
echo "prerm called with unknown argument \`$1'" >&2
exit 1
;;
esac
#DEBHELPER#
exit 0

View File

@ -0,0 +1,55 @@
#!/usr/bin/make -f
# -*- makefile -*-
# Uncomment this to turn on verbose mode.
#export DH_VERBOSE=1
DEB_NAME=gdrdrv
NAME=gdrdrv
VERSION=@VERSION@
configure: configure-stamp
configure-stamp:
dh_testdir
touch configure-stamp
build: build-stamp
build-stamp: configure-stamp
dh_testdir
$(MAKE)
touch $@
clean:
dh_testdir
dh_testroot
rm -f build-stamp configure-stamp
-$(MAKE) clean
dh_clean
install: build
dh_testdir
dh_testroot
dh_prep
dh_installdirs
$(MAKE) DESTDIR=$(CURDIR)/debian/$(DEB_NAME)-dkms NAME=$(NAME) VERSION=$(VERSION) install
dh_installinit --name $(DEB_NAME)
binary-arch: build install
binary-indep: build install
dh_testdir
dh_testroot
dh_link
dh_strip
dh_compress
dh_fixperms
dh_installdeb
dh_shlibdeps
dh_gencontrol
dh_md5sums
dh_builddeb
binary: binary-indep binary-arch
.PHONY: build clean binary-indep binary-arch binary install configure

View File

@ -0,0 +1 @@
3.0 (quilt)

View File

@ -0,0 +1,20 @@
### Commented entries have reasonable defaults.
### Uncomment to edit them.
# Source: <source package name; defaults to package name>
Section: misc
Priority: optional
Homepage: https://github.com/NVIDIA/gdrcopy
Standards-Version: @FULL_VERSION@
Package: gdrcopy
Version: @FULL_VERSION@
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Depends: gdrdrv-dkms (= @FULL_VERSION@), libgdrapi (= @FULL_VERSION@), gdrcopy-tests (= @FULL_VERSION@)
Architecture: any
Multi-Arch: same
Copyright: MIT
Changelog: changelog
Readme: README.md
Description: GDRCopy meta-package
Meta-package for GDRCopy, a low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA technology.

View File

@ -0,0 +1,14 @@
[Unit]
Description=GDRCopy service
After=multi-user.target
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/bash /usr/libexec/gdrcopy/gdrcopy start
ExecReload=/bin/bash /usr/libexec/gdrcopy/gdrcopy restart
ExecStop=/bin/bash /usr/libexec/gdrcopy/gdrcopy stop
[Install]
WantedBy=multi-user.target
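Assuming the unit file is installed as gdrcopy.service (the name is an assumption), it is managed in the usual systemd way:
systemctl enable --now gdrcopy.service    # start now and at every boot
systemctl status gdrcopy.service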

View File

@ -0,0 +1,81 @@
#!/bin/sh
show_help()
{
echo "Usage: ${0} [-hk]"
echo
echo " -h Show this help text."
echo " -k <kver> Specify the kernel version."
echo
}
set_kver=0
kver=""
OPTIND=1 # Reset in case getopts has been used previously in the shell.
while getopts "hk:" opt ; do
case "${opt}" in
h)
show_help
exit 0
;;
k)
set_kver=1
kver="${OPTARG}"
;;
?)
show_help
exit 0
;;
esac
done
if [ ${set_kver} -eq 0 ]; then
kver="$(uname -r)"
fi
kdir="/lib/modules/${kver}/build"
tmpfolder=$(mktemp --tmpdir -d gdrcopy.XXXXXXXXX)
testfile="${tmpfolder}/test-dummy.c"
makefile="${tmpfolder}/Makefile"
cat >${testfile} <<EOF
#include <linux/module.h>
#include <linux/mm.h>
static int __init test_dummy_init(void)
{
struct vm_area_struct vma;
vm_flags_set(&vma, 0);
return 0;
}
static void __exit test_dummy_fini(void)
{
}
MODULE_AUTHOR("gpudirect@nvidia.com");
MODULE_LICENSE("MIT");
MODULE_VERSION("1.0");
module_init(test_dummy_init);
module_exit(test_dummy_fini);
EOF
cat >${makefile} <<EOF
obj-m := test-dummy.o
EOF
cd ${tmpfolder}
make -C ${kdir} M=${tmpfolder} modules > /dev/null 2>&1
ret=$?
rm -rf ${tmpfolder}
if [ "${ret}" -eq 0 ]; then
echo "y"
else
echo "n"
fi
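The gdrdrv Makefile later in this commit invokes this probe as test_gdrdrv_HAVE_VM_FLAGS_SET.sh; run by hand it simply prints y or n, for example:
# Probe whether the given kernel's headers provide vm_flags_set()
./test_gdrdrv_HAVE_VM_FLAGS_SET.sh -k "$(uname -r)"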

79
gdrcopy/src/Makefile Normal file
View File

@ -0,0 +1,79 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
LIB_MAJOR_VER ?= $(shell awk '/\#define GDR_API_MAJOR_VERSION/ { print $$3 }' ../include/gdrapi.h | tr -d '\n')
LIB_MINOR_VER ?= $(shell awk '/\#define GDR_API_MINOR_VERSION/ { print $$3 }' ../include/gdrapi.h | tr -d '\n')
GDRAPI_ARCH ?= $(shell ../config_arch)
GDRAPI_INC := ../include
CPPFLAGS := -I $(GDRAPI_INC) -I gdrdrv/ -D GDRAPI_ARCH=$(GDRAPI_ARCH)
LDFLAGS :=
COMMONCFLAGS := -O2
CFLAGS += $(COMMONCFLAGS)
CXXFLAGS += $(COMMONCFLAGS)
LIBS := -lpthread -ldl
LIB_VER:=$(LIB_MAJOR_VER).$(LIB_MINOR_VER)
LIB_BASENAME:=libgdrapi.so
LIB_DYNAMIC=$(LIB_BASENAME).$(LIB_VER)
LIB_SONAME=$(LIB_BASENAME).$(LIB_MAJOR_VER)
LIB:=$(LIB_DYNAMIC)
LIBSRCS := gdrapi.c
ifeq ($(GDRAPI_ARCH),X86)
LIBSRCS += memcpy_avx.c memcpy_sse.c memcpy_sse41.c
endif
LIBOBJS := $(LIBSRCS:.c=.o)
all: config lib
config:
@ echo "GDRAPI_ARCH=$(GDRAPI_ARCH)"
lib: $(LIB)
#static
#$(LIB): $(LIB)($(LIBOBJS))
#dynamic
$(LIBOBJS): CFLAGS+=-fPIC
$(LIB): $(LIBOBJS)
$(CC) -shared -Wl,-soname,$(LIB_SONAME) -o $@ $^
PATH=/sbin:/usr/sbin:$$PATH; ldconfig -n $(PWD)
ln -sf $(LIB_DYNAMIC) $(LIB_SONAME)
ln -sf $(LIB_SONAME) $(LIB_BASENAME)
# special-cased to finely tune the arch option
memcpy_avx.o: memcpy_avx.c
$(COMPILE.c) -mavx -o $@ $^
memcpy_sse.o: memcpy_sse.c
$(COMPILE.c) -msse -o $@ $^
memcpy_sse41.o: memcpy_sse41.c
$(COMPILE.c) -msse4.1 -o $@ $^
gdrapi.o: gdrapi.c $(GDRAPI_INC)/gdrapi.h gdrapi_internal.h gdrdrv/gdrdrv.h
clean:
rm -f *.o $(EXES) lib*.so* *~ core.*
.PHONY: clean all lib
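A minimal sketch of building just the user-space library from gdrcopy/src, leaving GDRAPI_ARCH to the config_arch autodetection:
make          # prints GDRAPI_ARCH, then builds libgdrapi.so.<major>.<minor> and its symlinks
make clean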

877
gdrcopy/src/gdrapi.c Normal file
View File

@ -0,0 +1,877 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdarg.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <netdb.h>
#include <malloc.h>
#include <getopt.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <time.h>
#include <asm/types.h>
#include <assert.h>
#include <sys/queue.h>
#include "gdrconfig.h"
#include "gdrapi.h"
#include "gdrdrv.h"
#include "gdrapi_internal.h"
// logging/tracing
enum gdrcopy_msg_level {
GDRCOPY_MSG_DEBUG = 1,
GDRCOPY_MSG_INFO,
GDRCOPY_MSG_WARN,
GDRCOPY_MSG_ERROR
};
static int gdr_msg_level = GDRCOPY_MSG_ERROR;
static int gdr_enable_logging = -1;
static void gdr_msg(enum gdrcopy_msg_level lvl, const char* fmt, ...)
{
if (-1 == gdr_enable_logging) {
const char *env = getenv("GDRCOPY_ENABLE_LOGGING");
if (env)
gdr_enable_logging = 1;
else
gdr_enable_logging = 0;
env = getenv("GDRCOPY_LOG_LEVEL");
if (env)
gdr_msg_level = atoi(env);
}
if (gdr_enable_logging) {
if (lvl >= gdr_msg_level) {
va_list ap;
va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
va_end(ap);
}
}
}
#define gdr_dbg(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_DEBUG, "DBG: " FMT, ## ARGS)
#define gdr_dbgc(C, FMT, ARGS...) do { static int gdr_dbg_cnt=(C); if (gdr_dbg_cnt) { gdr_dbg(FMT, ## ARGS); --gdr_dbg_cnt; }} while (0)
#define gdr_info(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_INFO, "INFO: " FMT, ## ARGS)
#define gdr_warn(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_WARN, "WARN: " FMT, ## ARGS)
#define gdr_err(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_ERROR, "ERR: " FMT, ## ARGS)
static gdr_memh_t *to_memh(gdr_mh_t mh) {
return (gdr_memh_t *)mh.h;
}
static gdr_mh_t from_memh(gdr_memh_t *memh) {
gdr_mh_t mh;
mh.h = (unsigned long)memh;
return mh;
}
static void gdr_init_cpu_flags(void);
static inline int gdr_is_mapped(const gdr_mapping_type_t mapping_type)
{
return mapping_type != GDR_MAPPING_TYPE_NONE;
}
gdr_t gdr_open(void)
{
gdr_t g = NULL;
const char *gdrinode = "/dev/gdrdrv";
int ret;
g = calloc(1, sizeof(*g));
if (!g) {
gdr_err("error while allocating memory\n");
return NULL;
}
int fd = open(gdrinode, O_RDWR | O_CLOEXEC);
if (-1 == fd ) {
ret = errno;
gdr_err("error opening driver (errno=%d/%s)\n", ret, strerror(ret));
goto err_mem;
}
struct GDRDRV_IOC_GET_VERSION_PARAMS params;
int retcode = ioctl(fd, GDRDRV_IOC_GET_VERSION, &params);
if (0 != retcode) {
ret = errno;
gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret);
goto err_fd;
}
if (params.gdrdrv_version < MINIMUM_GDRDRV_VERSION) {
gdr_err(
"The minimum required gdrdrv driver version is %d.%d but the current gdrdrv version is %d.%d\n",
MINIMUM_GDRDRV_MAJOR_VERSION,
MINIMUM_GDRDRV_MINOR_VERSION,
params.gdrdrv_version >> MAJOR_VERSION_SHIFT,
params.gdrdrv_version & MINOR_VERSION_MASK
);
goto err_fd;
}
if (params.minimum_gdr_api_version > GDR_API_VERSION) {
gdr_err(
"gdrdrv driver requires libgdrapi version %d.%d or above but the current libgdrapi version is %d.%d\n",
params.minimum_gdr_api_version >> MAJOR_VERSION_SHIFT,
params.minimum_gdr_api_version & MINOR_VERSION_MASK,
GDR_API_MAJOR_VERSION,
GDR_API_MINOR_VERSION
);
goto err_fd;
}
g->fd = fd;
LIST_INIT(&g->memhs);
gdr_init_cpu_flags();
// Initialize page_shift, page_size, and page_mask.
g->page_size = sysconf(_SC_PAGESIZE);
g->page_mask = ~(g->page_size - 1);
size_t ps_tmp = g->page_size;
g->page_shift = -1;
while (ps_tmp > 0) {
++g->page_shift;
if ((ps_tmp & 0x1) == 1)
break;
ps_tmp >>= 1;
}
g->gdrdrv_version = params.gdrdrv_version;
return g;
err_fd:
close(fd);
err_mem:
free(g);
return NULL;
}
int gdr_close(gdr_t g)
{
int ret = 0;
int retcode;
gdr_memh_t *mh, *next_mh;
mh = g->memhs.lh_first;
while (mh != NULL) {
// gdr_unpin_buffer frees mh, so we need to get the next one
// beforehand.
next_mh = mh->entries.le_next;
ret = gdr_unpin_buffer(g, from_memh(mh));
if (ret) {
gdr_err("error unpinning buffer inside gdr_close (errno=%d/%s)\n", ret, strerror(ret));
return ret;
}
mh = next_mh;
}
retcode = close(g->fd);
if (-1 == retcode) {
ret = errno;
gdr_err("error closing driver (errno=%d/%s)\n", ret, strerror(ret));
}
g->fd = 0;
free(g);
return ret;
}
int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle)
{
int ret = 0;
int retcode;
if (!handle) {
return EINVAL;
}
gdr_memh_t *mh = calloc(1, sizeof(gdr_memh_t));
if (!mh) {
return ENOMEM;
}
struct GDRDRV_IOC_PIN_BUFFER_PARAMS params;
params.addr = addr;
params.size = size;
params.p2p_token = p2p_token;
params.va_space = va_space;
params.handle = 0;
retcode = ioctl(g->fd, GDRDRV_IOC_PIN_BUFFER, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
free(mh);
goto err;
}
mh->handle = params.handle;
LIST_INSERT_HEAD(&g->memhs, mh, entries);
*handle = from_memh(mh);
err:
return ret;
}
int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_UNPIN_BUFFER, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
}
LIST_REMOVE(mh, entries);
free(mh);
return ret;
}
int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
struct GDRDRV_IOC_GET_CB_FLAG_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_CB_FLAG, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
} else {
*flag = params.flag;
}
return ret;
}
int gdr_get_info_v2(gdr_t g, gdr_mh_t handle, gdr_info_v2_t *info)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
if (g->gdrdrv_version >= GDRDRV_MINIMUM_VERSION_WITH_GET_INFO_V2) {
struct GDRDRV_IOC_GET_INFO_V2_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO_V2, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
goto out;
} else {
info->va = params.va;
info->mapped_size = params.mapped_size;
info->page_size = params.page_size;
info->tm_cycles = params.tm_cycles;
info->cycles_per_ms = params.tsc_khz;
info->mapped = gdr_is_mapped(params.mapping_type);
info->wc_mapping = (params.mapping_type == GDR_MAPPING_TYPE_WC);
info->mapping_type = params.mapping_type;
}
}
else
{
struct GDRDRV_IOC_GET_INFO_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
goto out;
} else {
info->va = params.va;
info->mapped_size = params.mapped_size;
info->page_size = params.page_size;
info->tm_cycles = params.tm_cycles;
info->cycles_per_ms = params.tsc_khz;
info->mapped = params.mapped;
info->wc_mapping = params.wc_mapping;
info->mapping_type = params.mapped ? (params.wc_mapping ? GDR_MAPPING_TYPE_WC : GDR_MAPPING_TYPE_CACHING) : GDR_MAPPING_TYPE_NONE;
}
}
out:
return ret;
}
int gdr_map(gdr_t g, gdr_mh_t handle, void **ptr_va, size_t size)
{
int ret = 0;
gdr_info_v2_t info = {0,};
gdr_memh_t *mh = to_memh(handle);
if (gdr_is_mapped(mh->mapping_type)) {
gdr_err("mh is mapped already\n");
return EAGAIN;
}
size_t rounded_size = (size + g->page_size - 1) & g->page_mask;
off_t magic_off = (off_t)mh->handle << g->page_shift;
void *mmio = mmap(NULL, rounded_size, PROT_READ|PROT_WRITE, MAP_SHARED, g->fd, magic_off);
if (mmio == MAP_FAILED) {
int __errno = errno;
mmio = NULL;
gdr_err("error %s(%d) while mapping handle %x, rounded_size=%zu offset=%llx\n",
strerror(__errno), __errno, mh->handle, rounded_size, (long long unsigned)magic_off);
ret = __errno;
goto err;
}
*ptr_va = mmio;
ret = gdr_get_info_v2(g, handle, &info);
if (ret) {
gdr_err("error %d from get_info, munmapping before exiting\n", ret);
munmap(mmio, rounded_size);
goto err;
}
if (!gdr_is_mapped(info.mapping_type)) {
// Race could cause this issue.
// E.g., gdr_map and cuMemFree are triggered concurrently.
// The above mmap is successful but cuMemFree causes unmapping immediately.
gdr_err("mh is not mapped\n");
ret = EAGAIN;
}
mh->mapping_type = info.mapping_type;
gdr_dbg("mapping_type=%d\n", mh->mapping_type);
err:
return ret;
}
int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size)
{
int ret = 0;
int retcode = 0;
size_t rounded_size;
gdr_memh_t *mh = to_memh(handle);
rounded_size = (size + g->page_size - 1) & g->page_mask;
if (!gdr_is_mapped(mh->mapping_type)) {
gdr_err("mh is not mapped yet\n");
return EINVAL;
}
retcode = munmap(va, rounded_size);
if (-1 == retcode) {
int __errno = errno;
gdr_err("error %s(%d) while unmapping handle %x, rounded_size=%zu\n",
strerror(__errno), __errno, mh->handle, rounded_size);
ret = __errno;
goto err;
}
mh->mapping_type = GDR_MAPPING_TYPE_NONE;
err:
return ret;
}
#ifdef GDRAPI_X86
#include <cpuid.h>
// prepare for AVX2 implementation
#ifndef bit_AVX2
/* Extended Features (%eax == 7) */
/* %ebx */
#define bit_AVX2 (1 << 5)
#endif
#include <immintrin.h>
extern int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes);
static inline void wc_store_fence(void) { _mm_sfence(); }
#define PREFERS_STORE_UNROLL4 0
#define PREFERS_STORE_UNROLL8 0
#define PREFERS_LOAD_UNROLL4 0
#define PREFERS_LOAD_UNROLL8 0
// GDRAPI_X86
#elif defined(GDRAPI_POWER)
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
static inline void wc_store_fence(void) { asm volatile("sync") ; }
#define PREFERS_STORE_UNROLL4 1
#define PREFERS_STORE_UNROLL8 0
#define PREFERS_LOAD_UNROLL4 0
#define PREFERS_LOAD_UNROLL8 1
// GDRAPI_POWER
#elif defined(GDRAPI_ARM64)
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
static inline void wc_store_fence(void) { asm volatile("DMB ishld") ; }
#define PREFERS_STORE_UNROLL4 0
#define PREFERS_STORE_UNROLL8 0
#define PREFERS_LOAD_UNROLL4 0
#define PREFERS_LOAD_UNROLL8 0
// GDRAPI_ARM64
#endif
static int has_sse = 0;
static int has_sse2 = 0;
static int has_sse4_1 = 0;
static int has_avx = 0;
static int has_avx2 = 0;
static void gdr_init_cpu_flags(void)
{
#ifdef GDRAPI_X86
unsigned int info_type = 0x00000001;
unsigned int ax, bx, cx, dx;
if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
has_sse4_1 = ((cx & bit_SSE4_1) != 0);
has_avx = ((cx & bit_AVX) != 0);
has_sse = ((dx & bit_SSE) != 0);
has_sse2 = ((dx & bit_SSE2) != 0);
gdr_dbg("sse4_1=%d avx=%d sse=%d sse2=%d\n", has_sse4_1, has_avx, has_sse, has_sse2);
}
#ifdef bit_AVX2
info_type = 0x7;
if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
has_avx2 = bx & bit_AVX2;
}
#endif // bit_AVX2
#endif // GDRAPI_X86
#ifdef GDRAPI_POWER
// detect and enable Altivec/SMX support
#endif
}
// note: more than one implementation may be compiled in
static void unroll8_memcpy(void *dst, const void *src, size_t size)
{
const uint64_t *r = (const uint64_t *)src;
uint64_t *w = (uint64_t *)dst;
size_t nw = size / sizeof(*r);
assert(size % sizeof(*r) == 0);
while (nw) {
if (0 == (nw & 3)) {
uint64_t r0 = r[0];
uint64_t r1 = r[1];
uint64_t r2 = r[2];
uint64_t r3 = r[3];
w[0] = r0;
w[1] = r1;
w[2] = r2;
w[3] = r3;
r += 4;
w += 4;
nw -= 4;
} else if (0 == (nw & 1)) {
uint64_t r0 = r[0];
uint64_t r1 = r[1];
w[0] = r0;
w[1] = r1;
r += 2;
w += 2;
nw -= 2;
} else {
w[0] = r[0];
++w;
++r;
--nw;
}
}
}
static void unroll4_memcpy(void *dst, const void *src, size_t size)
{
const uint32_t *r = (const uint32_t *)src;
uint32_t *w = (uint32_t *)dst;
size_t nw = size / sizeof(*r);
assert(size % sizeof(*r) == 0);
while (nw) {
if (0 == (nw & 3)) {
uint32_t r0 = r[0];
uint32_t r1 = r[1];
uint32_t r2 = r[2];
uint32_t r3 = r[3];
w[0] = r0;
w[1] = r1;
w[2] = r2;
w[3] = r3;
r += 4;
w += 4;
nw -= 4;
} else if (0 == (nw & 1)) {
uint32_t r0 = r[0];
uint32_t r1 = r[1];
w[0] = r0;
w[1] = r1;
r += 2;
w += 2;
nw -= 2;
} else {
w[0] = r[0];
++w;
++r;
--nw;
}
}
}
static inline int is_aligned(unsigned long value, unsigned powof2)
{
return ((value & (powof2-1)) == 0);
}
static inline int ptr_is_aligned(const void *ptr, unsigned powof2)
{
unsigned long addr = (unsigned long)ptr;
return is_aligned(addr, powof2);
}
static inline void memcpy_to_device_mapping(void *dst, const void *src, size_t size)
{
size_t remaining_size = size;
void *curr_map_d_ptr = dst;
const void *curr_h_ptr = src;
size_t copy_size = 0;
while (remaining_size > 0) {
if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) {
// We have proper alignment. memcpy can be used here. Although
// unlikely, this might break in the future if the implementation
// of memcpy changes to generate unaligned access. Still, we choose
// memcpy because it provides better performance than our simple
// aligned-access workaround.
memcpy(curr_map_d_ptr, curr_h_ptr, remaining_size);
copy_size = remaining_size;
}
else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) {
// memcpy cannot be used here because its internal
// implementation may end up in an unaligned access.
WRITE_ONCE(*(uint64_t *)curr_map_d_ptr, *(uint64_t *)curr_h_ptr);
copy_size = sizeof(uint64_t);
}
else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) {
WRITE_ONCE(*(uint32_t *)curr_map_d_ptr, *(uint32_t *)curr_h_ptr);
copy_size = sizeof(uint32_t);
}
else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) {
WRITE_ONCE(*(uint16_t *)curr_map_d_ptr, *(uint16_t *)curr_h_ptr);
copy_size = sizeof(uint16_t);
}
else {
WRITE_ONCE(*(uint8_t *)curr_map_d_ptr, *(uint8_t *)curr_h_ptr);
copy_size = sizeof(uint8_t);
}
remaining_size -= copy_size;
curr_map_d_ptr = (void *)((uintptr_t)curr_map_d_ptr + copy_size);
curr_h_ptr = (const void *)((uintptr_t)curr_h_ptr + copy_size);
}
}
static inline void memcpy_from_device_mapping(void *dst, const void *src, size_t size)
{
size_t remaining_size = size;
const void *curr_map_d_ptr = src;
void *curr_h_ptr = dst;
size_t copy_size = 0;
while (remaining_size > 0) {
if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) {
// We have proper alignment. memcpy can be used here. Although
// unlikely, this might break in the future if the implementation
// of memcpy changes to generate unaligned access. Still, we choose
// memcpy because it provides better performance than our simple
// aligned-access workaround.
memcpy(curr_h_ptr, curr_map_d_ptr, remaining_size);
copy_size = remaining_size;
}
else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) {
// memcpy cannot be used here because its internal
// implementation may end up in an unaligned access.
*(uint64_t *)curr_h_ptr = READ_ONCE(*(uint64_t *)curr_map_d_ptr);
copy_size = sizeof(uint64_t);
}
else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) {
*(uint32_t *)curr_h_ptr = READ_ONCE(*(uint32_t *)curr_map_d_ptr);
copy_size = sizeof(uint32_t);
}
else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) {
*(uint16_t *)curr_h_ptr = READ_ONCE(*(uint16_t *)curr_map_d_ptr);
copy_size = sizeof(uint16_t);
}
else {
*(uint8_t *)curr_h_ptr = READ_ONCE(*(uint8_t *)curr_map_d_ptr);
copy_size = sizeof(uint8_t);
}
remaining_size -= copy_size;
curr_map_d_ptr = (const void *)((uintptr_t)curr_map_d_ptr + copy_size);
curr_h_ptr = (void *)((uintptr_t)curr_h_ptr + copy_size);
}
}
static int gdr_copy_to_mapping_internal(void *map_d_ptr, const void *h_ptr, size_t size, gdr_mapping_type_t mapping_type)
{
const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC);
const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE);
do {
// For very small sizes and aligned pointers, we use simple store.
if (size == sizeof(uint8_t)) {
WRITE_ONCE(*(uint8_t *)map_d_ptr, *(uint8_t *)h_ptr);
goto do_fence;
} else if (size == sizeof(uint16_t) && ptr_is_aligned(map_d_ptr, sizeof(uint16_t))) {
WRITE_ONCE(*(uint16_t *)map_d_ptr, *(uint16_t *)h_ptr);
goto do_fence;
} else if (size == sizeof(uint32_t) && ptr_is_aligned(map_d_ptr, sizeof(uint32_t))) {
WRITE_ONCE(*(uint32_t *)map_d_ptr, *(uint32_t *)h_ptr);
goto do_fence;
} else if (size == sizeof(uint64_t) && ptr_is_aligned(map_d_ptr, sizeof(uint64_t))) {
WRITE_ONCE(*(uint64_t *)map_d_ptr, *(uint64_t *)h_ptr);
goto do_fence;
}
// pick the most performing implementation compatible with the platform we are running on
// NOTE: write fences are included in functions below
if (has_avx) {
assert(wc_mapping);
gdr_dbgc(1, "using AVX implementation of gdr_copy_to_mapping\n");
memcpy_uncached_store_avx(map_d_ptr, h_ptr, size);
goto out;
}
if (has_sse) {
assert(wc_mapping);
gdr_dbgc(1, "using SSE implementation of gdr_copy_to_mapping\n");
memcpy_uncached_store_sse(map_d_ptr, h_ptr, size);
goto out;
}
// on POWER, compiler/libc memcpy is not optimal for MMIO
// 64bit stores are not better than 32bit ones, so we prefer the latter.
// NOTE: if preferred but not aligned, a better implementation would still try to
// use byte sized stores to align map_d_ptr and h_ptr to next word.
// NOTE2: unroll*_memcpy and memcpy do not include fencing.
if (wc_mapping && PREFERS_STORE_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) {
gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_to_mapping\n");
unroll8_memcpy(map_d_ptr, h_ptr, size);
} else if (wc_mapping && PREFERS_STORE_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) {
gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_to_mapping\n");
unroll4_memcpy(map_d_ptr, h_ptr, size);
} else if (device_mapping) {
gdr_dbgc(1, "using device-mapping copy for gdr_copy_to_mapping with device mapping\n");
memcpy_to_device_mapping(map_d_ptr, h_ptr, size);
} else {
gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_to_mapping\n");
memcpy(map_d_ptr, h_ptr, size);
}
} while (0);
do_fence:
if (wc_mapping) {
// fencing is needed even for plain memcpy(), because performance
// would otherwise suffer from delayed flushing of the WC buffers
wc_store_fence();
}
out:
return 0;
}
static int gdr_copy_from_mapping_internal(void *h_ptr, const void *map_d_ptr, size_t size, gdr_mapping_type_t mapping_type)
{
const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC);
const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE);
do {
// pick the most performing implementation compatible with the platform we are running on
if (has_sse4_1) {
assert(wc_mapping);
gdr_dbgc(1, "using SSE4_1 implementation of gdr_copy_from_mapping\n");
memcpy_uncached_load_sse41(h_ptr, map_d_ptr, size);
break;
}
if (has_avx) {
assert(wc_mapping);
gdr_dbgc(1, "using AVX implementation of gdr_copy_from_mapping\n");
memcpy_cached_store_avx(h_ptr, map_d_ptr, size);
break;
}
if (has_sse) {
assert(wc_mapping);
gdr_dbgc(1, "using SSE implementation of gdr_copy_from_mapping\n");
memcpy_cached_store_sse(h_ptr, map_d_ptr, size);
break;
}
// on POWER, compiler memcpy is not optimal for MMIO
// 64bit loads have 2x the BW of 32bit ones
if (wc_mapping && PREFERS_LOAD_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) {
gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_from_mapping\n");
unroll8_memcpy(h_ptr, map_d_ptr, size);
} else if (wc_mapping && PREFERS_LOAD_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) {
gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_from_mapping\n");
unroll4_memcpy(h_ptr, map_d_ptr, size);
} else if (device_mapping) {
gdr_dbgc(1, "using device-mapping copy for gdr_copy_from_mapping\n");
memcpy_from_device_mapping(h_ptr, map_d_ptr, size);
} else {
gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_from_mapping\n");
memcpy(h_ptr, map_d_ptr, size);
}
// note: fencing is not needed because plain stores are used
// if non-temporal/uncached stores were used on x86, a proper fence would be needed instead
// if (wc_mapping)
// wc_store_fence();
} while (0);
return 0;
}
int gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size)
{
gdr_memh_t *mh = to_memh(handle);
if (unlikely(!gdr_is_mapped(mh->mapping_type))) {
gdr_err("mh is not mapped yet\n");
return EINVAL;
}
if (unlikely(size == 0))
return 0;
return gdr_copy_to_mapping_internal(map_d_ptr, h_ptr, size, mh->mapping_type);
}
int gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size)
{
gdr_memh_t *mh = to_memh(handle);
if (unlikely(!gdr_is_mapped(mh->mapping_type))) {
gdr_err("mh is not mapped yet\n");
return EINVAL;
}
if (unlikely(size == 0))
return 0;
return gdr_copy_from_mapping_internal(h_ptr, map_d_ptr, size, mh->mapping_type);
}
void gdr_runtime_get_version(int *major, int *minor)
{
*major = GDR_API_MAJOR_VERSION;
*minor = GDR_API_MINOR_VERSION;
}
int gdr_driver_get_version(gdr_t g, int *major, int *minor)
{
assert(g != NULL);
assert(g->fd > 0);
struct GDRDRV_IOC_GET_VERSION_PARAMS params;
int retcode = ioctl(g->fd, GDRDRV_IOC_GET_VERSION, &params);
if (0 != retcode) {
int ret = errno;
gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret);
return ret;
}
*major = params.gdrdrv_version >> MAJOR_VERSION_SHIFT;
*minor = params.gdrdrv_version & MINOR_VERSION_MASK;
return 0;
}
// ==============================================================================
// Obsoleted API. Provided for compatibility only.
// ==============================================================================
#ifdef gdr_get_info
#undef gdr_get_info
#endif
typedef struct gdr_info_v1 {
uint64_t va;
uint64_t mapped_size;
uint32_t page_size;
// tm_cycles and cycles_per_ms are deprecated and will be removed in future.
uint64_t tm_cycles;
uint32_t cycles_per_ms;
unsigned mapped:1;
unsigned wc_mapping:1;
} gdr_info_v1_t;
int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_v1_t *info)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
struct GDRDRV_IOC_GET_INFO_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
goto out;
} else {
info->va = params.va;
info->mapped_size = params.mapped_size;
info->page_size = params.page_size;
info->tm_cycles = params.tm_cycles;
info->cycles_per_ms = params.tsc_khz;
info->mapped = params.mapped;
info->wc_mapping = params.wc_mapping;
}
out:
return ret;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/
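The logging at the top of this file is driven purely by environment variables; a hedged example of enabling debug output for one of the test binaries shipped by the packaging above (gdrcopy_copybw):
# GDRCOPY_MSG_DEBUG is level 1, so this prints every message gdr_msg() emits
GDRCOPY_ENABLE_LOGGING=1 GDRCOPY_LOG_LEVEL=1 gdrcopy_copybw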

View File

@ -0,0 +1,74 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __GDRAPI_INTERNAL_H__
#define __GDRAPI_INTERNAL_H__
#include <stdint.h> // for standard [u]intX_t types
#include <stddef.h>
#include <sys/queue.h>
#include "gdrapi.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifndef unlikely
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (x)
#endif
#endif
#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof((x)) *)&(x))
#endif
#ifndef READ_ONCE
#define READ_ONCE(x) ACCESS_ONCE(x)
#endif
#ifndef WRITE_ONCE
#define WRITE_ONCE(x, v) (ACCESS_ONCE(x) = (v))
#endif
typedef struct gdr_memh_t {
uint32_t handle;
LIST_ENTRY(gdr_memh_t) entries;
gdr_mapping_type_t mapping_type;
} gdr_memh_t;
struct gdr {
int fd;
LIST_HEAD(memh_list, gdr_memh_t) memhs;
size_t page_size;
size_t page_mask;
uint8_t page_shift;
uint32_t gdrdrv_version;
};
#ifdef __cplusplus
}
#endif
#endif // __GDRAPI_INTERNAL_H__

View File

@ -0,0 +1,77 @@
# Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
ifneq ($(KERNELRELEASE),)
kver_major:=$(shell echo $(KERNELRELEASE) | awk -F '.' '// { print $$2;}' )
obj-m := nv-p2p-dummy.o
obj-m += gdrdrv.o
ccflags-y += -I$(NVIDIA_SRC_DIR)
ifeq ($(NVIDIA_IS_OPENSOURCE),y)
ccflags-y += -DGDRDRV_OPENSOURCE_NVIDIA
endif
ifeq ($(HAVE_VM_FLAGS_SET),y)
ccflags-y += -DGDRDRV_HAVE_VM_FLAGS_SET
endif
else
KVER ?= $(shell uname -r)
MODULES_DIR := /lib/modules/$(KVER)
KDIR := $(MODULES_DIR)/build
MODULE_SUBDIR ?= /kernel/drivers/misc/
MODULE_DESTDIR := $(MODULES_DIR)/$(MODULE_SUBDIR)
DEPMOD := /sbin/depmod
export NVIDIA_SRC_DIR ?= $(shell { find /usr/src/kernel-modules/nvidia-* /usr/src/nvidia-* -name "nv-p2p.c" -print -quit | xargs dirname || echo "NVIDIA_DRIVER_MISSING"; } 2>/dev/null)
export NVIDIA_IS_OPENSOURCE ?= $(shell grep -r "MODULE_LICENSE" $(NVIDIA_SRC_DIR)/ | grep -s -q "GPL" && echo "y")
CONF_SCRIPT_DIR ?= $(PWD)/../../scripts
export HAVE_VM_FLAGS_SET ?= $(shell $(CONF_SCRIPT_DIR)/test_gdrdrv_HAVE_VM_FLAGS_SET.sh -k $(KVER))
all: build
build:
@ echo "Picking NVIDIA driver sources from NVIDIA_SRC_DIR=$(NVIDIA_SRC_DIR). If that does not meet your expectation, you might have a stale driver still around and that might cause problems."
@ echo "Setting NVIDIA_IS_OPENSOURCE=$(NVIDIA_IS_OPENSOURCE)"
@ echo "Setting HAVE_VM_FLAGS_SET=$(HAVE_VM_FLAGS_SET)"
@ $(MAKE) -C $(KDIR) $(MAKE_PARAMS) M=$(PWD) modules
install: build
[ -d $(DESTDIR)/$(MODULE_DESTDIR) ] || mkdir -p $(DESTDIR)/$(MODULE_DESTDIR)
cp gdrdrv.ko $(DESTDIR)/$(MODULE_DESTDIR)
if [ ! -n "$(DESTDIR)" ]; then $(DEPMOD) -r -ae $(KVER); fi
help:
$(MAKE) -C $(KDIR) M=$$PWD help
clean:
rm -rf *.o .*.o.d *.ko* *.mod.* .*.cmd Module.symvers modules.order .tmp_versions/ *~ core .depend TAGS .cache.mk *.mod
TAGS:
find $(KERNELDIR) -follow -name \*.h -o -name \*.c |xargs etags
.PHONY: clean all help install default linksyms nvidia_src_dir build
endif
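An illustrative manual build and install of the kernel module against the running kernel; NVIDIA_SRC_DIR and HAVE_VM_FLAGS_SET fall back to the autodetection above, and root privileges are assumed for the install step:
make KVER="$(uname -r)"
make install    # copies gdrdrv.ko into /lib/modules/<KVER>/kernel/drivers/misc/ and runs depmod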

1509
gdrcopy/src/gdrdrv/gdrdrv.c Normal file

File diff suppressed because it is too large.

138
gdrcopy/src/gdrdrv/gdrdrv.h Normal file
View File

@ -0,0 +1,138 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __GDR_DRV_H__
#define __GDR_DRV_H__
#define GDRDRV_STRINGIFY(s) #s
#define GDRDRV_TOSTRING(s) GDRDRV_STRINGIFY(s)
#define GDRDRV_MAJOR_VERSION_SHIFT 16
#define GDRDRV_MAJOR_VERSION 2
#define GDRDRV_MINOR_VERSION 4
#define GDRDRV_VERSION ((GDRDRV_MAJOR_VERSION << GDRDRV_MAJOR_VERSION_SHIFT) | GDRDRV_MINOR_VERSION)
#define GDRDRV_VERSION_STRING GDRDRV_TOSTRING(GDRDRV_MAJOR_VERSION) "." GDRDRV_TOSTRING(GDRDRV_MINOR_VERSION)
#define MINIMUM_GDR_API_MAJOR_VERSION 2
#define MINIMUM_GDR_API_MINOR_VERSION 0
#define MINIMUM_GDR_API_VERSION ((MINIMUM_GDR_API_MAJOR_VERSION << 16) | MINIMUM_GDR_API_MINOR_VERSION)
#define GDRDRV_MINIMUM_VERSION_WITH_GET_INFO_V2 ((2 << GDRDRV_MAJOR_VERSION_SHIFT) | 4)
#define GDRDRV_IOCTL 0xDA
typedef enum {
GDR_MR_NONE = 0,
GDR_MR_WC = 1,
GDR_MR_CACHING = 2,
GDR_MR_DEVICE = 3
} gdr_mr_type_t;
typedef __u64 gdr_hnd_t;
//-----------
struct GDRDRV_IOC_PIN_BUFFER_PARAMS
{
// in
__u64 addr;
__u64 size;
__u64 p2p_token;
__u32 va_space;
// out
gdr_hnd_t handle;
};
#define GDRDRV_IOC_PIN_BUFFER _IOWR(GDRDRV_IOCTL, 1, struct GDRDRV_IOC_PIN_BUFFER_PARAMS)
//-----------
struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS
{
// in
gdr_hnd_t handle;
};
#define GDRDRV_IOC_UNPIN_BUFFER _IOWR(GDRDRV_IOCTL, 2, struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS *)
//-----------
struct GDRDRV_IOC_GET_CB_FLAG_PARAMS
{
// in
gdr_hnd_t handle;
// out
__u32 flag;
};
#define GDRDRV_IOC_GET_CB_FLAG _IOWR(GDRDRV_IOCTL, 3, struct GDRDRV_IOC_GET_CB_FLAG_PARAMS *)
//-----------
struct GDRDRV_IOC_GET_INFO_PARAMS
{
// in
gdr_hnd_t handle;
// out
__u64 va;
__u64 mapped_size;
__u32 page_size;
__u32 tsc_khz;
__u64 tm_cycles;
__u32 mapped;
__u32 wc_mapping;
};
#define GDRDRV_IOC_GET_INFO _IOWR(GDRDRV_IOCTL, 4, struct GDRDRV_IOC_GET_INFO_PARAMS *)
//-----------
struct GDRDRV_IOC_GET_INFO_V2_PARAMS
{
// in
gdr_hnd_t handle;
// out
__u64 va;
__u64 mapped_size;
__u32 page_size;
__u32 tsc_khz;
__u64 tm_cycles;
__u32 mapping_type;
};
#define GDRDRV_IOC_GET_INFO_V2 _IOWR(GDRDRV_IOCTL, 5, struct GDRDRV_IOC_GET_INFO_V2_PARAMS *)
//-----------
struct GDRDRV_IOC_GET_VERSION_PARAMS
{
// out
__u32 gdrdrv_version;
__u32 minimum_gdr_api_version;
};
#define GDRDRV_IOC_GET_VERSION _IOWR(GDRDRV_IOCTL, 255, struct GDRDRV_IOC_GET_VERSION_PARAMS *)
//-----------
#endif // __GDR_DRV_H__

View File

@ -0,0 +1,138 @@
/*
* Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
* Warning: this kernel module is only needed at compile time.
*
* The long story is that this module exists only to produce the correct
* module symbol versions for the very kernel against which the other module
* (the interesting one) is going to be compiled. In other words, this module
* produces the same symbol versions as the real NVIDIA kernel-mode driver.
*
* Downside: the function signatures must be kept up to date.
*/
#include <linux/version.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/delay.h>
#include <linux/compiler.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/io.h>
#include "nv-p2p.h"
MODULE_AUTHOR("drossetti@nvidia.com");
MODULE_LICENSE("MIT");
MODULE_DESCRIPTION("P2P dummy kernel-mode driver");
MODULE_VERSION("1.0");
int nvidia_p2p_init_mapping(uint64_t p2p_token,
struct nvidia_p2p_params *params,
void (*destroy_callback)(void *data),
void *data)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_init_mapping);
int nvidia_p2p_destroy_mapping(uint64_t p2p_token)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_destroy_mapping);
int nvidia_p2p_get_pages(uint64_t p2p_token, uint32_t va_space,
uint64_t virtual_address,
uint64_t length,
struct nvidia_p2p_page_table **page_table,
void (*free_callback)(void *data),
void *data)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_get_pages);
int nvidia_p2p_put_pages(uint64_t p2p_token, uint32_t va_space,
uint64_t virtual_address,
struct nvidia_p2p_page_table *page_table)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_put_pages);
int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_free_page_table);
#ifdef NVIDIA_P2P_CAP_PERSISTENT_PAGES
int nvidia_p2p_cap_persistent_pages;
EXPORT_SYMBOL(nvidia_p2p_cap_persistent_pages);
#endif
#ifdef NVIDIA_P2P_CAP_GET_PAGES_PERSISTENT_API
int nvidia_p2p_get_pages_persistent(uint64_t virtual_address,
uint64_t length,
struct nvidia_p2p_page_table **page_table,
uint32_t flags)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);
int nvidia_p2p_put_pages_persistent(uint64_t virtual_address,
struct nvidia_p2p_page_table *page_table,
uint32_t flags)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_put_pages_persistent);
#endif
static int __init nv_p2p_dummy_init(void)
{
return 0;
}
static void __exit nv_p2p_dummy_cleanup(void)
{
}
module_init(nv_p2p_dummy_init);
module_exit(nv_p2p_dummy_cleanup);
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

207
gdrcopy/src/memcpy_avx.c Normal file
View File

@ -0,0 +1,207 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef min
#define min(A,B) ((A)<(B)?(A):(B))
#endif
int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __AVX__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align dest to 256-bits
if (d_int & 0x1f) {
size_t nh = min(0x20 - (d_int & 0x1f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (s_int & 0x1f) { // src is not aligned to 256-bits
__m256d r0,r1,r2,r3;
// unroll 4
while (n >= 4*sizeof(__m256d)) {
r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
_mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
_mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
_mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
_mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
s += 4*sizeof(__m256d);
d += 4*sizeof(__m256d);
n -= 4*sizeof(__m256d);
}
while (n >= sizeof(__m256d)) {
r0 = _mm256_loadu_pd((double *)(s));
_mm256_stream_pd((double *)(d), r0);
s += sizeof(__m256d);
d += sizeof(__m256d);
n -= sizeof(__m256d);
}
} else { // or it IS aligned
__m256d r0,r1,r2,r3,r4,r5,r6,r7;
// unroll 8
while (n >= 8*sizeof(__m256d)) {
r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
r4 = _mm256_load_pd((double *)(s+4*sizeof(__m256d)));
r5 = _mm256_load_pd((double *)(s+5*sizeof(__m256d)));
r6 = _mm256_load_pd((double *)(s+6*sizeof(__m256d)));
r7 = _mm256_load_pd((double *)(s+7*sizeof(__m256d)));
_mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
_mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
_mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
_mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
_mm256_stream_pd((double *)(d+4*sizeof(__m256d)), r4);
_mm256_stream_pd((double *)(d+5*sizeof(__m256d)), r5);
_mm256_stream_pd((double *)(d+6*sizeof(__m256d)), r6);
_mm256_stream_pd((double *)(d+7*sizeof(__m256d)), r7);
s += 8*sizeof(__m256d);
d += 8*sizeof(__m256d);
n -= 8*sizeof(__m256d);
}
while (n >= sizeof(__m256d)) {
r0 = _mm256_load_pd((double *)(s));
_mm256_stream_pd((double *)(d), r0);
s += sizeof(__m256d);
d += sizeof(__m256d);
n -= sizeof(__m256d);
}
}
if (n)
memcpy(d, s, n);
// fencing is needed even for plain memcpy(): otherwise, delayed flushing
// of the WC buffers hurts performance
_mm_sfence();
#else
#error "this file should be compiled with -mavx"
#endif
return ret;
}
int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __AVX__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align dest to 256-bits
if (d_int & 0x1f) {
size_t nh = min(0x20 - (d_int & 0x1f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (s_int & 0x1f) { // src is not aligned to 256-bits
__m256d r0,r1,r2,r3;
// unroll 4
while (n >= 4*sizeof(__m256d)) {
r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
_mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
_mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
_mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
_mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
s += 4*sizeof(__m256d);
d += 4*sizeof(__m256d);
n -= 4*sizeof(__m256d);
}
while (n >= sizeof(__m256d)) {
r0 = _mm256_loadu_pd((double *)(s));
_mm256_store_pd((double *)(d), r0);
s += sizeof(__m256d);
d += sizeof(__m256d);
n -= sizeof(__m256d);
}
} else { // or it IS aligned
__m256d r0,r1,r2,r3;
// unroll 4
while (n >= 4*sizeof(__m256d)) {
r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
_mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
_mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
_mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
_mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
s += 4*sizeof(__m256d);
d += 4*sizeof(__m256d);
n -= 4*sizeof(__m256d);
}
while (n >= sizeof(__m256d)) {
r0 = _mm256_load_pd((double *)(s));
_mm256_store_pd((double *)(d), r0);
s += sizeof(__m256d);
d += sizeof(__m256d);
n -= sizeof(__m256d);
}
}
if (n)
memcpy(d, s, n);
// fencing is needed because of the use of non-temporal stores
_mm_sfence();
#else
#error "this file should be compiled with -mavx"
#endif
return ret;
}
// add variant for _mm256_stream_load_si256() / VMOVNTDQA
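/*
 * A minimal sketch of such a variant (an illustration, not part of the original
 * sources; the function name is made up): VMOVNTDQA on 256-bit registers needs
 * AVX2 and a 32-byte aligned source, so the sketch falls back to memcpy()
 * otherwise and handles any tail bytes the same way.
 */
#ifdef __AVX2__
int memcpy_uncached_load_avx2(void *dest, const void *src, size_t n_bytes)
{
    char *d = (char *)dest;
    const char *s = (const char *)src;
    size_t n = n_bytes;
    // stream loads require a 32-byte aligned source
    if (((uintptr_t)s) & 0x1f) {
        memcpy(d, s, n);
        return 0;
    }
    __m256i r0;
    while (n >= sizeof(__m256i)) {
        r0 = _mm256_stream_load_si256((__m256i *)s);
        _mm256_storeu_si256((__m256i *)d, r0);
        s += sizeof(__m256i);
        d += sizeof(__m256i);
        n -= sizeof(__m256i);
    }
    if (n)
        memcpy(d, s, n);
    // only regular stores are used on the destination, so no sfence is needed here
    return 0;
}
#endif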
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

198
gdrcopy/src/memcpy_sse.c Normal file
View File

@ -0,0 +1,198 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef min
#define min(A,B) ((A)<(B)?(A):(B))
#endif
int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __SSE__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align dest to 128-bits
if (d_int & 0xf) {
size_t nh = min(0x10 - (d_int & 0x0f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (s_int & 0xf) { // src is not aligned to 128-bits
__m128 r0,r1,r2,r3;
// unroll 4
while (n >= 4*4*sizeof(float)) {
r0 = _mm_loadu_ps((float *)(s+0*4*sizeof(float)));
r1 = _mm_loadu_ps((float *)(s+1*4*sizeof(float)));
r2 = _mm_loadu_ps((float *)(s+2*4*sizeof(float)));
r3 = _mm_loadu_ps((float *)(s+3*4*sizeof(float)));
_mm_stream_ps((float *)(d+0*4*sizeof(float)), r0);
_mm_stream_ps((float *)(d+1*4*sizeof(float)), r1);
_mm_stream_ps((float *)(d+2*4*sizeof(float)), r2);
_mm_stream_ps((float *)(d+3*4*sizeof(float)), r3);
s += 4*4*sizeof(float);
d += 4*4*sizeof(float);
n -= 4*4*sizeof(float);
}
while (n >= 4*sizeof(float)) {
r0 = _mm_loadu_ps((float *)(s));
_mm_stream_ps((float *)(d), r0);
s += 4*sizeof(float);
d += 4*sizeof(float);
n -= 4*sizeof(float);
}
} else { // or it IS aligned
__m128 r0,r1,r2,r3;
// unroll 4
while (n >= 4*4*sizeof(float)) {
r0 = _mm_load_ps((float *)(s+0*4*sizeof(float)));
r1 = _mm_load_ps((float *)(s+1*4*sizeof(float)));
r2 = _mm_load_ps((float *)(s+2*4*sizeof(float)));
r3 = _mm_load_ps((float *)(s+3*4*sizeof(float)));
_mm_stream_ps((float *)(d+0*4*sizeof(float)), r0);
_mm_stream_ps((float *)(d+1*4*sizeof(float)), r1);
_mm_stream_ps((float *)(d+2*4*sizeof(float)), r2);
_mm_stream_ps((float *)(d+3*4*sizeof(float)), r3);
s += 4*4*sizeof(float);
d += 4*4*sizeof(float);
n -= 4*4*sizeof(float);
}
while (n >= 4*sizeof(float)) {
r0 = _mm_load_ps((float *)(s));
_mm_stream_ps((float *)(d), r0);
s += 4*sizeof(float);
d += 4*sizeof(float);
n -= 4*sizeof(float);
}
}
if (n)
memcpy(d, s, n);
// fencing is needed even for plain memcpy(): otherwise, delayed flushing
// of the WC buffers hurts performance
_mm_sfence();
#else
#error "this file should be compiled with -msse"
#endif
return ret;
}
int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __SSE__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align dest to 128-bits
if (d_int & 0xf) {
size_t nh = min(0x10 - (d_int & 0x0f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (s_int & 0xf) { // src is not aligned to 128-bits
__m128 r0,r1,r2,r3;
// unroll 4
while (n >= 4*4*sizeof(float)) {
r0 = _mm_loadu_ps((float *)(s+0*4*sizeof(float)));
r1 = _mm_loadu_ps((float *)(s+1*4*sizeof(float)));
r2 = _mm_loadu_ps((float *)(s+2*4*sizeof(float)));
r3 = _mm_loadu_ps((float *)(s+3*4*sizeof(float)));
_mm_store_ps((float *)(d+0*4*sizeof(float)), r0);
_mm_store_ps((float *)(d+1*4*sizeof(float)), r1);
_mm_store_ps((float *)(d+2*4*sizeof(float)), r2);
_mm_store_ps((float *)(d+3*4*sizeof(float)), r3);
s += 4*4*sizeof(float);
d += 4*4*sizeof(float);
n -= 4*4*sizeof(float);
}
while (n >= 4*sizeof(float)) {
r0 = _mm_loadu_ps((float *)(s));
_mm_store_ps((float *)(d), r0);
s += 4*sizeof(float);
d += 4*sizeof(float);
n -= 4*sizeof(float);
}
} else { // or it IS aligned
__m128 r0,r1,r2,r3;
// unroll 4
while (n >= 4*4*sizeof(float)) {
r0 = _mm_load_ps((float *)(s+0*4*sizeof(float)));
r1 = _mm_load_ps((float *)(s+1*4*sizeof(float)));
r2 = _mm_load_ps((float *)(s+2*4*sizeof(float)));
r3 = _mm_load_ps((float *)(s+3*4*sizeof(float)));
_mm_store_ps((float *)(d+0*4*sizeof(float)), r0);
_mm_store_ps((float *)(d+1*4*sizeof(float)), r1);
_mm_store_ps((float *)(d+2*4*sizeof(float)), r2);
_mm_store_ps((float *)(d+3*4*sizeof(float)), r3);
s += 4*4*sizeof(float);
d += 4*4*sizeof(float);
n -= 4*4*sizeof(float);
}
while (n >= 4*sizeof(float)) {
r0 = _mm_load_ps((float *)(s));
_mm_store_ps((float *)(d), r0);
s += 4*sizeof(float);
d += 4*sizeof(float);
n -= 4*sizeof(float);
}
}
if (n)
memcpy(d, s, n);
// fencing because of NT stores
// potential optimization: issue only when NT stores are actually emitted
_mm_sfence();
#else
#error "this file should be compiled with -msse"
#endif
return ret;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

141
gdrcopy/src/memcpy_sse41.c Normal file
View File

@ -0,0 +1,141 @@
/*
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef min
#define min(A,B) ((A)<(B)?(A):(B))
#endif
// implementation of copy from BAR using MOVNTDQA
// suggested by Nicholas Wilt <nwilt@amazon.com>
// src is WC MMIO of GPU BAR
// dest is host memory
int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __SSE4_1__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align src to 128-bits
if (s_int & 0xf) {
size_t nh = min(0x10 - (s_int & 0x0f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (d_int & 0xf) { // dest is not aligned to 128-bits
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
// unroll 8
while (n >= 8*sizeof(__m128i)) {
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
_mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
_mm_storeu_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
_mm_storeu_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
_mm_storeu_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
_mm_storeu_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
_mm_storeu_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
_mm_storeu_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
_mm_storeu_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
s += 8*sizeof(__m128i);
d += 8*sizeof(__m128i);
n -= 8*sizeof(__m128i);
}
while (n >= sizeof(__m128i)) {
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
_mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
s += sizeof(__m128i);
d += sizeof(__m128i);
n -= sizeof(__m128i);
}
} else { // or it IS aligned
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
// unroll 8
while (n >= 8*sizeof(__m128i)) {
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
_mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
_mm_stream_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
_mm_stream_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
_mm_stream_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
_mm_stream_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
_mm_stream_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
_mm_stream_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
_mm_stream_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
s += 8*sizeof(__m128i);
d += 8*sizeof(__m128i);
n -= 8*sizeof(__m128i);
}
while (n >= sizeof(__m128i)) {
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
_mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
s += sizeof(__m128i);
d += sizeof(__m128i);
n -= sizeof(__m128i);
}
}
if (n)
memcpy(d, s, n);
// fencing because of NT stores
// potential optimization: issue only when NT stores are actually emitted
_mm_sfence();
#else
#error "this file should be compiled with -msse4.1"
#endif
return ret;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

69
gdrcopy/tests/Makefile Normal file
View File

@ -0,0 +1,69 @@
DESTBIN ?=
CUDA ?= /usr/local/cuda
NVCC ?= $(CUDA)/bin/nvcc
GDRAPI_INC := ../include
GDRAPI_SRC := ../src
CUDA_LIB := -L $(CUDA)/lib64 -L $(CUDA)/lib -L /usr/lib64/nvidia -L /usr/lib/nvidia -L $(CUDA)/lib64/stubs
CUDA_INC += -I $(CUDA)/include
CPPFLAGS := $(CUDA_INC) -I $(GDRAPI_INC) -I $(GDRAPI_SRC) -I $(CUDA)/include
LDFLAGS := $(CUDA_LIB) -L $(CUDA)/lib64 -L $(GDRAPI_SRC)
COMMONCFLAGS := -O2
CFLAGS += $(COMMONCFLAGS)
CXXFLAGS += $(COMMONCFLAGS)
NVCCFLAGS ?=
LIBS := -lcuda -lpthread -ldl -lgdrapi
CPP_SRCS := copybw.cpp sanity.cpp copylat.cpp apiperf.cpp
CU_SRCS := pplat.cu
EXES := $(patsubst %.cpp,gdrcopy_%,$(CPP_SRCS)) $(patsubst %.cu,gdrcopy_%,$(CU_SRCS))
all: exes
exes: $(EXES)
testsuites/testsuite.o: testsuites/testsuite.cpp testsuites/testsuite.hpp common.hpp
common.o: common.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
copybw.o: copybw.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
sanity.o: sanity.cpp $(GDRAPI_INC)/gdrapi.h $(GDRAPI_SRC)/gdrapi_internal.h common.hpp testsuites/testsuite.hpp
copylat.o: copylat.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
apiperf.o: apiperf.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
gdrcopy_copybw: copybw.o common.o
$(LINK.cc) -o $@ $^ $(LIBS) -lrt
gdrcopy_sanity: sanity.o common.o testsuites/testsuite.o
$(LINK.cc) -o $@ $^ $(LIBS)
gdrcopy_copylat: copylat.o common.o
$(LINK.cc) -o $@ $^ $(LIBS) -lrt
gdrcopy_apiperf: apiperf.o common.o
$(LINK.cc) -o $@ $^ $(LIBS) -lrt
gdrcopy_pplat: pplat.o common.o
$(NVCC) -o $@ $^ $(LDFLAGS) -lgdrapi -lcuda
%.o: %.cu
$(NVCC) -o $@ -c $^ $(LIBS) $(CPPFLAGS) $(NVCCFLAGS)
clean:
rm -f *.o $(EXES) *~ core.* testsuites/*.o
install: exes
@ echo "installing exes in $(DESTBIN)..." && \
mkdir -p $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_copybw -t $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_copylat -t $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_apiperf -t $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_sanity -t $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_pplat -t $(DESTBIN)
cd $(DESTBIN) && \
ln -sf gdrcopy_copybw copybw && \
ln -sf gdrcopy_copylat copylat && \
ln -sf gdrcopy_apiperf apiperf && \
ln -sf gdrcopy_sanity sanity
.PHONY: clean all exes install

287
gdrcopy/tests/apiperf.cpp Normal file
View File

@ -0,0 +1,287 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <getopt.h>
#include <memory.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <iomanip>
#include <cuda.h>
using namespace std;
#include "gdrapi.h"
#include "common.hpp"
using namespace gdrcopy::test;
// manually tuned...
int num_iters = 100;
int num_bins = 10;
int num_warmup_iters = 10;
size_t _size = (size_t)1 << 24;
int dev_id = 0;
void print_usage(const char *path)
{
cout << "Usage: " << path << " [-h][-s <max-size>][-d <gpu>][-n <iters>][-w <iters>][-a <fn>]" << endl;
cout << endl;
cout << "Options:" << endl;
cout << " -h Print this help text" << endl;
cout << " -s <max-size> Max buffer size to benchmark (default: " << _size << ")" << endl;
cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
cout << " -n <iters> Number of benchmark iterations (default: " << num_iters << ")" << endl;
cout << " -w <iters> Number of warm-up iterations (default: " << num_warmup_iters << ")" << endl;
cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
}
void run_test(CUdeviceptr d_A, size_t size)
{
// minimum pinning size is a GPU page size
size_t pin_request_size = GPU_PAGE_SIZE;
struct timespec beg, end;
double pin_lat_us;
double map_lat_us;
double unpin_lat_us;
double unmap_lat_us;
double inf_lat_us;
double delta_lat_us;
double *lat_arr;
int *bin_arr;
gdr_t g = gdr_open();
ASSERT_NEQ(g, (void*)0);
gdr_mh_t mh;
BEGIN_CHECK {
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
lat_arr = (double *)malloc(sizeof(double) * num_iters);
bin_arr = (int *)malloc(sizeof(int) * num_bins);
while (pin_request_size <= size) {
int iter = 0;
size_t actual_pin_size;
double min_lat, max_lat;
min_lat = -1;
max_lat = -1;
pin_lat_us = 0;
map_lat_us = 0;
unpin_lat_us = 0;
unmap_lat_us = 0;
inf_lat_us = 0;
actual_pin_size = PAGE_ROUND_UP(pin_request_size, GPU_PAGE_SIZE);
for (iter = 0; iter < num_warmup_iters; ++iter) {
BREAK_IF_NEQ(gdr_pin_buffer(g, d_A, actual_pin_size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
void *map_d_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, actual_pin_size), 0);
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, actual_pin_size), 0);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
}
for (iter = 0; iter < num_iters; ++iter) {
clock_gettime(MYCLOCK, &beg);
BREAK_IF_NEQ(gdr_pin_buffer(g, d_A, actual_pin_size, 0, 0, &mh), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
pin_lat_us += delta_lat_us;
ASSERT_NEQ(mh, null_mh);
lat_arr[iter] = delta_lat_us;
min_lat = (min_lat == -1) ? delta_lat_us : ((delta_lat_us < min_lat) ? delta_lat_us : min_lat);
max_lat = delta_lat_us > max_lat ? delta_lat_us : max_lat;
void *map_d_ptr = NULL;
clock_gettime(MYCLOCK, &beg);
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, actual_pin_size), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
map_lat_us += delta_lat_us;
gdr_info_t info;
clock_gettime(MYCLOCK, &beg);
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
inf_lat_us += delta_lat_us;
clock_gettime(MYCLOCK, &beg);
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, actual_pin_size), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
unmap_lat_us += delta_lat_us;
clock_gettime(MYCLOCK, &beg);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
unpin_lat_us += delta_lat_us;
}
pin_lat_us /= iter;
map_lat_us /= iter;
inf_lat_us /= iter;
unpin_lat_us /= iter;
unmap_lat_us /= iter;
printf("Size(B)\tpin.Time(us)\tmap.Time(us)\tget_info.Time(us)\tunmap.Time(us)\tunpin.Time(us)\n");
printf("%zu\t%f\t%f\t%f\t%f\t%f\n",
actual_pin_size, pin_lat_us, map_lat_us, inf_lat_us, unmap_lat_us, unpin_lat_us);
pin_request_size <<= 1;
printf("Histogram of gdr_pin_buffer latency for %ld bytes\n", actual_pin_size);
print_histogram(lat_arr, num_iters, bin_arr, num_bins, min_lat, max_lat);
printf("\n");
}
free(lat_arr);
free(bin_arr);
} END_CHECK;
cout << "closing gdrdrv" << endl;
ASSERT_EQ(gdr_close(g), 0);
}
int main(int argc, char *argv[])
{
gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
gpu_memfree_fn_t gfree_fn = gpu_mem_free;
while(1) {
int c;
c = getopt(argc, argv, "s:d:n:w:a:h");
if (c == -1)
break;
switch (c) {
case 's':
_size = strtol(optarg, NULL, 0);
break;
case 'd':
dev_id = strtol(optarg, NULL, 0);
break;
case 'n':
num_iters = strtol(optarg, NULL, 0);
break;
case 'w':
num_warmup_iters = strtol(optarg, NULL, 0);
break;
case 'a':
if (strcmp(optarg, "cuMemAlloc") == 0) {
galloc_fn = gpu_mem_alloc;
gfree_fn = gpu_mem_free;
}
else if (strcmp(optarg, "cuMemCreate") == 0) {
galloc_fn = gpu_vmm_alloc;
gfree_fn = gpu_vmm_free;
}
else {
cerr << "Unrecognized fn argument" << endl;
exit(EXIT_FAILURE);
}
break;
case 'h':
print_usage(argv[0]);
exit(EXIT_SUCCESS);
break;
default:
printf("ERROR: invalid option\n");
exit(EXIT_FAILURE);
}
}
size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
ASSERTDRV(cuInit(0));
int n_devices = 0;
ASSERTDRV(cuDeviceGetCount(&n_devices));
CUdevice dev;
for (int n=0; n<n_devices; ++n) {
char dev_name[256];
int dev_pci_domain_id;
int dev_pci_bus_id;
int dev_pci_device_id;
ASSERTDRV(cuDeviceGet(&dev, n));
ASSERTDRV(cuDeviceGetName(dev_name, sizeof(dev_name) / sizeof(dev_name[0]), dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev));
cout << "GPU id:" << n << "; name: " << dev_name
<< "; Bus id: "
<< std::hex
<< std::setfill('0') << std::setw(4) << dev_pci_domain_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_bus_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_device_id
<< std::dec
<< endl;
}
cout << "selecting device " << dev_id << endl;
ASSERTDRV(cuDeviceGet(&dev, dev_id));
CUcontext dev_ctx;
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
ASSERT_EQ(check_gdr_support(dev), true);
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
cout << "device ptr: 0x" << hex << d_A << dec << endl;
cout << "allocated size: " << size << endl;
run_test(d_A, size);
ASSERTDRV(gfree_fn(&mhandle));
ASSERTDRV(cuCtxSetCurrent(NULL));
ASSERTDRV(cuDevicePrimaryCtxRelease(dev));
return 0;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

358
gdrcopy/tests/common.cpp Normal file
View File

@ -0,0 +1,358 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdarg.h>
#include <sys/types.h>
#include <unistd.h>
#include <map>
#include <cuda.h>
#include "common.hpp"
namespace gdrcopy {
namespace test {
bool print_dbg_msg = false;
void print_dbg(const char* fmt, ...)
{
if (print_dbg_msg) {
va_list ap;
va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
va_end(ap);
}
}
CUresult gpu_mem_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
{
CUresult ret = CUDA_SUCCESS;
CUdeviceptr ptr, out_ptr;
size_t allocated_size;
if (aligned_mapping)
allocated_size = size + GPU_PAGE_SIZE - 1;
else
allocated_size = size;
ret = cuMemAlloc(&ptr, allocated_size);
if (ret != CUDA_SUCCESS)
return ret;
if (set_sync_memops) {
unsigned int flag = 1;
ret = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr);
if (ret != CUDA_SUCCESS) {
cuMemFree(ptr);
return ret;
}
}
if (aligned_mapping)
out_ptr = PAGE_ROUND_UP(ptr, GPU_PAGE_SIZE);
else
out_ptr = ptr;
handle->ptr = out_ptr;
handle->unaligned_ptr = ptr;
handle->size = size;
handle->allocated_size = allocated_size;
return CUDA_SUCCESS;
}
CUresult gpu_mem_free(gpu_mem_handle_t *handle)
{
CUresult ret = CUDA_SUCCESS;
ret = cuMemFree(handle->unaligned_ptr);
if (ret == CUDA_SUCCESS)
memset(handle, 0, sizeof(gpu_mem_handle_t));
return ret;
}
#if CUDA_VERSION >= 11000
/**
* Allocate GPU memory using the VMM API.
* The VMM API has been available since CUDA 10.2, but RDMA support was only added in CUDA 11.0.
* Our tests are not useful without RDMA support, so we enable VMM allocation starting from CUDA 11.0.
*/
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
{
CUresult ret = CUDA_SUCCESS;
size_t granularity, gran;
CUmemAllocationProp mprop;
CUdevice gpu_dev;
size_t rounded_size;
CUdeviceptr ptr = 0;
CUmemGenericAllocationHandle mem_handle = 0;
bool is_mapped = false;
int RDMASupported = 0;
int version;
ret = cuDriverGetVersion(&version);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuDriverGetVersion\n");
goto out;
}
if (version < 11000) {
print_dbg("VMM with RDMA is not supported in this CUDA version.\n");
ret = CUDA_ERROR_NOT_SUPPORTED;
goto out;
}
ret = cuCtxGetDevice(&gpu_dev);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuCtxGetDevice\n");
goto out;
}
ret = cuDeviceGetAttribute(&RDMASupported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, gpu_dev);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuDeviceGetAttribute\n");
goto out;
}
if (!RDMASupported) {
print_dbg("GPUDirect RDMA is not supported on this GPU.\n");
ret = CUDA_ERROR_NOT_SUPPORTED;
goto out;
}
memset(&mprop, 0, sizeof(CUmemAllocationProp));
mprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
mprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
mprop.location.id = gpu_dev;
mprop.allocFlags.gpuDirectRDMACapable = 1;
ret = cuMemGetAllocationGranularity(&gran, &mprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemGetAllocationGranularity\n");
goto out;
}
// In case gran is smaller than GPU_PAGE_SIZE
granularity = PAGE_ROUND_UP(gran, GPU_PAGE_SIZE);
rounded_size = PAGE_ROUND_UP(size, granularity);
ret = cuMemAddressReserve(&ptr, rounded_size, granularity, 0, 0);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemAddressReserve\n");
goto out;
}
ret = cuMemCreate(&mem_handle, rounded_size, &mprop, 0);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemCreate\n");
goto out;
}
ret = cuMemMap(ptr, rounded_size, 0, mem_handle, 0);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemMap\n");
goto out;
}
is_mapped = true;
CUmemAccessDesc access;
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
access.location.id = gpu_dev;
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
ret = cuMemSetAccess(ptr, rounded_size, &access, 1);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemSetAccess\n");
goto out;
}
// cuMemAddressReserve always returns aligned ptr
handle->ptr = ptr;
handle->handle = mem_handle;
handle->size = size;
handle->allocated_size = rounded_size;
out:
if (ret != CUDA_SUCCESS) {
if (is_mapped)
cuMemUnmap(ptr, rounded_size);
if (mem_handle)
cuMemRelease(mem_handle);
if (ptr)
cuMemAddressFree(ptr, rounded_size);
}
return ret;
}
CUresult gpu_vmm_free(gpu_mem_handle_t *handle)
{
CUresult ret;
if (!handle || !handle->ptr)
return CUDA_ERROR_INVALID_VALUE;
ret = cuMemUnmap(handle->ptr, handle->allocated_size);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemUnmap\n");
return ret;
}
ret = cuMemRelease(handle->handle);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemRelease\n");
return ret;
}
ret = cuMemAddressFree(handle->ptr, handle->allocated_size);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemAddressFree\n");
return ret;
}
memset(handle, 0, sizeof(gpu_mem_handle_t));
return CUDA_SUCCESS;
}
#else
/* VMM with RDMA is not available before CUDA 11.0 */
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
{
return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult gpu_vmm_free(gpu_mem_handle_t *handle)
{
return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
int compare_buf(uint32_t *ref_buf, uint32_t *buf, size_t size)
{
int diff = 0;
if (size % 4 != 0U) {
print_dbg("warning: buffer size %zu is not dword aligned, ignoring trailing bytes\n", size);
size -= (size % 4);
}
unsigned ndwords = size/sizeof(uint32_t);
for(unsigned w = 0; w < ndwords; ++w) {
if (ref_buf[w] != buf[w]) {
if (!diff) {
printf("%10.10s %8.8s %8.8s\n", "word", "content", "expected");
}
if (diff < 10) {
printf("%10d %08x %08x\n", w, buf[w], ref_buf[w]);
}
++diff;
}
}
if (diff) {
print_dbg("check error: %d different dwords out of %d\n", diff, ndwords);
}
return diff;
}
void init_hbuf_walking_bit(uint32_t *h_buf, size_t size)
{
uint32_t base_value = 0x3F4C5E6A; // 0xa55ad33d;
unsigned w;
ASSERT_NEQ(h_buf, (void*)0);
ASSERT_EQ(size % 4, 0U);
//OUT << "filling mem with walking bit " << endl;
for(w = 0; w<size/sizeof(uint32_t); ++w)
h_buf[w] = base_value ^ (1<< (w%32));
}
void init_hbuf_linear_ramp(uint32_t *h_buf, size_t size)
{
uint32_t base_value = 0x3F4C5E6A; // 0xa55ad33d;
unsigned w;
ASSERT_NEQ(h_buf, (void*)0);
ASSERT_EQ(size % 4, 0U);
//OUT << "filling mem with linear ramp " << endl;
for(w = 0; w<size/sizeof(uint32_t); ++w)
h_buf[w] = w;
}
bool check_gdr_support(CUdevice dev)
{
#if CUDA_VERSION >= 11030
int drv_version;
ASSERTDRV(cuDriverGetVersion(&drv_version));
// Starting from CUDA 11.3, CUDA provides a way to query GPUDirect RDMA support.
if (drv_version >= 11030) {
int gdr_support = 0;
ASSERTDRV(cuDeviceGetAttribute(&gdr_support, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev));
if (!gdr_support)
print_dbg("This GPU does not support GPUDirect RDMA.\n");
return !!gdr_support;
}
#endif
// For older versions, we fall back to detecting this support with gdr_pin_buffer.
const size_t size = GPU_PAGE_SIZE;
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(gpu_mem_alloc(&mhandle, size, true, true));
d_A = mhandle.ptr;
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
int status = gdr_pin_buffer(g, d_A, size, 0, 0, &mh);
if (status != 0) {
print_dbg("error in gdr_pin_buffer with code=%d\n", status);
print_dbg("Your GPU might not support GPUDirect RDMA\n");
}
else
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
ASSERT_EQ(gdr_close(g), 0);
ASSERTDRV(gpu_mem_free(&mhandle));
return status == 0;
}
void print_histogram(double *lat_arr, int count, int *bin_arr, int num_bins, double min, double max)
{
int den = (max - min) / num_bins;
den = den > 0 ? den : 1;
for (int j = 0; j < num_bins; j++)
bin_arr[j] = 0;
for (int i = 0; i < count; i++) {
int idx = (int) ((lat_arr[i] - min) / den);
// clamp the largest sample(s) into the last bin
if (idx >= num_bins)
idx = num_bins - 1;
bin_arr[idx]++;
}
for (int j = 0; j < num_bins; j++) {
// bin j covers [min + j*den, min + (j+1)*den)
printf("[%lf\t-\t%lf]\t%d\n", min + (double)j * den, min + (double)(j + 1) * den, bin_arr[j]);
}
}
}
}

162
gdrcopy/tests/common.hpp Normal file
View File

@ -0,0 +1,162 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stdarg.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <cuda.h>
#include <cstring>
#include <map>
#include <gdrapi.h>
#include <gdrconfig.h>
#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof((x)) *)&(x))
#endif
#ifndef READ_ONCE
#define READ_ONCE(x) ACCESS_ONCE(x)
#endif
#ifndef WRITE_ONCE
#define WRITE_ONCE(x, v) (ACCESS_ONCE(x) = (v))
#endif
/**
* Memory barrier
*/
#if defined(GDRAPI_X86)
#define MB() asm volatile("mfence":::"memory")
#define SB() asm volatile("sfence":::"memory")
#define LB() asm volatile("lfence":::"memory")
#elif defined(GDRAPI_POWER)
#define MB() asm volatile("sync":::"memory")
#define SB() MB()
#define LB() MB()
#elif defined(GDRAPI_ARM64)
#define MB() asm volatile("dmb sy":::"memory")
#define SB() asm volatile("dmb st":::"memory")
#define LB() MB()
#else
#error "Compiling on an unsupported architecture."
#endif
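/*
 * A minimal usage sketch for the macros above (illustration only; the pointers
 * are assumed to refer to a CPU-visible, e.g. gdr_map()'ed, buffer): publish a
 * payload word before a flag, so that a peer polling the flag never observes
 * it set before the payload is written.
 */
static inline void example_publish_word(volatile unsigned int *payload,
                                        unsigned int value,
                                        volatile unsigned int *flag)
{
    WRITE_ONCE(*payload, value); // write the payload first
    SB();                        // order the payload store before the flag store
    WRITE_ONCE(*flag, 1);        // peer side: while (!READ_ONCE(*flag)) ; LB(); then read the payload
}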
/**
* Clock used for timing
*/
//#define MYCLOCK CLOCK_REALTIME
//#define MYCLOCK CLOCK_RAW_MONOTONIC
#define MYCLOCK CLOCK_MONOTONIC
#define EXIT_WAIVED 2
#define ASSERT(x) \
do \
{ \
if (!(x)) \
{ \
fprintf(stderr, "Assertion \"%s\" failed at %s:%d\n", #x, __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define ASSERTDRV(stmt) \
do \
{ \
CUresult result = (stmt); \
if (result != CUDA_SUCCESS) { \
const char *_err_name; \
cuGetErrorName(result, &_err_name); \
fprintf(stderr, "CUDA error: %s\n", _err_name); \
} \
ASSERT(CUDA_SUCCESS == result); \
} while (0)
#define ASSERT_EQ(P, V) ASSERT((P) == (V))
#define CHECK_EQ(P, V) ASSERT((P) == (V))
#define ASSERT_NEQ(P, V) ASSERT(!((P) == (V)))
#define BREAK_IF_NEQ(P, V) if((P) != (V)) break
#define BEGIN_CHECK do
#define END_CHECK while(0)
#define PAGE_ROUND_UP(x, n) (((x) + ((n) - 1)) & ~((n) - 1))
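// Note: PAGE_ROUND_UP assumes n is a power of two, e.g.
// PAGE_ROUND_UP(1, 65536) == 65536 and PAGE_ROUND_UP(65536, 65536) == 65536.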
namespace gdrcopy {
namespace test {
typedef struct gpuMemHandle
{
CUdeviceptr ptr; // aligned ptr if requested; otherwise, the same as unaligned_ptr.
union {
CUdeviceptr unaligned_ptr; // for tracking original ptr; may be unaligned.
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
CUmemGenericAllocationHandle handle;
#endif
};
size_t size;
size_t allocated_size;
} gpu_mem_handle_t;
typedef CUresult (*gpu_memalloc_fn_t)(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops);
typedef CUresult (*gpu_memfree_fn_t)(gpu_mem_handle_t *handle);
static inline gdr_t gdr_open_safe()
{
gdr_t g = gdr_open();
if (!g) {
fprintf(stderr, "gdr_open error: Is gdrdrv driver installed and loaded?\n");
exit(EXIT_FAILURE);
}
return g;
}
extern bool print_dbg_msg;
extern const char *testname;
void print_dbg(const char* fmt, ...);
CUresult gpu_mem_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops);
CUresult gpu_mem_free(gpu_mem_handle_t *handle);
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops);
CUresult gpu_vmm_free(gpu_mem_handle_t *handle);
static inline bool operator==(const gdr_mh_t &a, const gdr_mh_t &b) {
return a.h == b.h;
}
static const gdr_mh_t null_mh = {0};
int compare_buf(uint32_t *ref_buf, uint32_t *buf, size_t size);
void init_hbuf_walking_bit(uint32_t *h_buf, size_t size);
void init_hbuf_linear_ramp(uint32_t *h_buf, size_t size);
bool check_gdr_support(CUdevice dev);
void print_histogram(double *lat_arr, int count, int *bin_arr, int num_bins, double min, double max);
}
}

282
gdrcopy/tests/copybw.cpp Normal file
View File

@ -0,0 +1,282 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <getopt.h>
#include <memory.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <iomanip>
#include <cuda.h>
using namespace std;
#include "gdrapi.h"
#include "common.hpp"
using namespace gdrcopy::test;
// manually tuned...
int num_write_iters = 10000;
int num_read_iters = 100;
size_t _size = 128*1024;
size_t copy_size = 0;
size_t copy_offset = 0;
int dev_id = 0;
void print_usage(const char *path)
{
cout << "Usage: " << path << " [-h][-s <size>][-c <size>][-o <offset>][-d <gpu>][-w <iters>][-r <iters>][-a <fn>]" << endl;
cout << endl;
cout << "Options:" << endl;
cout << " -h Print this help text" << endl;
cout << " -s <size> Buffer allocation size (default: " << _size << ")" << endl;
cout << " -c <size> Copy size (default: " << copy_size << ")" << endl;
cout << " -o <offset> Copy offset (default: " << copy_offset << ")" << endl;
cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
cout << " -w <iters> Number of write iterations (default: " << num_write_iters << ")" << endl;
cout << " -r <iters> Number of read iterations (default: " << num_read_iters << ")" << endl;
cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
}
void run_test(CUdeviceptr d_A, size_t size)
{
uint32_t *init_buf = NULL;
ASSERTDRV(cuMemAllocHost((void **)&init_buf, size));
ASSERT_NEQ(init_buf, (void*)0);
init_hbuf_walking_bit(init_buf, size);
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
BEGIN_CHECK {
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
BREAK_IF_NEQ(gdr_pin_buffer(g, d_A, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
void *map_d_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, size), 0);
cout << "map_d_ptr: " << map_d_ptr << endl;
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
cout << "info.va: " << hex << info.va << dec << endl;
cout << "info.mapped_size: " << info.mapped_size << endl;
cout << "info.page_size: " << info.page_size << endl;
cout << "info.mapped: " << info.mapped << endl;
cout << "info.wc_mapping: " << info.wc_mapping << endl;
// remember that mappings start on a 64KB boundary, so let's
// calculate the offset from the head of the mapping to the
// beginning of the buffer
int off = info.va - d_A;
cout << "page offset: " << off << endl;
uint32_t *buf_ptr = (uint32_t *)((char *)map_d_ptr + off);
cout << "user-space pointer:" << buf_ptr << endl;
// copy to GPU benchmark
cout << "writing test, size=" << copy_size << " offset=" << copy_offset << " num_iters=" << num_write_iters << endl;
struct timespec beg, end;
clock_gettime(MYCLOCK, &beg);
for (int iter=0; iter<num_write_iters; ++iter)
gdr_copy_to_mapping(mh, buf_ptr + copy_offset/4, init_buf, copy_size);
clock_gettime(MYCLOCK, &end);
double woMBps;
{
double byte_count = (double) copy_size * num_write_iters;
double dt_ms = (end.tv_nsec-beg.tv_nsec)/1000000.0 + (end.tv_sec-beg.tv_sec)*1000.0;
double Bps = byte_count / dt_ms * 1e3;
woMBps = Bps / 1024.0 / 1024.0;
cout << "write BW: " << woMBps << "MB/s" << endl;
}
compare_buf(init_buf, buf_ptr + copy_offset/4, copy_size);
// copy from GPU benchmark
cout << "reading test, size=" << copy_size << " offset=" << copy_offset << " num_iters=" << num_read_iters << endl;
clock_gettime(MYCLOCK, &beg);
for (int iter=0; iter<num_read_iters; ++iter)
gdr_copy_from_mapping(mh, init_buf, buf_ptr + copy_offset/4, copy_size);
clock_gettime(MYCLOCK, &end);
double roMBps;
{
double byte_count = (double) copy_size * num_read_iters;
double dt_ms = (end.tv_nsec-beg.tv_nsec)/1000000.0 + (end.tv_sec-beg.tv_sec)*1000.0;
double Bps = byte_count / dt_ms * 1e3;
roMBps = Bps / 1024.0 / 1024.0;
cout << "read BW: " << roMBps << "MB/s" << endl;
}
cout << "unmapping buffer" << endl;
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, size), 0);
cout << "unpinning buffer" << endl;
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
} END_CHECK;
cout << "closing gdrdrv" << endl;
ASSERT_EQ(gdr_close(g), 0);
}
int main(int argc, char *argv[])
{
gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
gpu_memfree_fn_t gfree_fn = gpu_mem_free;
while(1) {
int c;
c = getopt(argc, argv, "s:d:o:c:w:r:a:h");
if (c == -1)
break;
switch (c) {
case 's':
_size = strtol(optarg, NULL, 0);
break;
case 'c':
copy_size = strtol(optarg, NULL, 0);
break;
case 'o':
copy_offset = strtol(optarg, NULL, 0);
break;
case 'd':
dev_id = strtol(optarg, NULL, 0);
break;
case 'w':
num_write_iters = strtol(optarg, NULL, 0);
break;
case 'r':
num_read_iters = strtol(optarg, NULL, 0);
break;
case 'a':
if (strcmp(optarg, "cuMemAlloc") == 0) {
galloc_fn = gpu_mem_alloc;
gfree_fn = gpu_mem_free;
}
else if (strcmp(optarg, "cuMemCreate") == 0) {
galloc_fn = gpu_vmm_alloc;
gfree_fn = gpu_vmm_free;
}
else {
cerr << "Unrecognized fn argument" << endl;
exit(EXIT_FAILURE);
}
break;
case 'h':
print_usage(argv[0]);
exit(EXIT_SUCCESS);
default:
fprintf(stderr, "ERROR: invalid option\n");
exit(EXIT_FAILURE);
}
}
if (!copy_size)
copy_size = _size;
if (copy_offset % sizeof(uint32_t) != 0) {
fprintf(stderr, "ERROR: offset must be multiple of 4 bytes\n");
exit(EXIT_FAILURE);
}
if (copy_offset + copy_size > _size) {
fprintf(stderr, "ERROR: offset + copy size run past the end of the buffer\n");
exit(EXIT_FAILURE);
}
size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
ASSERTDRV(cuInit(0));
int n_devices = 0;
ASSERTDRV(cuDeviceGetCount(&n_devices));
CUdevice dev;
for (int n=0; n<n_devices; ++n) {
char dev_name[256];
int dev_pci_domain_id;
int dev_pci_bus_id;
int dev_pci_device_id;
ASSERTDRV(cuDeviceGet(&dev, n));
ASSERTDRV(cuDeviceGetName(dev_name, sizeof(dev_name) / sizeof(dev_name[0]), dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev));
cout << "GPU id:" << n << "; name: " << dev_name
<< "; Bus id: "
<< std::hex
<< std::setfill('0') << std::setw(4) << dev_pci_domain_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_bus_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_device_id
<< std::dec
<< endl;
}
cout << "selecting device " << dev_id << endl;
ASSERTDRV(cuDeviceGet(&dev, dev_id));
CUcontext dev_ctx;
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
cout << "testing size: " << _size << endl;
cout << "rounded size: " << size << endl;
ASSERT_EQ(check_gdr_support(dev), true);
if (galloc_fn == gpu_mem_alloc)
cout << "gpu alloc fn: cuMemAlloc" << endl;
else
cout << "gpu alloc fn: cuMemCreate" << endl;
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
cout << "device ptr: " << hex << d_A << dec << endl;
run_test(d_A, size);
ASSERTDRV(gfree_fn(&mhandle));
ASSERTDRV(cuDevicePrimaryCtxRelease(dev));
return 0;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

307
gdrcopy/tests/copylat.cpp Normal file
View File

@ -0,0 +1,307 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <getopt.h>
#include <memory.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <iomanip>
#include <cuda.h>
using namespace std;
#include "gdrapi.h"
#include "common.hpp"
using namespace gdrcopy::test;
// manually tuned...
int num_write_iters = 10000;
int num_read_iters = 100;
int dev_id = 0;
bool do_cumemcpy = false;
size_t _size = (size_t)1 << 24;
void print_usage(const char *path)
{
cout << "Usage: " << path << " [-h][-c][-s <size>][-d <gpu>][-w <iters>][-r <iters>][-a <fn>]" << endl;
cout << endl;
cout << "Options:" << endl;
cout << " -h Print this help text" << endl;
cout << " -c Also run cuMemcpy (default: no)" << endl;
cout << " -s <size> Buffer allocation size (default: " << _size << ")" << endl;
cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
cout << " -w <iters> Number of write iterations (default: " << num_write_iters << ")" << endl;
cout << " -r <iters> Number of read iterations (default: " << num_read_iters << ")" << endl;
cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
}
int main(int argc, char *argv[])
{
size_t copy_size = 1;
struct timespec beg, end;
double lat_us;
gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
gpu_memfree_fn_t gfree_fn = gpu_mem_free;
while(1) {
int c;
c = getopt(argc, argv, "s:d:w:r:a:hc");
if (c == -1)
break;
switch (c) {
case 's':
_size = strtol(optarg, NULL, 0);
break;
case 'd':
dev_id = strtol(optarg, NULL, 0);
break;
case 'w':
num_write_iters = strtol(optarg, NULL, 0);
break;
case 'r':
num_read_iters = strtol(optarg, NULL, 0);
break;
case 'a':
if (strcmp(optarg, "cuMemAlloc") == 0) {
galloc_fn = gpu_mem_alloc;
gfree_fn = gpu_mem_free;
}
else if (strcmp(optarg, "cuMemCreate") == 0) {
galloc_fn = gpu_vmm_alloc;
gfree_fn = gpu_vmm_free;
}
else {
cerr << "Unrecognized fn argument" << endl;
exit(EXIT_FAILURE);
}
break;
case 'c':
do_cumemcpy = true;
break;
case 'h':
print_usage(argv[0]);
exit(EXIT_SUCCESS);
default:
printf("ERROR: invalid option\n");
exit(EXIT_FAILURE);
}
}
size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
ASSERTDRV(cuInit(0));
int n_devices = 0;
ASSERTDRV(cuDeviceGetCount(&n_devices));
CUdevice dev;
for (int n=0; n<n_devices; ++n) {
char dev_name[256];
int dev_pci_domain_id;
int dev_pci_bus_id;
int dev_pci_device_id;
ASSERTDRV(cuDeviceGet(&dev, n));
ASSERTDRV(cuDeviceGetName(dev_name, sizeof(dev_name) / sizeof(dev_name[0]), dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev));
cout << "GPU id:" << n << "; name: " << dev_name
<< "; Bus id: "
<< std::hex
<< std::setfill('0') << std::setw(4) << dev_pci_domain_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_bus_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_device_id
<< std::dec
<< endl;
}
cout << "selecting device " << dev_id << endl;
ASSERTDRV(cuDeviceGet(&dev, dev_id));
CUcontext dev_ctx;
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
ASSERT_EQ(check_gdr_support(dev), true);
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
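    // Allocate the device buffer through the selected allocator; the returned
    // handle carries the device pointer plus whatever state gfree_fn needs to
    // release it at the end.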
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
cout << "device ptr: 0x" << hex << d_A << dec << endl;
cout << "allocated size: " << size << endl;
if (galloc_fn == gpu_mem_alloc)
cout << "gpu alloc fn: cuMemAlloc" << endl;
else
cout << "gpu alloc fn: cuMemCreate" << endl;
uint32_t *init_buf = NULL;
uint32_t *h_buf = NULL;
ASSERTDRV(cuMemAllocHost((void **)&init_buf, size));
ASSERT_NEQ(init_buf, (void*)0);
ASSERTDRV(cuMemAllocHost((void **)&h_buf, size));
ASSERT_NEQ(h_buf, (void*)0);
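    // Fill the source host buffer with a walking-bit test pattern (test helper).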
init_hbuf_walking_bit(init_buf, size);
if (do_cumemcpy) {
cout << endl;
cout << "cuMemcpy_H2D num iters for each size: " << num_write_iters << endl;
printf("Test \t\t Size(B) \t Avg.Time(us)\n");
BEGIN_CHECK {
// cuMemcpy H2D benchmark
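            // Methodology (same for every loop below): for each power-of-two
            // transfer size up to the allocated size, issue the copies
            // back-to-back and report elapsed_time / iterations as the average
            // per-call latency in microseconds.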
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_write_iters; ++iter) {
ASSERTDRV(cuMemcpy(d_A, (CUdeviceptr)init_buf, copy_size));
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("cuMemcpy_H2D \t %8zu \t %11.4f\n", copy_size, lat_us);
copy_size <<= 1;
}
} END_CHECK;
cout << endl;
cout << "cuMemcpy_D2H num iters for each size: " << num_read_iters << endl;
printf("Test \t\t Size(B) \t Avg.Time(us)\n");
BEGIN_CHECK {
// cuMemcpy D2H benchmark
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_read_iters; ++iter) {
ASSERTDRV(cuMemcpy((CUdeviceptr)h_buf, d_A, copy_size));
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("cuMemcpy_D2H \t %8zu \t %11.4f\n", copy_size, lat_us);
copy_size <<= 1;
}
} END_CHECK;
cout << endl;
}
cout << endl;
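    // Open the gdrdrv device (gdr_open_safe aborts if it cannot), then pin the
    // GPU buffer and map it into this process's address space for the
    // gdr_copy_* benchmarks below.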
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
BEGIN_CHECK {
// tokens are optional in CUDA 6.0
ASSERT_EQ(gdr_pin_buffer(g, d_A, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
void *map_d_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, size), 0);
cout << "map_d_ptr: " << map_d_ptr << endl;
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
cout << "info.va: " << hex << info.va << dec << endl;
cout << "info.mapped_size: " << info.mapped_size << endl;
cout << "info.page_size: " << info.page_size << endl;
cout << "info.mapped: " << info.mapped << endl;
cout << "info.wc_mapping: " << info.wc_mapping << endl;
// remember that mappings start on a 64KB boundary, so let's
// calculate the offset from the head of the mapping to the
// beginning of the buffer
        int off = d_A - info.va;
cout << "page offset: " << off << endl;
uint32_t *buf_ptr = (uint32_t *)((char *)map_d_ptr + off);
cout << "user-space pointer: " << buf_ptr << endl;
// gdr_copy_to_mapping benchmark
cout << endl;
cout << "gdr_copy_to_mapping num iters for each size: " << num_write_iters << endl;
cout << "WARNING: Measuring the API invocation overhead as observed by the CPU. Data might not be ordered all the way to the GPU internal visibility." << endl;
// For more information, see
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#sync-behavior
printf("Test \t\t\t Size(B) \t Avg.Time(us)\n");
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_write_iters; ++iter) {
gdr_copy_to_mapping(mh, buf_ptr, init_buf, copy_size);
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("gdr_copy_to_mapping \t %8zu \t %11.4f\n", copy_size, lat_us);
copy_size <<= 1;
}
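        // Memory barrier: keep the timed stores to the write-combined mapping
        // from being deferred past this point before the read-back benchmark runs.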
MB();
// gdr_copy_from_mapping benchmark
cout << endl;
cout << "gdr_copy_from_mapping num iters for each size: " << num_read_iters << endl;
printf("Test \t\t\t Size(B) \t Avg.Time(us)\n");
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_read_iters; ++iter)
gdr_copy_from_mapping(mh, h_buf, buf_ptr, copy_size);
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("gdr_copy_from_mapping \t %8zu \t %11.4f\n", copy_size, lat_us);
copy_size <<= 1;
}
cout << "unmapping buffer" << endl;
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, size), 0);
cout << "unpinning buffer" << endl;
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
} END_CHECK;
cout << "closing gdrdrv" << endl;
ASSERT_EQ(gdr_close(g), 0);
ASSERTDRV(gfree_fn(&mhandle));
return 0;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/
