first commit

hailin 2025-09-15 10:32:17 +08:00
commit cc76bab27e
3854 changed files with 740345 additions and 0 deletions

.gitignore (new file)

@@ -0,0 +1,240 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
# Tokenizer cache for tests
.tokenizer_cache/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# MacOS
.DS_Store
# Vim
*.swp
# Documentation
docs/_build
# SGL
benchmark/mmlu/data
benchmark/mmlu/data.tar
benchmark/llava_bench/images
benchmark/llava_bench/mme_pack
*.jsonl
tmp*.txt
# Plots
*.png
*.pdf
# personal
work_dirs/
*.csv
!logo.png
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
compile_commands.json
*.iml
# VSCode
.vscode
1
# Autoenv
.env.leave
# Rust lib
Cargo.lock
lmms-eval

DeepEP/.gitignore (new file)

@@ -0,0 +1,8 @@
compile_commands.json
.idea
.DS_Store
*.pyc
build/
.cache/
.vscode/
*/cmake-build-*/

DeepEP/LICENSE (new file)

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 DeepSeek
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

DeepEP/README.md (new file)

@@ -0,0 +1,344 @@
# DeepEP
DeepEP is a communication library tailored for Mixture-of-Experts (MoE) and expert parallelism (EP). It provides high-throughput and low-latency all-to-all GPU kernels, which are also known as MoE dispatch and combine. The library also supports low-precision operations, including FP8.
To align with the group-limited gating algorithm proposed in the [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3) paper, DeepEP offers a set of kernels optimized for asymmetric-domain bandwidth forwarding, such as forwarding data from NVLink domain to RDMA domain. These kernels deliver high throughput, making them suitable for both training and inference prefilling tasks. Additionally, they support SM (Streaming Multiprocessors) number control.
For latency-sensitive inference decoding, DeepEP includes a set of low-latency kernels with pure RDMA to minimize delays. The library also introduces a hook-based communication-computation overlapping method that does not occupy any SM resource.
Notice: the implementation in this library may have some slight differences from the [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3) paper.
## Performance
### Normal kernels with NVLink and RDMA forwarding
We test the normal kernels on H800 GPUs (~160 GB/s maximum NVLink bandwidth), each connected to a CX7 InfiniBand 400 Gb/s RDMA network card (~50 GB/s maximum bandwidth), following the DeepSeek-V3/R1 pretraining setting (4096 tokens per batch, 7168 hidden size, top-4 groups, top-8 experts, FP8 dispatching and BF16 combining).
| Type | Dispatch #EP | Bottleneck bandwidth | Combine #EP | Bottleneck bandwidth |
|:---------:|:------------:|:--------------------:|:-----------:|:--------------------:|
| Intranode | 8 | 153 GB/s (NVLink) | 8 | 158 GB/s (NVLink) |
| Internode | 16 | 43 GB/s (RDMA) | 16 | 43 GB/s (RDMA) |
| Internode | 32 | 58 GB/s (RDMA) | 32 | 57 GB/s (RDMA) |
| Internode | 64 | 51 GB/s (RDMA) | 64 | 50 GB/s (RDMA) |
**News (2025.04.22)**: with optimizations from Tencent Network Platform Department, performance was enhanced by up to 30%, see [#130](https://github.com/deepseek-ai/DeepEP/pull/130) for more details. Thanks for the contribution!
### Low-latency kernels with pure RDMA
We test the low-latency kernels on H800 GPUs, each connected to a CX7 InfiniBand 400 Gb/s RDMA network card (~50 GB/s maximum bandwidth), following a typical DeepSeek-V3/R1 production setting (128 tokens per batch, 7168 hidden size, top-8 experts, FP8 dispatching and BF16 combining).
| Dispatch #EP | Latency | RDMA bandwidth | Combine #EP | Latency | RDMA bandwidth |
|:------------:|:-------:|:--------------:|:-----------:|:-------:|:--------------:|
| 8 | 77 us | 98 GB/s | 8 | 114 us | 127 GB/s |
| 16 | 118 us | 63 GB/s | 16 | 195 us | 74 GB/s |
| 32 | 155 us | 48 GB/s | 32 | 273 us | 53 GB/s |
| 64 | 173 us | 43 GB/s | 64 | 314 us | 46 GB/s |
| 128 | 192 us | 39 GB/s | 128 | 369 us | 39 GB/s |
| 256 | 194 us | 39 GB/s | 256 | 360 us | 40 GB/s |
**News (2025.06.05)**: low-latency kernels now leverage NVLink as much as possible, see [#173](https://github.com/deepseek-ai/DeepEP/pull/173) for more details. Thanks for the contribution!
## Quick start
### Requirements
- Ampere (SM80), Hopper (SM90) GPUs, or other architectures with SM90 PTX ISA support
- Python 3.8 and above
- CUDA version
- CUDA 11.0 and above for SM80 GPUs
- CUDA 12.3 and above for SM90 GPUs
- PyTorch 2.1 and above
- NVLink for intranode communication
- RDMA network for internode communication
### Download and install NVSHMEM dependency
DeepEP also depends on our modified NVSHMEM. Please refer to our [NVSHMEM Installation Guide](third-party/README.md) for instructions.
### Development
```bash
# Build and make symbolic links for SO files
NVSHMEM_DIR=/path/to/installed/nvshmem python setup.py build
# You may modify the specific SO names according to your own platform
ln -s build/lib.linux-x86_64-cpython-38/deep_ep_cpp.cpython-38-x86_64-linux-gnu.so
# Run test cases
# NOTES: you may modify the `init_dist` function in `tests/utils.py`
# according to your own cluster settings, and launch into multiple nodes
python tests/test_intranode.py
python tests/test_internode.py
python tests/test_low_latency.py
```
### Installation
```bash
NVSHMEM_DIR=/path/to/installed/nvshmem python setup.py install
```
#### Installation environment variables
- `NVSHMEM_DIR`: the path to the installed NVSHMEM directory; all internode and low-latency features are disabled if it is not specified
- `DISABLE_SM90_FEATURES`: 0 or 1, whether to disable SM90 features; setting it to 1 is required for devices without SM90 support or for CUDA 11
- `TORCH_CUDA_ARCH_LIST`: the list of target architectures, e.g. `TORCH_CUDA_ARCH_LIST="9.0"`
- `DISABLE_AGGRESSIVE_PTX_INSTRS`: 0 or 1, whether to disable aggressive load/store instructions, see [Undefined-behavior PTX usage](#undefined-behavior-ptx-usage) for more details
Then, import `deep_ep` in your Python project, and enjoy!
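As a quick sanity check after installation, the following should run without errors on a machine with a CUDA-enabled PyTorch build (a minimal, illustrative snippet, not part of DeepEP's test suite):
```python
# Minimal post-install smoke test (illustrative only)
import torch
import deep_ep

assert torch.cuda.is_available(), "DeepEP requires a CUDA-enabled PyTorch build"
print(deep_ep.Buffer)  # the communication buffer class used in the examples below
```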
## Network configurations
DeepEP is fully tested with InfiniBand networks. However, it is theoretically compatible with RDMA over Converged Ethernet (RoCE) as well.
### Traffic isolation
Traffic isolation is supported by InfiniBand through Virtual Lanes (VL).
To prevent interference between different types of traffic, we recommend segregating workloads across different virtual lanes as follows:
- workloads using normal kernels
- workloads using low-latency kernels
- other workloads
For DeepEP, you can control the virtual lane assignment by setting the `NVSHMEM_IB_SL` environment variable.
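For example, a launcher could pin DeepEP's traffic to a dedicated virtual lane as sketched below (illustrative only; the value is a placeholder, and the variable is assumed to be read at NVSHMEM initialization, so set it before the DeepEP buffer is created):
```python
# Illustrative: select the InfiniBand service level (virtual lane) for this workload.
# The value 1 is a placeholder; use the VL reserved for your workload class.
import os
os.environ["NVSHMEM_IB_SL"] = "1"
```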
### Adaptive routing
Adaptive routing is an advanced routing feature provided by InfiniBand switches that can evenly distribute traffic across multiple paths. Enabling adaptive routing can completely eliminate network congestion caused by routing conflicts, but it also introduces additional latency. We recommend the following configuration for optimal performance:
- enable adaptive routing in environments with heavy network loads
- use static routing in environments with light network loads
### Congestion control
Congestion control is disabled as we have not observed significant congestion in our production environment.
## Interfaces and examples
### Example use in model training or inference prefilling
The normal kernels can be used in model training or the inference prefilling phase (without the backward part), as the example code below shows.
```python
import torch
import torch.distributed as dist
from typing import List, Tuple, Optional, Union
from deep_ep import Buffer, EventOverlap
# Communication buffer (will allocate at runtime)
_buffer: Optional[Buffer] = None
# Set the number of SMs to use
# NOTES: this is a static variable
Buffer.set_num_sms(24)
# You may call this function at the framework initialization
def get_buffer(group: dist.ProcessGroup, hidden_bytes: int) -> Buffer:
    global _buffer

    # NOTES: you may also replace `get_*_config` with your auto-tuned results via all the tests
    num_nvl_bytes, num_rdma_bytes = 0, 0
    for config in (Buffer.get_dispatch_config(group.size()), Buffer.get_combine_config(group.size())):
        num_nvl_bytes = max(config.get_nvl_buffer_size_hint(hidden_bytes, group.size()), num_nvl_bytes)
        num_rdma_bytes = max(config.get_rdma_buffer_size_hint(hidden_bytes, group.size()), num_rdma_bytes)

    # Allocate a buffer if none exists, or if the existing one is too small
    if _buffer is None or _buffer.group != group or _buffer.num_nvl_bytes < num_nvl_bytes or _buffer.num_rdma_bytes < num_rdma_bytes:
        _buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes)
    return _buffer


def get_hidden_bytes(x: torch.Tensor) -> int:
    t = x[0] if isinstance(x, tuple) else x
    return t.size(1) * max(t.element_size(), 2)


def dispatch_forward(x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
                     topk_idx: torch.Tensor, topk_weights: torch.Tensor,
                     num_experts: int, previous_event: Optional[EventOverlap] = None) -> \
        Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], torch.Tensor, torch.Tensor, List, Tuple, EventOverlap]:
    # NOTES: an optional `previous_event` is a captured CUDA event that you want the dispatch kernel
    # to depend on; this can be useful for communication-computation overlap. For more information,
    # please refer to the docs of `Buffer.dispatch`
    global _buffer

    # Calculate layout before actual dispatch
    num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, previous_event = \
        _buffer.get_dispatch_layout(topk_idx, num_experts,
                                    previous_event=previous_event, async_finish=True,
                                    allocate_on_comm_stream=previous_event is not None)

    # Do MoE dispatch
    # NOTES: the CPU will wait for the GPU's signal to arrive, so this is not compatible with CUDA graphs
    # unless you specify `num_worst_tokens` (this flag is for intranode only)
    # For more advanced usages, please refer to the docs of the `dispatch` function
    recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = \
        _buffer.dispatch(x, topk_idx=topk_idx, topk_weights=topk_weights,
                         num_tokens_per_rank=num_tokens_per_rank, num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
                         is_token_in_rank=is_token_in_rank, num_tokens_per_expert=num_tokens_per_expert,
                         previous_event=previous_event, async_finish=True,
                         allocate_on_comm_stream=True)

    # For event management, please refer to the docs of the `EventOverlap` class
    return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event


def dispatch_backward(grad_recv_x: torch.Tensor, grad_recv_topk_weights: torch.Tensor, handle: Tuple) -> \
        Tuple[torch.Tensor, torch.Tensor, EventOverlap]:
    global _buffer

    # The backward process of MoE dispatch is actually a combine
    # For more advanced usages, please refer to the docs of the `combine` function
    combined_grad_x, combined_grad_recv_topk_weights, event = \
        _buffer.combine(grad_recv_x, handle, topk_weights=grad_recv_topk_weights, async_finish=True)

    # For event management, please refer to the docs of the `EventOverlap` class
    return combined_grad_x, combined_grad_recv_topk_weights, event


def combine_forward(x: torch.Tensor, handle: Tuple, previous_event: Optional[EventOverlap] = None) -> \
        Tuple[torch.Tensor, EventOverlap]:
    global _buffer

    # Do MoE combine
    # For more advanced usages, please refer to the docs of the `combine` function
    combined_x, _, event = _buffer.combine(x, handle, async_finish=True, previous_event=previous_event,
                                           allocate_on_comm_stream=previous_event is not None)

    # For event management, please refer to the docs of the `EventOverlap` class
    return combined_x, event


def combine_backward(grad_combined_x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
                     handle: Tuple, previous_event: Optional[EventOverlap] = None) -> \
        Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], EventOverlap]:
    global _buffer

    # The backward process of MoE combine is actually a dispatch
    # For more advanced usages, please refer to the docs of the `dispatch` function
    grad_x, _, _, _, _, event = _buffer.dispatch(grad_combined_x, handle=handle, async_finish=True,
                                                 previous_event=previous_event,
                                                 allocate_on_comm_stream=previous_event is not None)

    # For event management, please refer to the docs of the `EventOverlap` class
    return grad_x, event
```
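For orientation, here is a minimal sketch (not part of DeepEP) of wiring the helpers above into an MoE layer's forward pass. `gate` and `run_experts` are hypothetical stand-ins for your own router and grouped expert computation, and it assumes `EventOverlap.current_stream_wait()` as the synchronization point:
```python
# Hypothetical MoE forward pass built on the helpers above (illustration only)
def moe_forward(x: torch.Tensor, gate, run_experts, num_experts: int) -> torch.Tensor:
    topk_weights, topk_idx = gate(x)  # router output, e.g. [num_tokens, top_k]
    recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = \
        dispatch_forward(x, topk_idx, topk_weights, num_experts)
    event.current_stream_wait()       # or overlap other computation before waiting
    expert_out = run_experts(recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list)
    combined_x, combine_event = combine_forward(expert_out, handle)
    combine_event.current_stream_wait()
    return combined_x
```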
Moreover, inside the dispatch function, we may not know in advance how many tokens the current rank will receive, so an implicit CPU wait for the GPU's received-count signal is involved, as the following figure shows.
![normal](figures/normal.png)
### Example use in inference decoding
The low-latency kernels can be used in the inference decoding phase, as the example code below shows.
```python
import torch
import torch.distributed as dist
from typing import Tuple, Optional
from deep_ep import Buffer
# Communication buffer (will allocate at runtime)
# NOTES: there is no SM control API for the low-latency kernels
_buffer: Optional[Buffer] = None
# You may call this function at the framework initialization
def get_buffer(group: dist.ProcessGroup, num_max_dispatch_tokens_per_rank: int, hidden: int, num_experts: int) -> Buffer:
    # NOTES: the low-latency mode will consume much more space than the normal mode
    # So we recommend that `num_max_dispatch_tokens_per_rank` (the actual batch size in the decoding engine) should be less than 256
    global _buffer

    num_rdma_bytes = Buffer.get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank, hidden, group.size(), num_experts)

    # Allocate a buffer if none exists, or if the existing one is too small
    if _buffer is None or _buffer.group != group or not _buffer.low_latency_mode or _buffer.num_rdma_bytes < num_rdma_bytes:
        # NOTES: for the best performance, the QP number **must** be equal to the number of the local experts
        assert num_experts % group.size() == 0
        _buffer = Buffer(group, 0, num_rdma_bytes, low_latency_mode=True, num_qps_per_rank=num_experts // group.size())
    return _buffer


def low_latency_dispatch(hidden_states: torch.Tensor, topk_idx: torch.Tensor, num_max_dispatch_tokens_per_rank: int, num_experts: int):
    global _buffer

    # Do MoE dispatch, compatible with CUDA graph (but you may restore some buffer status once you replay)
    recv_hidden_states, recv_expert_count, handle, event, hook = \
        _buffer.low_latency_dispatch(hidden_states, topk_idx, num_max_dispatch_tokens_per_rank, num_experts,
                                     async_finish=False, return_recv_hook=True)

    # NOTES: the actual tensor is not received until you call `hook()`;
    # this is useful for double-batch overlapping, and it occupies **no SMs at all**
    # If you don't want to overlap, please set `return_recv_hook=False`
    # Later, you can use our GEMM library to do the computation with this specific format
    return recv_hidden_states, recv_expert_count, handle, event, hook


def low_latency_combine(hidden_states: torch.Tensor,
                        topk_idx: torch.Tensor, topk_weights: torch.Tensor, handle: Tuple):
    global _buffer

    # Do MoE combine, compatible with CUDA graph (but you may restore some buffer status once you replay)
    combined_hidden_states, event_overlap, hook = \
        _buffer.low_latency_combine(hidden_states, topk_idx, topk_weights, handle,
                                    async_finish=False, return_recv_hook=True)

    # NOTES: the same behavior as described in the dispatch kernel
    return combined_hidden_states, event_overlap, hook
```
For two-micro-batch overlapping, refer to the following figure. With our receiving-hook interface, the RDMA network traffic happens in the background without taking any GPU SMs away from the computation part. Note that the overlapped parts can be adjusted, i.e., the four stages of attention/dispatch/MoE/combine may not have exactly the same execution time, so you may tune the stage boundaries according to your workload.
![low-latency](figures/low-latency.png)
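As a rough sketch of the hook pattern (not DeepEP's reference scheduler): `attention` and `run_experts` below are hypothetical stand-ins, and the wrappers are the `low_latency_dispatch`/`low_latency_combine` helpers defined above with `return_recv_hook=True`:
```python
# Overlap micro-batch B's attention with micro-batch A's dispatch traffic (illustration only)
def decode_step_pair(x_a, topk_idx_a, topk_weights_a, x_b,
                     num_max_dispatch_tokens_per_rank, num_experts,
                     attention, run_experts):
    # Kick off A's dispatch; RDMA traffic proceeds in the background without occupying SMs
    recv_a, count_a, handle_a, _, recv_hook_a = low_latency_dispatch(
        x_a, topk_idx_a, num_max_dispatch_tokens_per_rank, num_experts)
    hidden_b = attention(x_b)   # overlapped computation for the other micro-batch
    recv_hook_a()               # only now do we require A's tokens to have arrived
    expert_out_a = run_experts(recv_a, count_a)
    combined_a, _, combine_hook_a = low_latency_combine(expert_out_a, topk_idx_a, topk_weights_a, handle_a)
    # ... launch B's dispatch here, and call `combine_hook_a()` before consuming `combined_a`
    return hidden_b, combined_a, combine_hook_a
```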
## Roadmap
- [x] AR support
- [x] Refactor low-latency mode AR code
- [x] A100 support (intranode only)
- [x] Support BF16 for the low-latency dispatch kernel
- [x] Support NVLink protocol for intranode low-latency kernels
- [ ] TMA copy instead of LD/ST
- [x] Intranode kernels
- [ ] Internode kernels
- [ ] Low-latency kernels
- [ ] SM-free kernels and refactors
- [ ] Fully remove undefined-behavior PTX instructions
## Notices
#### Easier potential overall design
The current DeepEP implementation uses queues for communication buffers, which saves memory but introduces complexity and potential deadlocks. If you're implementing your own version based on DeepEP, consider using fixed-size buffers allocated to maximum capacity for simplicity and better performance. For a detailed discussion of this alternative approach, see https://github.com/deepseek-ai/DeepEP/issues/39.
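To make the trade-off concrete, here is a back-of-the-envelope sizing of the maximum-capacity alternative; every number is a hypothetical placeholder, not a measured or recommended value:
```python
# Hypothetical worst-case sizing for a fixed receive buffer (illustration only)
num_experts = 1024              # total experts across the EP group
num_max_tokens_per_rank = 128   # worst-case tokens any peer may send per expert
bytes_per_token = 7168 * 2      # hidden size 7168 in BF16

fixed_recv_buffer_bytes = num_experts * num_max_tokens_per_rank * bytes_per_token
print(f"{fixed_recv_buffer_bytes / 2**30:.2f} GiB")  # 1.75 GiB: simpler and deadlock-free, but memory-hungry
```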
#### Undefined-behavior PTX usage
- For extreme performance, we discovered and use a PTX usage that is technically undefined behavior: the read-only PTX instruction `ld.global.nc.L1::no_allocate.L2::256B` to **read volatile data**. The PTX modifier `.nc` indicates that a non-coherent cache is used. Correctness is nevertheless verified by testing with `.L1::no_allocate` on Hopper architectures, and performance is much better. Our guess at the reason: the non-coherent cache is unified with L1, and the L1 modifier is not just a hint but a strong option, so correctness can be guaranteed because no dirty data stays in L1.
- Initially, because NVCC could not automatically unroll volatile read PTX, we tried using `__ldg` (i.e., `ld.nc`). Even compared to manually unrolled volatile reads, it was significantly faster (likely due to additional compiler optimizations). However, the results could be incorrect or dirty. After consulting the PTX documentation, we discovered that L1 and non-coherent cache are unified on Hopper architectures. We speculated that `.L1::no_allocate` might resolve the issue, leading to this discovery.
- If you find the kernels not working on some other platform, you may disable this by setting `DISABLE_AGGRESSIVE_PTX_INSTRS=1` when running `setup.py`, or file an issue.
#### Auto-tuning on your cluster
For better performance on your cluster, we recommend running all the tests and using the best auto-tuned configuration. The default configurations are optimized for DeepSeek's internal cluster.
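For instance, a tuned dispatch configuration could be constructed explicitly and passed instead of the default `get_dispatch_config` result (a sketch under assumptions: that the `Config` binding from `csrc/config.hpp` is exposed by the `deep_ep` package and that `Buffer.dispatch` accepts a `config` argument; all numbers are placeholders, not tuned values):
```python
# Hypothetical auto-tuned override; the five positional arguments mirror the constructor in
# csrc/config.hpp: (num_sms, nvl_chunked_send, nvl_chunked_recv, rdma_chunked_send, rdma_chunked_recv).
# That header asserts nvl_send < nvl_recv and rdma_send <= rdma_recv / 2, which these values satisfy.
from deep_ep import Config

tuned_dispatch_config = Config(24, 6, 256, 6, 128)
# e.g. _buffer.dispatch(..., config=tuned_dispatch_config)
```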
## License
This code repository is released under [the MIT License](LICENSE), except for code that references NVSHMEM (including `csrc/kernels/ibgda_device.cuh` and `third-party/nvshmem.patch`), which is subject to the [NVSHMEM SLA](https://docs.nvidia.com/nvshmem/api/sla.html).
## Community Forks
- [Infrawaves/DeepEP_ibrc_dual-ports_multiQP](https://github.com/Infrawaves/DeepEP_ibrc_dual-ports_multiQP) - Adds multi-QP solution and dual-port NIC support in IBRC transport
## Citation
If you use this codebase or otherwise find our work valuable, please cite:
```bibtex
@misc{deepep2025,
title={DeepEP: an efficient expert-parallel communication library},
author={Chenggang Zhao and Shangyan Zhou and Liyue Zhang and Chengqi Deng and Zhean Xu and Yuxuan Liu and Kuai Yu and Jiashi Li and Liang Zhao},
year={2025},
publisher = {GitHub},
howpublished = {\url{https://github.com/deepseek-ai/DeepEP}},
}
```

DeepEP/csrc/CMakeLists.txt (new file)

@@ -0,0 +1,36 @@
# NOTES: this CMake is only for debugging; for setup, please use Torch extension
cmake_minimum_required(VERSION 3.10)
project(deep_ep LANGUAGES CUDA CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC")
set(CUDA_SEPARABLE_COMPILATION ON)
list(APPEND CUDA_NVCC_FLAGS "-DENABLE_FAST_DEBUG")
list(APPEND CUDA_NVCC_FLAGS "-O3")
list(APPEND CUDA_NVCC_FLAGS "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage")
set(USE_SYSTEM_NVTX on)
set(CUDA_ARCH_LIST "9.0" CACHE STRING "List of CUDA architectures to compile")
set(TORCH_CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
find_package(CUDAToolkit REQUIRED)
find_package(pybind11 REQUIRED)
find_package(Torch REQUIRED)
find_package(NVSHMEM REQUIRED HINTS ${NVSHMEM_ROOT_DIR}/lib/cmake/nvshmem)
add_library(nvshmem ALIAS nvshmem::nvshmem)
add_library(nvshmem_host ALIAS nvshmem::nvshmem_host)
add_library(nvshmem_device ALIAS nvshmem::nvshmem_device)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS} ${NVSHMEM_INCLUDE_DIR})
link_directories(${TORCH_INSTALL_PREFIX}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib ${NVSHMEM_LIB_DIR})
add_subdirectory(kernels)
# Link CPP and CUDA together
pybind11_add_module(deep_ep_cpp deep_ep.cpp)
target_link_libraries(deep_ep_cpp PRIVATE ${EP_CUDA_LIBRARIES} ${TORCH_LIBRARIES} torch_python)

DeepEP/csrc/config.hpp (new file)

@@ -0,0 +1,188 @@
#pragma once
#include "kernels/api.cuh"
#include "kernels/exception.cuh"
namespace deep_ep {
template <typename dtype_t>
dtype_t ceil_div(dtype_t a, dtype_t b) {
return (a + b - 1) / b;
}
template <typename dtype_t>
dtype_t align(dtype_t a, dtype_t b) {
return ceil_div<dtype_t>(a, b) * b;
}
struct Config {
int num_sms;
int num_max_nvl_chunked_send_tokens;
int num_max_nvl_chunked_recv_tokens;
int num_max_rdma_chunked_send_tokens;
int num_max_rdma_chunked_recv_tokens;
Config(int num_sms,
int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens) :
num_sms(num_sms),
num_max_nvl_chunked_send_tokens(num_max_nvl_chunked_send_tokens),
num_max_nvl_chunked_recv_tokens(num_max_nvl_chunked_recv_tokens),
num_max_rdma_chunked_send_tokens(num_max_rdma_chunked_send_tokens),
num_max_rdma_chunked_recv_tokens(num_max_rdma_chunked_recv_tokens) {
EP_HOST_ASSERT(num_sms >= 0);
EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens > 0 and num_max_nvl_chunked_recv_tokens > 0);
EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens < num_max_nvl_chunked_recv_tokens);
EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens > 0 and num_max_rdma_chunked_recv_tokens > 0);
// Ceil up RDMA buffer size
this->num_max_rdma_chunked_recv_tokens = align<int>(num_max_rdma_chunked_recv_tokens, num_max_rdma_chunked_send_tokens);
EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens < num_max_rdma_chunked_recv_tokens);
// NOTES: this assertion is related to RDMA lazy head update, we must ensure senders always have space to push
EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens <= num_max_rdma_chunked_recv_tokens / 2);
}
size_t get_nvl_buffer_size_hint(size_t hidden_bytes, int num_ranks) const {
// Below are some assumptions
// TODO: add assertions
constexpr int kNumMaxTopK = 128;
constexpr int kNumMaxScales = 128;
EP_HOST_ASSERT(num_ranks < NUM_MAX_NVL_PEERS or num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(num_ranks <= NUM_MAX_NVL_PEERS or num_sms % 2 == 0);
const auto num_rdma_ranks = std::max(num_ranks / NUM_MAX_NVL_PEERS, 1);
const auto num_nvl_ranks = std::min(num_ranks, NUM_MAX_NVL_PEERS);
const int num_channels = num_sms / 2;
size_t num_bytes = 0;
num_bytes += num_channels * num_nvl_ranks * (2 * num_rdma_ranks + 3) * sizeof(int);
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * hidden_bytes;
#ifndef DISABLE_NVSHMEM
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * internode::get_source_meta_bytes();
#endif
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK * sizeof(int64_t);
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK * sizeof(float);
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxScales * sizeof(float);
num_bytes = ((num_bytes + 127) / 128) * 128;
return num_bytes;
}
size_t get_rdma_buffer_size_hint(int64_t hidden_bytes, int num_ranks) const {
#ifndef DISABLE_NVSHMEM
// Legacy mode
if (num_ranks <= NUM_MAX_NVL_PEERS)
return 0;
// Below are some assumptions
// TODO: add assertions
constexpr int kNumMaxTopK = 128;
constexpr int kNumMaxScales = 128;
EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(num_sms % 2 == 0);
const int num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS;
const int num_channels = num_sms / 2;
size_t num_bytes = 0;
num_bytes += num_channels * num_rdma_ranks * (NUM_MAX_NVL_PEERS * 2 + 2) * 2 * sizeof(int);
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * hidden_bytes * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * internode::get_source_meta_bytes() * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * kNumMaxTopK * sizeof(int64_t) * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * kNumMaxTopK * sizeof(float) * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * kNumMaxScales * sizeof(float) * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * sizeof(int4) * 2;
num_bytes = ((num_bytes + 127) / 128) * 128;
return num_bytes;
#else
EP_HOST_ASSERT(false and "NVSHMEM is disabled during compilation");
#endif
}
};
struct LowLatencyBuffer {
int num_clean_int = 0;
void* dispatch_rdma_send_buffer = nullptr;
void* dispatch_rdma_recv_data_buffer = nullptr;
int* dispatch_rdma_recv_count_buffer = nullptr;
void* combine_rdma_send_buffer = nullptr;
void* combine_rdma_recv_data_buffer = nullptr;
int* combine_rdma_recv_flag_buffer = nullptr;
void* combine_rdma_send_buffer_data_start = nullptr;
size_t num_bytes_per_combine_msg = 0;
std::pair<int*, int> clean_meta() {
EP_HOST_ASSERT(dispatch_rdma_recv_count_buffer == combine_rdma_recv_flag_buffer);
return {dispatch_rdma_recv_count_buffer, num_clean_int};
}
};
struct LowLatencyLayout {
size_t total_bytes = 0;
LowLatencyBuffer buffers[2];
template <typename out_ptr_t = void*, typename count_ptr_t = uint8_t*, typename in_ptr_t = void*>
out_ptr_t advance(const in_ptr_t& ptr, size_t count) {
return reinterpret_cast<out_ptr_t>(reinterpret_cast<count_ptr_t>(ptr) + count);
}
LowLatencyLayout(void* rdma_buffer, int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
const int num_scales = hidden / 128;
// Dispatch and combine layout:
// - 2 symmetric odd/even send buffer
// - 2 symmetric odd/even receive buffers
// - 2 symmetric odd/even signaling buffers
// Message sizes
// NOTES: you should add a control `int4` for combine messages if you want to do data transformation
EP_HOST_ASSERT(num_scales * sizeof(float) <= hidden);
size_t num_bytes_per_dispatch_msg = sizeof(int4) + std::max(hidden * sizeof(nv_bfloat16), hidden + num_scales * sizeof(float));
size_t num_bytes_per_combine_msg = hidden * sizeof(nv_bfloat16);
// Send buffer
size_t dispatch_send_buffer_bytes = num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
size_t combine_send_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
size_t send_buffer_bytes = std::max(dispatch_send_buffer_bytes, combine_send_buffer_bytes);
EP_HOST_ASSERT(send_buffer_bytes % sizeof(int4) == 0);
total_bytes += send_buffer_bytes * 2;
// Symmetric receive buffers
// TODO: optimize memory usages
size_t dispatch_recv_data_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
size_t combine_recv_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
size_t recv_buffer_bytes = std::max(dispatch_recv_data_buffer_bytes, combine_recv_buffer_bytes);
EP_HOST_ASSERT(recv_buffer_bytes % sizeof(int4) == 0);
total_bytes += recv_buffer_bytes * 2;
// Symmetric signaling buffers
size_t dispatch_recv_count_buffer_bytes = num_experts * sizeof(int);
size_t combine_recv_flag_buffer_bytes = dispatch_recv_count_buffer_bytes;
size_t signaling_buffer_bytes = std::max(dispatch_recv_count_buffer_bytes, combine_recv_flag_buffer_bytes);
total_bytes += signaling_buffer_bytes * 2;
// Assign pointers
// NOTES: we still leave some space for distinguishing dispatch/combine buffer,
// so you may see some parameters are duplicated
for (int i = 0; i < 2; ++ i) {
buffers[i] = {
static_cast<int>(signaling_buffer_bytes / sizeof(int)),
advance(rdma_buffer, send_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * i),
advance<int*>(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * 2 + signaling_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * i),
advance<int*>(rdma_buffer, send_buffer_bytes * 2 + recv_buffer_bytes * 2 + signaling_buffer_bytes * i),
advance(rdma_buffer, send_buffer_bytes * i),
num_bytes_per_combine_msg
};
}
}
};
size_t get_low_latency_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
auto num_bytes = LowLatencyLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts).total_bytes;
return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) * NUM_BUFFER_ALIGNMENT_BYTES;
}
} // namespace deep_ep

DeepEP/csrc/deep_ep.cpp (new file, 1347 lines)

File diff suppressed because it is too large.

DeepEP/csrc/deep_ep.hpp (new file)

@@ -0,0 +1,157 @@
#pragma once
// Forcibly disable NDEBUG
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <torch/types.h>
#include <tuple>
#include <vector>
#include "config.hpp"
#include "event.hpp"
#include "kernels/configs.cuh"
#include "kernels/exception.cuh"
#ifndef TORCH_EXTENSION_NAME
#define TORCH_EXTENSION_NAME deep_ep_cpp
#endif
namespace deep_ep {
struct Buffer {
EP_STATIC_ASSERT(NUM_MAX_NVL_PEERS == 8, "The number of maximum NVLink peers must be 8");
private:
// Low-latency mode buffer
int low_latency_buffer_idx = 0;
bool low_latency_mode = false;
// NVLink Buffer
int64_t num_nvl_bytes;
void* buffer_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
void** buffer_ptrs_gpu = nullptr;
// NVSHMEM Buffer
int64_t num_rdma_bytes;
void* rdma_buffer_ptr = nullptr;
// Device info and communication
int device_id;
int num_device_sms;
int rank, rdma_rank, nvl_rank;
int num_ranks, num_rdma_ranks, num_nvl_ranks;
cudaIpcMemHandle_t ipc_handles[NUM_MAX_NVL_PEERS];
// Stream for communication
at::cuda::CUDAStream comm_stream;
// After IPC/NVSHMEM synchronization, this flag will be true
bool available = false;
// Barrier signals
int* barrier_signal_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
int** barrier_signal_ptrs_gpu = nullptr;
// Workspace
void* workspace = nullptr;
// Host-side MoE info
volatile int* moe_recv_counter = nullptr;
int* moe_recv_counter_mapped = nullptr;
// Host-side expert-level MoE info
volatile int* moe_recv_expert_counter = nullptr;
int* moe_recv_expert_counter_mapped = nullptr;
// Host-side RDMA-level MoE info
volatile int* moe_recv_rdma_counter = nullptr;
int* moe_recv_rdma_counter_mapped = nullptr;
public:
Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode);
~Buffer() noexcept(false);
bool is_available() const;
bool is_internode_available() const;
int get_num_rdma_ranks() const;
int get_rdma_rank() const;
int get_root_rdma_rank(bool global) const;
int get_local_device_id() const;
pybind11::bytearray get_local_ipc_handle() const;
pybind11::bytearray get_local_nvshmem_unique_id() const;
torch::Tensor get_local_buffer_tensor(const pybind11::object& dtype, int64_t offset, bool use_rdma_buffer) const;
torch::Stream get_comm_stream() const;
void sync(const std::vector<int>& device_ids, const std::vector<std::optional<pybind11::bytearray>>& all_gathered_handles, const std::optional<pybind11::bytearray>& root_unique_id_opt);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
get_dispatch_layout(const torch::Tensor& topk_idx, int num_experts, std::optional<EventHandle>& previous_event,
bool async, bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
intranode_dispatch(const torch::Tensor& x, const std::optional<torch::Tensor>& x_scales,
const std::optional<torch::Tensor>& topk_idx, const std::optional<torch::Tensor>& topk_weights,
const std::optional<torch::Tensor>& num_tokens_per_rank, const torch::Tensor& is_token_in_rank, const std::optional<torch::Tensor>& num_tokens_per_expert,
int cached_num_recv_tokens, const std::optional<torch::Tensor>& cached_rank_prefix_matrix, const std::optional<torch::Tensor>& cached_channel_prefix_matrix,
int expert_alignment, int num_worst_tokens, const Config& config,
std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
intranode_combine(const torch::Tensor& x, const std::optional<torch::Tensor>& topk_weights,
const std::optional<torch::Tensor>& bias_0, const std::optional<torch::Tensor>& bias_1,
const torch::Tensor& src_idx, const torch::Tensor& rank_prefix_matrix, const torch::Tensor& channel_prefix_matrix,
const torch::Tensor& send_head, const Config& config, std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<EventHandle>>
internode_dispatch(const torch::Tensor& x, const std::optional<torch::Tensor>& x_scales,
const std::optional<torch::Tensor>& topk_idx, const std::optional<torch::Tensor>& topk_weights,
const std::optional<torch::Tensor>& num_tokens_per_rank, const std::optional<torch::Tensor>& num_tokens_per_rdma_rank,
const torch::Tensor& is_token_in_rank, const std::optional<torch::Tensor>& num_tokens_per_expert,
int cached_num_recv_tokens, int cached_num_rdma_recv_tokens,
const std::optional<torch::Tensor>& cached_rdma_channel_prefix_matrix, const std::optional<torch::Tensor>& cached_recv_rdma_rank_prefix_sum,
const std::optional<torch::Tensor>& cached_gbl_channel_prefix_matrix, const std::optional<torch::Tensor>& cached_recv_gbl_rank_prefix_sum,
int expert_alignment, const Config& config, std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
internode_combine(const torch::Tensor& x, const std::optional<torch::Tensor>& topk_weights,
const std::optional<torch::Tensor>& bias_0, const std::optional<torch::Tensor>& bias_1,
const torch::Tensor& src_meta, const torch::Tensor& is_combined_token_in_rank,
const torch::Tensor& rdma_channel_prefix_matrix, const torch::Tensor& rdma_rank_prefix_sum, const torch::Tensor& gbl_channel_prefix_matrix,
const torch::Tensor& combined_rdma_head, const torch::Tensor& combined_nvl_head,
const Config& config, std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream);
void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx,
const std::optional<torch::Tensor>& cumulative_local_expert_recv_stats,
int num_max_dispatch_tokens_per_rank, int num_experts,
bool use_fp8, bool round_scale, bool use_ue8m0,
bool async, bool return_recv_hook);
std::tuple<torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
low_latency_combine(const torch::Tensor& x, const torch::Tensor& topk_idx, const torch::Tensor& topk_weights,
const torch::Tensor& src_info, const torch::Tensor& layout_range,
int num_max_dispatch_tokens_per_rank, int num_experts,
bool zero_copy, bool async, bool return_recv_hook,
const std::optional<torch::Tensor>& out = std::nullopt);
torch::Tensor
get_next_low_latency_combine_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts) const;
};
} // namespace deep_ep

DeepEP/csrc/event.hpp (new file)

@@ -0,0 +1,43 @@
#include <ATen/cuda/CUDAContext.h>
#include <memory>
#include "kernels/exception.cuh"
namespace deep_ep {
struct EventHandle {
std::shared_ptr<torch::Event> event;
EventHandle() {
event = std::make_shared<torch::Event>(torch::kCUDA);
event->record(at::cuda::getCurrentCUDAStream());
}
explicit EventHandle(const at::cuda::CUDAStream& stream) {
event = std::make_shared<torch::Event>(torch::kCUDA);
event->record(stream);
}
EventHandle(const EventHandle& other) = default;
void current_stream_wait() const {
at::cuda::getCurrentCUDAStream().unwrap().wait(*event);
}
};
torch::Event create_event(const at::cuda::CUDAStream &s) {
auto event = torch::Event(torch::kCUDA);
event.record(s);
return event;
}
void stream_wait(const at::cuda::CUDAStream& s_0, const at::cuda::CUDAStream& s_1) {
EP_HOST_ASSERT(s_0.id() != s_1.id());
s_0.unwrap().wait(create_event(s_1));
}
void stream_wait(const at::cuda::CUDAStream& s, const EventHandle& event) {
s.unwrap().wait(*event.event);
}
} // namespace deep_ep

DeepEP/csrc/kernels/CMakeLists.txt (new file)

@@ -0,0 +1,21 @@
function(add_deep_ep_library target_name source_file)
add_library(${target_name} STATIC ${source_file})
set_target_properties(${target_name} PROPERTIES
POSITION_INDEPENDENT_CODE ON
CXX_STANDARD_REQUIRED ON
CUDA_STANDARD_REQUIRED ON
CXX_STANDARD 17
CUDA_STANDARD 17
CUDA_SEPARABLE_COMPILATION ON
)
target_link_libraries(${target_name} PUBLIC nvshmem cudart cudadevrt mlx5)
endfunction()
add_deep_ep_library(runtime_cuda runtime.cu)
add_deep_ep_library(layout_cuda layout.cu)
add_deep_ep_library(intranode_cuda intranode.cu)
add_deep_ep_library(internode_cuda internode.cu)
add_deep_ep_library(internode_ll_cuda internode_ll.cu)
# Later, we should link all libraries in `EP_CUDA_LIBRARIES`
set(EP_CUDA_LIBRARIES runtime_cuda layout_cuda intranode_cuda internode_cuda internode_ll_cuda PARENT_SCOPE)

DeepEP/csrc/kernels/api.cuh (new file)

@@ -0,0 +1,167 @@
#pragma once
#include <vector>
namespace deep_ep {
// Intranode runtime
namespace intranode {
void barrier(int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream);
} // namespace intranode
// Internode runtime
namespace internode {
std::vector<uint8_t> get_unique_id();
int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks, bool low_latency_mode);
void *alloc(size_t size, size_t alignment);
void free(void *ptr);
void barrier();
void finalize();
} // namespace internode
// Layout kernels
namespace layout {
void get_dispatch_layout(const int64_t* topk_idx,
int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
int* num_tokens_per_expert, bool* is_token_in_rank,
int num_tokens, int num_topk, int num_ranks, int num_experts,
cudaStream_t stream);
} // namespace layout
// Intranode kernels
namespace intranode {
void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
int num_tokens, const bool* is_token_in_rank, int* channel_prefix_matrix,
int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank,
cudaStream_t stream, int num_sms);
void cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank, int num_ranks,
cudaStream_t stream);
void dispatch(void* recv_x, float* recv_x_scales, int* recv_src_idx, int64_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
int* send_head, const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
const bool* is_token_in_rank, const int* channel_prefix_matrix,
int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
int scale_token_stride, int scale_hidden_stride,
void** buffer_ptrs, int rank, int num_ranks,
cudaStream_t stream, int num_sms,
int num_max_send_tokens, int num_recv_buffer_tokens);
void cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels, int num_recv_tokens, int num_memset_int,
int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream);
void combine(cudaDataType_t type,
void* recv_x, float* recv_topk_weights,
const void* x, const float* topk_weights,
const void* bias_0, const void* bias_1,
const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
void** buffer_ptrs, int rank, int num_ranks,
cudaStream_t stream, int num_sms,
int num_max_send_tokens, int num_recv_buffer_tokens);
} // namespace intranode
// Internode kernels
namespace internode {
int get_source_meta_bytes();
void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
const int* num_tokens_per_rdma_rank, int* moe_recv_rdma_counter_mapped,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
const bool* is_token_in_rank, int num_tokens, int num_channels,
int hidden_int4, int num_scales, int num_topk, int expert_alignment,
int* rdma_channel_prefix_matrix, int* recv_rdma_rank_prefix_sum,
int* gbl_channel_prefix_matrix, int* recv_gbl_rank_prefix_sum,
void* rdma_buffer_ptr, int num_max_rdma_chunked_recv_tokens,
void** buffer_ptrs, int num_max_nvl_chunked_recv_tokens,
int** barrier_signal_ptrs, int rank,
cudaStream_t stream, int64_t num_rdma_bytes, int64_t num_nvl_bytes,
bool low_latency_mode);
void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv_topk_weights, void* recv_src_meta,
const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
int* send_rdma_head, int* send_nvl_head,
int* recv_rdma_channel_prefix_matrix, int* recv_gbl_channel_prefix_matrix,
const int* rdma_channel_prefix_matrix, const int* recv_rdma_rank_prefix_sum,
const int* gbl_channel_prefix_matrix, const int* recv_gbl_rank_prefix_sum,
const bool* is_token_in_rank,
int num_tokens, int hidden_int4, int num_scales, int num_topk, int num_experts,
int scale_token_stride, int scale_hidden_stride,
void* rdma_buffer_ptr, int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens,
void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
int rank, int num_ranks, bool is_cached_dispatch,
cudaStream_t stream, int num_channels, bool low_latency_mode);
void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_topk_weights,
int num_ranks, int num_channels, int num_combined_tokens, int* combined_rdma_head,
const int* rdma_channel_prefix_matrix, const int* rdma_rank_prefix_sum, int* combined_nvl_head,
void* rdma_buffer_ptr, int num_max_rdma_chunked_recv_tokens,
void** buffer_ptrs, int num_max_nvl_chunked_recv_tokens,
int** barrier_signal_ptrs, int rank, cudaStream_t stream,
int64_t num_rdma_bytes, int64_t num_nvl_bytes,
bool is_cached_dispatch, bool low_latency_mode);
void combine(cudaDataType_t type,
void* combined_x, float* combined_topk_weights,
const bool* is_combined_token_in_rank,
const void* x, const float* topk_weights,
const void* bias_0, const void* bias_1,
const int* combined_rdma_head, const int* combined_nvl_head,
const void* src_meta, const int* rdma_channel_prefix_matrix, const int* rdma_rank_prefix_sum, const int* gbl_channel_prefix_matrix,
int num_tokens, int num_combined_tokens, int hidden, int num_topk,
void* rdma_buffer_ptr, int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens,
void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
int rank, int num_ranks, cudaStream_t stream, int num_channels, bool low_latency_mode);
} // namespace internode
// Internode low-latency kernels
namespace internode_ll {
void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
int* clean_1, int num_clean_int_1,
cudaStream_t stream);
void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
int* packed_recv_src_info, int64_t* packed_recv_layout_range,
int* packed_recv_count,
int* cumulative_local_expert_recv_stats,
void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
const void* x, const int64_t* topk_idx,
int* next_clean, int num_next_clean_int,
int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
bool use_fp8, bool round_scale, bool use_ue8m0,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases);
void combine(void* combined_x,
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
const void* x, const int64_t* topk_idx, const float* topk_weights,
const int* src_info, const int64_t* layout_range,
int* next_clean, int num_next_clean_int,
int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases, bool zero_copy);
} // namespace internode_ll
} // namespace deep_ep

DeepEP/csrc/kernels/buffer.cuh (new file)

@@ -0,0 +1,138 @@
#pragma once
#include "configs.cuh"
#include "exception.cuh"
namespace deep_ep {
template <typename dtype_t>
struct Buffer {
private:
uint8_t* ptr;
public:
int total_bytes;
__device__ __forceinline__ Buffer() : ptr(nullptr), total_bytes(0) {}
__device__ __forceinline__ Buffer(void* &gbl_ptr, int num_elems, int offset = 0) {
total_bytes = num_elems * sizeof(dtype_t);
ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + offset * sizeof(dtype_t);
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
}
__device__ __forceinline__ Buffer advance_also(void* &gbl_ptr) {
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
return *this;
}
__device__ __forceinline__ dtype_t* buffer() {
return reinterpret_cast<dtype_t*>(ptr);
}
__device__ __forceinline__ dtype_t& operator[](int idx) {
return buffer()[idx];
}
};
template <typename dtype_t, int kNumRanks = 1>
struct AsymBuffer {
private:
uint8_t* ptrs[kNumRanks];
int num_bytes;
public:
int total_bytes;
__device__ __forceinline__ AsymBuffer(void* &gbl_ptr, int num_elems, int num_ranks,
int sm_id = 0, int num_sms = 1, int offset = 0) {
EP_STATIC_ASSERT(kNumRanks == 1, "");
num_bytes = num_elems * sizeof(dtype_t);
int per_channel_bytes = num_bytes * num_ranks;
total_bytes = per_channel_bytes * num_sms;
ptrs[0] = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id + num_bytes * offset;
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
}
__device__ __forceinline__ AsymBuffer(void** gbl_ptrs, int num_elems, int num_ranks,
int sm_id = 0, int num_sms = 1, int offset = 0) {
EP_STATIC_ASSERT(kNumRanks > 1, "");
num_bytes = num_elems * sizeof(dtype_t);
int per_channel_bytes = num_bytes * num_ranks;
total_bytes = per_channel_bytes * num_sms;
for (int i = 0; i < kNumRanks; ++ i) {
ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + per_channel_bytes * sm_id + num_bytes * offset;
gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
}
}
__device__ __forceinline__ void advance(int shift) {
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i)
ptrs[i] = ptrs[i] + shift * sizeof(dtype_t);
}
__device__ __forceinline__ AsymBuffer advance_also(void* &gbl_ptr) {
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
return *this;
}
template<int kNumAlsoRanks>
__device__ __forceinline__ AsymBuffer advance_also(void** gbl_ptrs) {
for (int i = 0; i < kNumAlsoRanks; ++ i)
gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
return *this;
}
__device__ __forceinline__ dtype_t* buffer(int idx = 0) {
EP_STATIC_ASSERT(kNumRanks == 1, "`buffer` is only available for single rank case");
return reinterpret_cast<dtype_t*>(ptrs[0] + num_bytes * idx);
}
__device__ __forceinline__ dtype_t* buffer_by(int rank_idx, int idx = 0) {
EP_STATIC_ASSERT(kNumRanks > 1, "`buffer_by` is only available for the multi-rank case");
return reinterpret_cast<dtype_t*>(ptrs[rank_idx] + num_bytes * idx);
}
};
template <typename dtype_t, bool kDecoupled = true>
struct SymBuffer {
private:
// NOTES: for non-decoupled case, `recv_ptr` is not used
uint8_t* send_ptr;
uint8_t* recv_ptr;
int num_bytes;
public:
int total_bytes;
__device__ __forceinline__ SymBuffer(void* &gbl_ptr, int num_elems, int num_ranks,
int sm_id = 0, int num_sms = 1) {
num_bytes = num_elems * sizeof(dtype_t);
int per_channel_bytes = num_bytes * num_ranks;
total_bytes = per_channel_bytes * num_sms * (static_cast<int>(kDecoupled) + 1);
send_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id;
recv_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * (sm_id + num_sms);
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
}
__device__ __forceinline__ dtype_t* send_buffer(int idx = 0) {
EP_STATIC_ASSERT(kDecoupled, "`send_buffer` is only available for the decoupled case");
return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
}
__device__ __forceinline__ dtype_t* recv_buffer(int idx = 0) {
EP_STATIC_ASSERT(kDecoupled, "`recv_buffer` is only available for the decoupled case");
return reinterpret_cast<dtype_t*>(recv_ptr + num_bytes * idx);
}
__device__ __forceinline__ dtype_t* buffer(int idx = 0) {
EP_STATIC_ASSERT(not kDecoupled, "`buffer` is only available for the non-decoupled case");
return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
}
};
} // namespace deep_ep

DeepEP/csrc/kernels/configs.cuh (new file)

@@ -0,0 +1,67 @@
#pragma once
#define NUM_MAX_NVL_PEERS 8
#define NUM_MAX_RDMA_PEERS 20
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)
#define NUM_MAX_LOCAL_EXPERTS 1024
#define NUM_BUFFER_ALIGNMENT_BYTES 128
#define FINISHED_SUM_TAG 1024
#define NUM_WAIT_NANOSECONDS 500
#ifndef ENABLE_FAST_DEBUG
#define NUM_CPU_TIMEOUT_SECS 100
#define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s
#else
#define NUM_CPU_TIMEOUT_SECS 10
#define NUM_TIMEOUT_CYCLES 20000000000ull // 20G cycles ~= 10s
#endif
#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2
// Make CLion CUDA indexing work
#ifdef __CLION_IDE__
#define __CUDA_ARCH__ 900 // NOLINT(*-reserved-identifier)
#define __CUDACC_RDC__ // NOLINT(*-reserved-identifier)
#endif
// Remove Torch restrictions
#ifdef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_CONVERSIONS__
#endif
#ifdef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF_OPERATORS__
#endif
#ifdef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#endif
#ifdef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#endif
#ifdef __CUDA_NO_BFLOAT162_OPERATORS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__
#endif
#include <cstdint>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#ifndef DISABLE_SM90_FEATURES
#include <cuda_fp8.h>
#else
// Ampere does not support FP8 features
#define __NV_E4M3 0
#define __NV_E5M2 1
typedef int __nv_fp8_interpretation_t;
typedef int __nv_fp8x4_e4m3;
typedef uint8_t __nv_fp8_storage_t;
#endif
#ifndef DISABLE_NVSHMEM
#include <nvshmem.h>
#include <nvshmemx.h>
#include <infiniband/mlx5dv.h>
#include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
#include <device_host_transport/nvshmem_common_ibgda.h>
#endif

DeepEP/csrc/kernels/exception.cuh (new file)

@@ -0,0 +1,51 @@
#pragma once
#include <string>
#include <exception>
#include "configs.cuh"
#ifndef EP_STATIC_ASSERT
#define EP_STATIC_ASSERT(cond, reason) static_assert(cond, reason)
#endif
class EPException: public std::exception {
private:
std::string message = {};
public:
explicit EPException(const char *name, const char* file, const int line, const std::string& error) {
message = std::string("Failed: ") + name + " error " + file + ":" + std::to_string(line) + " '" + error + "'";
}
const char *what() const noexcept override { return message.c_str(); }
};
#ifndef CUDA_CHECK
#define CUDA_CHECK(cmd) \
do { \
cudaError_t e = (cmd); \
if (e != cudaSuccess) { \
throw EPException("CUDA", __FILE__, __LINE__, cudaGetErrorString(e)); \
} \
} while (0)
#endif
#ifndef EP_HOST_ASSERT
#define EP_HOST_ASSERT(cond) \
do { \
if (not (cond)) { \
throw EPException("Assertion", __FILE__, __LINE__, #cond); \
} \
} while (0)
#endif
#ifndef EP_DEVICE_ASSERT
#define EP_DEVICE_ASSERT(cond) \
do { \
if (not (cond)) { \
printf("Assertion failed: %s:%d, condition: %s\n", __FILE__, __LINE__, #cond); \
asm("trap;"); \
} \
} while (0)
#endif

DeepEP/csrc/kernels/ibgda_device.cuh (new file)

@@ -0,0 +1,482 @@
// Portions derived from NVSHMEM (https://developer.nvidia.com/nvshmem)
// Copyright (c) NVIDIA Corporation.
// Licensed under the NVSHMEM Software License Agreement (version: September 3, 2019).
// See full license at: https://docs.nvidia.com/nvshmem/api/sla.html
//
// Modified from original source:
// - nvshmem/src/include/non_abi/device/pt-to-pt/ibgda_device.cuh
#pragma once
#include "configs.cuh"
#include "exception.cuh"
#include "utils.cuh"
namespace deep_ep {
EP_STATIC_ASSERT(NVSHMEMI_IBGDA_MIN_QP_DEPTH >= 64, "Invalid QP minimum depth");
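// NOTES: the byte-swap helpers below convert host (little-endian) values to big-endian with
// the PTX `prmt` (byte-permute) instruction, since MLX5 WQE fields are big-endian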
__device__ static __forceinline__
uint64_t HtoBE64(uint64_t x) {
uint64_t ret;
asm("{\n\t"
".reg .b32 ign;\n\t"
".reg .b32 lo;\n\t"
".reg .b32 hi;\n\t"
".reg .b32 new_lo;\n\t"
".reg .b32 new_hi;\n\t"
"mov.b64 {lo,hi}, %1;\n\t"
"prmt.b32 new_hi, lo, ign, 0x0123;\n\t"
"prmt.b32 new_lo, hi, ign, 0x0123;\n\t"
"mov.b64 %0, {new_lo,new_hi};\n\t"
"}" : "=l"(ret) : "l"(x));
return ret;
}
__device__ static __forceinline__
uint32_t HtoBE32(uint32_t x) {
uint32_t ret;
asm("{\n\t"
".reg .b32 ign;\n\t"
"prmt.b32 %0, %1, ign, 0x0123;\n\t"
"}" : "=r"(ret) : "r"(x));
return ret;
}
__device__ static __forceinline__
uint16_t HtoBE16(uint16_t x) {
// TODO: simplify PTX using 16-bit instructions
auto a = static_cast<uint32_t>(x);
uint32_t d;
asm volatile(
"{\n\t"
".reg .b32 mask;\n\t"
".reg .b32 ign;\n\t"
"mov.b32 mask, 0x4401;\n\t"
"mov.b32 ign, 0x0;\n\t"
"prmt.b32 %0, %1, ign, mask;\n\t"
"}"
: "=r"(d)
: "r"(a));
return static_cast<uint16_t>(d);
}
typedef struct mlx5_wqe_ctrl_seg __attribute__((__aligned__(8))) ibgda_ctrl_seg_t;
typedef struct {
uint32_t add_data;
uint32_t field_boundary;
uint64_t reserved;
} __attribute__((__packed__)) ibgda_atomic_32_masked_fa_seg_t;
__device__ static __forceinline__
nvshmemi_ibgda_device_state_t* ibgda_get_state() {
return &nvshmemi_ibgda_device_state_d;
}
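// NOTES: pick one of the peer's RC (reliable connection) QPs; `id` is wrapped round-robin over `num_rc_per_pe`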
__device__ static __forceinline__
nvshmemi_ibgda_device_qp_t* ibgda_get_rc(int pe, int id) {
auto state = ibgda_get_state();
const auto num_rc_per_pe = ibgda_get_state()->num_rc_per_pe;
return &state->globalmem.rcs[pe * num_rc_per_pe + id % num_rc_per_pe];
}
__device__ static __forceinline__
void ibgda_lock_acquire(int *lock) {
while (atomicCAS(lock, 0, 1) == 1);
// Prevent reordering before the lock is acquired
memory_fence_cta();
}
__device__ static __forceinline__
void ibgda_lock_release(int *lock) {
memory_fence_cta();
// Prevent reordering before lock is released
st_na_relaxed(lock, 0);
}
__device__ static __forceinline__
void ibgda_update_dbr(nvshmemi_ibgda_device_qp_t *qp, uint32_t dbrec_head) {
// `DBREC` contains the index of the next empty `WQEBB`
__be32 dbrec_val;
__be32 *dbrec_ptr = qp->tx_wq.dbrec;
// This is equivalent to `WRITE_ONCE(dbrec_ptr, HtoBE32(dbrec_head & 0xffff))`
asm("{\n\t"
".reg .b32 dbrec_head_16b;\n\t"
".reg .b32 ign;\n\t"
"and.b32 dbrec_head_16b, %1, 0xffff;\n\t"
"prmt.b32 %0, dbrec_head_16b, ign, 0x123;\n\t"
"}"
: "=r"(dbrec_val)
: "r"(dbrec_head));
st_na_release(dbrec_ptr, dbrec_val);
}
__device__ static __forceinline__
void ibgda_ring_db(nvshmemi_ibgda_device_qp_t *qp, uint16_t prod_idx) {
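// Ring the doorbell by writing the first 8 bytes of the control segment (`opmod_idx_opcode` + `qpn_ds`) to the BlueFlame register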
auto bf_ptr = reinterpret_cast<uint64_t*>(qp->tx_wq.bf);
ibgda_ctrl_seg_t ctrl_seg = {
.opmod_idx_opcode = HtoBE32(prod_idx << 8),
.qpn_ds = HtoBE32(qp->qpn << 8)
};
EP_STATIC_ASSERT(sizeof(decltype(&ctrl_seg)) == sizeof(uint64_t), "");
st_na_release(bf_ptr, *(reinterpret_cast<uint64_t*>(&ctrl_seg)));
}
__device__ static __forceinline__
void ibgda_post_send(nvshmemi_ibgda_device_qp_t *qp, uint64_t new_prod_idx) {
nvshmemi_ibgda_device_qp_management_t *mvars = &qp->mvars;
uint64_t old_prod_idx;
// Update `prod_idx` before ringing the doorbell, so that we know which index is needed in quiet/fence
ibgda_lock_acquire(&mvars->post_send_lock);
old_prod_idx = atomicMax(reinterpret_cast<unsigned long long int*>(&mvars->tx_wq.prod_idx), new_prod_idx);
if (new_prod_idx > old_prod_idx) {
ibgda_update_dbr(qp, new_prod_idx);
ibgda_ring_db(qp, new_prod_idx);
}
ibgda_lock_release(&mvars->post_send_lock);
}
template <bool kAlwaysDoPostSend>
__device__ static __forceinline__
void ibgda_submit_requests(nvshmemi_ibgda_device_qp_t *qp, uint64_t base_wqe_idx,
uint32_t num_wqes, int message_idx = 0) {
nvshmemi_ibgda_device_qp_management_t *mvars = &qp->mvars;
uint64_t new_wqe_idx = base_wqe_idx + num_wqes;
// WQE writes must be finished first
__threadfence();
// Wait for prior WQE slots to be filled first
auto *ready_idx = reinterpret_cast<unsigned long long int*>(&mvars->tx_wq.ready_head);
while (atomicCAS(ready_idx, base_wqe_idx, new_wqe_idx) != base_wqe_idx);
// Post immediately when forced; otherwise ring the doorbell only once every `kNumRequestInBatch` requests
constexpr int kNumRequestInBatch = 4;
if (kAlwaysDoPostSend or (message_idx + 1) % kNumRequestInBatch == 0)
ibgda_post_send(qp, new_wqe_idx);
}
__device__ static __forceinline__ void
ibgda_write_rdma_write_inl_wqe(nvshmemi_ibgda_device_qp_t *qp, const uint32_t *val, uint64_t raddr,
__be32 rkey, uint16_t wqe_idx, void** out_wqes, uint32_t imm) {
ibgda_ctrl_seg_t ctrl_seg;
struct mlx5_wqe_raddr_seg raddr_seg;
struct mlx5_wqe_inl_data_seg inl_seg;
auto *ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
auto *raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
auto *inl_seg_ptr = reinterpret_cast<mlx5_wqe_inl_data_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
auto *wqe_data_ptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(inl_seg_ptr) + sizeof(*inl_seg_ptr));
raddr_seg.raddr = HtoBE64(raddr);
raddr_seg.rkey = rkey;
raddr_seg.reserved = 0;
inl_seg.byte_count = HtoBE32(4 | MLX5_INLINE_SEG);
// `imm == std::numeric_limits<uint32_t>::max()` means no imm writes
ctrl_seg = {0};
ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 3);
ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
ctrl_seg.opmod_idx_opcode = HtoBE32((wqe_idx << 8) | (imm != std::numeric_limits<uint32_t>::max() ? MLX5_OPCODE_RDMA_WRITE_IMM : MLX5_OPCODE_RDMA_WRITE));
if (imm != std::numeric_limits<uint32_t>::max())
ctrl_seg.imm = HtoBE32(imm);
EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == 16, "sizeof(*ctrl_seg_ptr) == 16");
EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == 16, "sizeof(*raddr_seg_ptr) == 16");
EP_STATIC_ASSERT(sizeof(*inl_seg_ptr) == 4, "sizeof(*inl_seg_ptr) == 4");
st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<const int4*>(&ctrl_seg));
st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<const int4*>(&raddr_seg));
st_na_relaxed(reinterpret_cast<uint32_t*>(inl_seg_ptr), *reinterpret_cast<const uint32_t*>(&inl_seg));
st_na_relaxed(reinterpret_cast<uint32_t*>(wqe_data_ptr), *reinterpret_cast<const uint32_t*>(val));
}
__device__ static __forceinline__
uint64_t ibgda_get_lkey_and_rkey(uint64_t laddr, __be32 *lkey,
uint64_t raddr, int dst_pe, uint64_t *out_raddr, __be32 *out_rkey) {
auto state = ibgda_get_state();
auto heap_start = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.heap_base);
auto log2_cumem_granularity = state->log2_cumem_granularity;
// Local key
uint64_t idx = (laddr - heap_start) >> log2_cumem_granularity;
auto device_key = state->constmem.lkeys[idx];
auto lchunk_size = device_key.next_addr - laddr;
*lkey = device_key.key;
// Remote key
uint64_t roffset = raddr - heap_start;
idx = ((roffset >> log2_cumem_granularity) * nvshmemi_device_state_d.npes) + dst_pe;
if (idx < NVSHMEMI_IBGDA_MAX_CONST_RKEYS) {
device_key = state->constmem.rkeys[idx];
} else {
device_key = state->globalmem.rkeys[idx - NVSHMEMI_IBGDA_MAX_CONST_RKEYS];
}
*out_raddr = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.peer_heap_base_remote[dst_pe]) + roffset;
*out_rkey = device_key.key;
// Return the minimum of local and remote chunk sizes
auto rchunk_size = device_key.next_addr - roffset;
return min(lchunk_size, rchunk_size);
}
__device__ static __forceinline__ void
ibgda_get_rkey(uint64_t addr, int dst_pe, uint64_t *out_raddr, __be32 *out_rkey) {
auto state = ibgda_get_state();
auto heap_start = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.heap_base);
uint64_t roffset = addr - heap_start;
uint64_t idx = ((roffset >> state->log2_cumem_granularity) * nvshmemi_device_state_d.npes) + dst_pe;
nvshmemi_ibgda_device_key_t device_key;
if (idx < NVSHMEMI_IBGDA_MAX_CONST_RKEYS)
device_key = state->constmem.rkeys[idx];
else
device_key = state->globalmem.rkeys[idx - NVSHMEMI_IBGDA_MAX_CONST_RKEYS];
*out_raddr = reinterpret_cast<uint64_t>(nvshmemi_device_state_d.peer_heap_base_remote[dst_pe]) + roffset;
*out_rkey = device_key.key;
}
__device__ static __forceinline__ uint64_t
ibgda_reserve_wqe_slots(nvshmemi_ibgda_device_qp_t *qp, uint32_t num_wqes) {
auto mvars = &qp->mvars;
return atomicAdd(reinterpret_cast<unsigned long long*>(&mvars->tx_wq.resv_head), static_cast<unsigned long long>(num_wqes));
}
__device__ static __forceinline__ void*
ibgda_get_wqe_ptr(nvshmemi_ibgda_device_qp_t* qp, uint16_t wqe_idx) {
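// NOTES: `nwqes` is assumed to be a power of two, so the mask below wraps the WQE index around the ring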
uint16_t cnt = qp->tx_wq.nwqes;
uint16_t idx = wqe_idx & (cnt - 1);
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(qp->tx_wq.wqe) + (idx << MLX5_SEND_WQE_SHIFT));
}
__device__ static __forceinline__ void
nvshmemi_ibgda_rma_p(int *rptr, const int value, int dst_pe, int qp_id, uint32_t imm = std::numeric_limits<uint32_t>::max()) {
// Get rkey
// NOTES: the `p` operation will not cross multiple remote chunks
__be32 rkey;
uint64_t raddr;
ibgda_get_rkey(reinterpret_cast<uint64_t>(rptr), dst_pe, &raddr, &rkey);
// Write WQEs
auto qp = ibgda_get_rc(dst_pe, qp_id);
uint64_t base_wqe_idx = ibgda_reserve_wqe_slots(qp, 1);
void *wqe_ptrs;
wqe_ptrs = ibgda_get_wqe_ptr(qp, base_wqe_idx);
ibgda_write_rdma_write_inl_wqe(qp, reinterpret_cast<const uint32_t*>(&value), raddr, rkey, base_wqe_idx, &wqe_ptrs, imm);
// Submit requests
ibgda_submit_requests<true>(qp, base_wqe_idx, 1);
}
__device__ static __forceinline__ void
ibgda_write_rdma_write_wqe(nvshmemi_ibgda_device_qp_t *qp, uint64_t laddr, __be32 lkey,
uint64_t raddr, __be32 rkey, uint32_t bytes, uint16_t wqe_idx,
void** out_wqes) {
ibgda_ctrl_seg_t ctrl_seg;
struct mlx5_wqe_raddr_seg raddr_seg;
struct mlx5_wqe_data_seg data_seg;
auto *ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
void *av_seg_ptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
struct mlx5_wqe_raddr_seg *raddr_seg_ptr;
struct mlx5_wqe_data_seg *data_seg_ptr;
raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(av_seg_ptr));
data_seg_ptr = reinterpret_cast<mlx5_wqe_data_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
raddr_seg.raddr = HtoBE64(raddr);
raddr_seg.rkey = rkey;
raddr_seg.reserved = 0;
data_seg.byte_count = HtoBE32(bytes);
data_seg.lkey = lkey;
data_seg.addr = HtoBE64(laddr);
ctrl_seg = {0};
ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 3);
ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
ctrl_seg.opmod_idx_opcode = HtoBE32((wqe_idx << 8) | MLX5_OPCODE_RDMA_WRITE);
EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == 16, "sizeof(*ctrl_seg_ptr) == 16");
EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == 16, "sizeof(*raddr_seg_ptr) == 16");
EP_STATIC_ASSERT(sizeof(*data_seg_ptr) == 16, "sizeof(*data_seg_ptr) == 16");
st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<const int4*>(&ctrl_seg));
st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<const int4*>(&raddr_seg));
st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<const int4*>(&data_seg));
}
__device__ static __forceinline__ void
ibgda_write_empty_recv_wqe(void *out_wqe) {
auto *data_seg_ptr = reinterpret_cast<struct mlx5_wqe_data_seg*>(out_wqe);
struct mlx5_wqe_data_seg data_seg;
// Make the first segment in the WQE invalid, so the entire list is treated as invalid
data_seg.byte_count = 0;
data_seg.lkey = HtoBE32(MLX5_INVALID_LKEY);
data_seg.addr = 0;
EP_STATIC_ASSERT(sizeof(mlx5_wqe_data_seg) == sizeof(int4), "Invalid data type length");
st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<const int4*>(&data_seg));
}
template <bool kAlwaysDoPostSend = false>
__device__ static __forceinline__ void
nvshmemi_ibgda_put_nbi_warp(uint64_t req_rptr, uint64_t req_lptr, size_t bytes, int dst_pe, int qp_id, int lane_id, int message_idx) {
// Get lkey and rkey, store them into lanes
uint32_t num_wqes = 0;
__be32 my_lkey = 0;
uint64_t my_laddr = 0;
__be32 my_rkey = 0;
uint64_t my_raddr = 0;
uint64_t my_chunk_size = 0;
// Decide how many messages are needed (at most 3 in theory)
auto remaining_bytes = bytes;
while (remaining_bytes > 0) {
if (lane_id == num_wqes)
my_chunk_size = min(remaining_bytes, ibgda_get_lkey_and_rkey(my_laddr = req_lptr, &my_lkey, req_rptr, dst_pe, &my_raddr, &my_rkey));
// Move one more message
auto chunk_size = __shfl_sync(0xffffffff, my_chunk_size, static_cast<int>(num_wqes));
remaining_bytes -= chunk_size;
req_lptr += chunk_size;
req_rptr += chunk_size;
++ num_wqes;
}
EP_DEVICE_ASSERT(num_wqes <= 32);
// Process WQE
auto qp = ibgda_get_rc(dst_pe, qp_id);
uint64_t base_wqe_idx = 0;
if (lane_id == 0)
base_wqe_idx = ibgda_reserve_wqe_slots(qp, num_wqes);
base_wqe_idx = __shfl_sync(0xffffffff, base_wqe_idx, 0);
if (lane_id < num_wqes) {
auto wqe_ptr = ibgda_get_wqe_ptr(qp, base_wqe_idx + lane_id);
ibgda_write_rdma_write_wqe(qp, my_laddr, my_lkey, my_raddr, my_rkey, my_chunk_size,
base_wqe_idx, &wqe_ptr);
}
__syncwarp();
// Submit
if (lane_id == 0)
ibgda_submit_requests<kAlwaysDoPostSend>(qp, base_wqe_idx, num_wqes, message_idx);
__syncwarp();
}
__device__ static __forceinline__ void ibgda_write_amo_add_wqe(
nvshmemi_ibgda_device_qp_t *qp, const int &value,
uint64_t laddr, __be32 lkey, uint64_t raddr, __be32 rkey,
uint16_t wqe_idx, void** out_wqes) {
ibgda_ctrl_seg_t ctrl_seg = {0};
struct mlx5_wqe_raddr_seg raddr_seg;
struct mlx5_wqe_atomic_seg atomic_seg_1;
struct mlx5_wqe_data_seg data_seg;
auto ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
auto raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
auto atomic_seg_ptr = reinterpret_cast<mlx5_wqe_atomic_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
auto data_seg_ptr = reinterpret_cast<mlx5_wqe_data_seg*>(reinterpret_cast<uintptr_t>(atomic_seg_ptr) + sizeof(*atomic_seg_ptr));
raddr_seg.raddr = HtoBE64(raddr);
raddr_seg.rkey = rkey;
raddr_seg.reserved = 0;
// NOTES: `0x08000000` means `IBGDA_4_BYTE_EXT_AMO_OPMOD`
ctrl_seg.opmod_idx_opcode = HtoBE32(MLX5_OPCODE_ATOMIC_MASKED_FA | (wqe_idx << 8) | 0x08000000);
auto atomic_32_masked_fa_seg = reinterpret_cast<ibgda_atomic_32_masked_fa_seg_t*>(&atomic_seg_1);
atomic_32_masked_fa_seg->add_data = HtoBE32(value);
atomic_32_masked_fa_seg->field_boundary = 0;
ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 4);
ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
data_seg.byte_count = HtoBE32(sizeof(int));
data_seg.lkey = lkey;
data_seg.addr = HtoBE64(laddr);
EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == sizeof(int4), "Invalid vectorization");
EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == sizeof(int4), "Invalid vectorization");
EP_STATIC_ASSERT(sizeof(*atomic_seg_ptr) == sizeof(int4), "Invalid vectorization");
EP_STATIC_ASSERT(sizeof(*data_seg_ptr) == sizeof(int4), "Invalid vectorization");
st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<int4*>(&ctrl_seg));
st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<int4*>(&raddr_seg));
st_na_relaxed(reinterpret_cast<int4*>(atomic_seg_ptr), *reinterpret_cast<int4*>(&atomic_seg_1));
st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<int4*>(&data_seg));
}
__device__ __forceinline__ void nvshmemi_ibgda_amo_nonfetch_add(void *rptr, const int& value, int pe, int qp_id, bool is_local_copy = false) {
if (is_local_copy) {
atomicAdd(static_cast<unsigned long long*>(rptr), value);
} else {
nvshmemi_ibgda_device_qp_t *qp = ibgda_get_rc(pe, qp_id);
__be32 rkey;
uint64_t raddr;
ibgda_get_rkey(reinterpret_cast<uint64_t>(rptr), pe, &raddr, &rkey);
uint64_t my_wqe_idx = ibgda_reserve_wqe_slots(qp, 1);
void *wqe_ptrs = ibgda_get_wqe_ptr(qp, my_wqe_idx);
ibgda_write_amo_add_wqe(qp, value, reinterpret_cast<uint64_t>(qp->ibuf.buf),
qp->ibuf.lkey, raddr, rkey, my_wqe_idx, &wqe_ptrs);
ibgda_submit_requests<true>(qp, my_wqe_idx, 1);
}
}
__device__ __forceinline__ uint64_t nvshmemi_get_p2p_ptr(const uint64_t& ptr, const int& rank, const int& dst_rank) {
// Local rank, no need for mapping
if (rank == dst_rank)
return ptr;
auto peer_base = __ldg(reinterpret_cast<uint64_t*>(nvshmemi_device_state_d.peer_heap_base_p2p) + dst_rank);
// RDMA connected
if (peer_base == 0)
return 0;
// NVLink P2P is enabled
return peer_base + (ptr - reinterpret_cast<uint64_t>(nvshmemi_device_state_d.heap_base));
}
// This is a simplified version of NVSHMEM's `ibgda_poll_cq`.
// Note that this implementation does not guarantee thread safety,
// so we must ensure that no other threads are concurrently using the same QP.
__device__ static __forceinline__ void
ibgda_poll_cq(nvshmemi_ibgda_device_cq_t *cq, uint64_t idx) {
const auto cqe64 = static_cast<mlx5_cqe64*>(cq->cqe);
const uint32_t ncqes = cq->ncqes;
memory_fence_cta();
// NOTES: this while loop is part of do-while below.
// `wqe_counter` is the HW consumer index. However, we always maintain `index + 1`.
// To be able to compare with the index, we need to use `wqe_counter + 1`.
// Because `wqe_counter` is a `uint16_t`, it may overflow. Still, we know for
// sure that if `idx - wqe_counter - 1 < ncqes`, then `wqe_counter + 1` is less
// than `idx`, and thus we need to wait. We don't need to wait when
// `idx == wqe_counter + 1`; that's why we use `- 2` here to make this case overflow.
uint16_t wqe_counter;
do {
wqe_counter = HtoBE16(ld_na_relaxed(&cqe64->wqe_counter));
} while ((static_cast<uint16_t>(static_cast<uint16_t>(idx) - wqe_counter - static_cast<uint16_t>(2)) < ncqes));
*cq->cons_idx = idx;
// Prevent reordering of this function and later instructions
memory_fence_cta();
}
// Wait until wqe `idx - 1` is completed.
__device__ static __forceinline__ void
nvshmemi_ibgda_quiet(int dst_pe, int qp_id) {
auto qp = ibgda_get_rc(dst_pe, qp_id);
uint64_t prod_idx = ld_na_relaxed(qp->tx_wq.prod_idx);
ibgda_poll_cq(qp->tx_wq.cq, prod_idx);
}
} // namespace deep_ep

File diff suppressed because it is too large

View File

@ -0,0 +1,584 @@
#include "configs.cuh"
#include "exception.cuh"
#include "launch.cuh"
#include "ibgda_device.cuh"
namespace deep_ep {
namespace internode_ll {
template <int kNumThreads> __launch_bounds__(kNumThreads, 1)
__global__ void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
int* clean_1, int num_clean_int_1) {
// Barrier before cleaning (in case of unfinished chunked EP)
nvshmemx_barrier_all_block();
// Clean
auto thread_id = static_cast<int>(threadIdx.x);
#pragma unroll
for (int i = thread_id; i < num_clean_int_0; i += kNumThreads)
clean_0[i] = 0;
#pragma unroll
for (int i = thread_id; i < num_clean_int_1; i += kNumThreads)
clean_1[i] = 0;
// Barrier after cleaning (make sure the low-latency mode works fine)
nvshmemx_barrier_all_block();
}
void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
int* clean_1, int num_clean_int_1,
cudaStream_t stream) {
constexpr int kNumThreads = 256;
SETUP_LAUNCH_CONFIG(1, kNumThreads, stream);
LAUNCH_KERNEL(&cfg, clean_low_latency_buffer<kNumThreads>,
clean_0, num_clean_int_0, clean_1, num_clean_int_1);
}
template <bool kUseFP8, bool kUseUE8M0, int kHidden>
__global__ __launch_bounds__(1024, 1) void
dispatch(void* packed_recv_x, void* packed_recv_x_scales,
int* packed_recv_src_info, int64_t* packed_recv_layout_range,
int* packed_recv_count,
int* cumulative_local_expert_recv_stats,
void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
const void* x, const int64_t* topk_idx,
int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert,
int* next_clean, int num_next_clean_int,
int num_tokens, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
int num_warp_groups, int num_warps_per_group,
bool round_scale, int phases) {
const auto sm_id = static_cast<int>(blockIdx.x);
const auto thread_id = static_cast<int>(threadIdx.x);
const auto warp_id = thread_id / 32, lane_id = get_lane_id();
const auto num_sms = static_cast<int>(gridDim.x);
const auto num_warps = num_warp_groups * num_warps_per_group;
const auto num_local_experts = num_experts / num_ranks;
const auto warp_group_id = warp_id / num_warps_per_group;
const auto sub_warp_id = warp_id % num_warps_per_group;
const auto responsible_expert_idx = sm_id * num_warp_groups + warp_group_id;
// May extract UE8M0 from the scales
using scale_t = std::conditional_t<kUseUE8M0, uint8_t, float>;
using packed_t = std::conditional_t<kUseUE8M0, uint32_t, float>;
EP_STATIC_ASSERT(sizeof(packed_t) % sizeof(scale_t) == 0, "Invalid vector length");
// FP8 stuff
constexpr int kNumPerChannels = 128;
const int num_scales = kHidden / kNumPerChannels;
const size_t hidden_bytes = kHidden * (kUseFP8 ? sizeof(__nv_fp8_storage_t) : sizeof(nv_bfloat16));
const size_t hidden_int4 = hidden_bytes / sizeof(int4);
// Message package: hidden data, FP8 scales, index at source
// NOTES: currently we have 3 reserved int fields for future use
using vec_t = typename std::conditional<kUseFP8, int2, int4>::type;
const size_t num_bytes_per_msg = sizeof(int4) + (kUseFP8 ? (kHidden + num_scales * sizeof(float)) : (kHidden * sizeof(nv_bfloat16)));
const size_t num_int4_per_msg = num_bytes_per_msg / sizeof(int4);
EP_DEVICE_ASSERT(num_bytes_per_msg % sizeof(int4) == 0);
// Expert counts
constexpr int kNumMaxWarpGroups = 32;
__shared__ int shared_num_tokens_sent_per_expert[kNumMaxWarpGroups];
// Sending phase
if ((phases & LOW_LATENCY_SEND_PHASE) == 0)
goto LOW_LATENCY_DISPATCH_RECV;
// There are 2 kinds of warps in this part:
// 1. The first-kind warps for FP8 cast and sending top-k tokens
// 2. The last warp for reading `topk_idx` and count for per-expert information
if (warp_id < num_warps - 1) {
constexpr int kNumElemsPerRead = sizeof(int4) / sizeof(nv_bfloat16);
EP_DEVICE_ASSERT(kHidden % kNumElemsPerRead == 0);
EP_STATIC_ASSERT(kNumElemsPerRead * 32 % kNumPerChannels == 0, "Invalid vectorization");
const auto num_threads = (num_warps - 1) * 32;
const size_t hidden_bf16_int4 = kHidden / kNumElemsPerRead;
for (int token_idx = sm_id; token_idx < num_tokens; token_idx += num_sms) {
const auto x_int4 = static_cast<const int4*>(x) + token_idx * hidden_bf16_int4;
const auto rdma_x_src_idx = reinterpret_cast<int*>(static_cast<uint8_t*>(rdma_x) + token_idx * num_bytes_per_msg);
const auto rdma_x_vec = reinterpret_cast<vec_t*>(reinterpret_cast<uint8_t*>(rdma_x_src_idx) + sizeof(int4));
const auto rdma_x_scales = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(rdma_x_vec) + hidden_bytes);
// Overlap top-k index read and source token index writes
auto dst_expert_idx = warp_id < num_topk ? static_cast<int>(__ldg(topk_idx + token_idx * num_topk + warp_id)) : -1;
thread_id == 0 ? (*rdma_x_src_idx = token_idx) : 0;
// FP8 cast
#pragma unroll
for (int i = thread_id; i < hidden_bf16_int4; i += num_threads) {
// Read
auto int4_value = __ldg(x_int4 + i);
if constexpr (kUseFP8) {
// Calculate local amax
auto bf16_values = reinterpret_cast<nv_bfloat16*>(&int4_value);
float fp32_values[kNumElemsPerRead];
float amax = kFP8Margin, scale, scale_inv;
#pragma unroll
for (int j = 0; j < kNumElemsPerRead; ++ j) {
fp32_values[j] = static_cast<float>(bf16_values[j]);
amax = fmaxf(amax, fabsf(fp32_values[j]));
}
// Reduce amax and scale
EP_STATIC_ASSERT(kNumElemsPerRead * 32 / kNumPerChannels == 2, "Invalid vectorization");
amax = half_warp_reduce_max(amax);
calculate_fp8_scales(amax, scale, scale_inv, round_scale);
if (lane_id == 0 or lane_id == 16)
rdma_x_scales[i * kNumElemsPerRead / 128] = scale_inv;
// Cast into send buffer
vec_t int2_value;
auto fp8x2_values = reinterpret_cast<__nv_fp8x2_storage_t*>(&int2_value);
#pragma unroll
for (int j = 0; j < kNumElemsPerRead; j += 2) {
float2 fp32x2 = {fp32_values[j] * scale, fp32_values[j + 1] * scale};
fp8x2_values[j / 2] = __nv_cvt_float2_to_fp8x2(fp32x2, __NV_SATFINITE, __NV_E4M3);
}
rdma_x_vec[i] = int2_value;
} else {
// Reinterpret-cast is for C++14 compatibility
rdma_x_vec[i] = *reinterpret_cast<vec_t*>(&int4_value);
}
}
asm volatile("bar.sync 1, %0;" :: "r"(num_threads));
// Issue IBGDA sends
if (dst_expert_idx >= 0) {
int slot_idx = lane_id == 0 ? atomicAdd(atomic_counter_per_expert + dst_expert_idx, 1) : 0;
slot_idx = __shfl_sync(0xffffffff, slot_idx, 0);
const auto dst_rank = dst_expert_idx / num_local_experts;
const auto dst_expert_local_idx = dst_expert_idx % num_local_experts;
const auto src_ptr = reinterpret_cast<uint64_t>(rdma_x_src_idx);
const auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_x) +
dst_expert_local_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
slot_idx * num_bytes_per_msg;
const auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
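// NOTES: a zero P2P pointer means the peer is reachable only via RDMA (see `nvshmemi_get_p2p_ptr`); otherwise copy directly over NVLink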
if (dst_p2p_ptr == 0) {
nvshmemi_ibgda_put_nbi_warp(dst_ptr, src_ptr, num_bytes_per_msg, dst_rank, dst_expert_local_idx, lane_id, slot_idx);
} else {
// NOTES: only 2 load iterations for 7K hidden with 8 unrolls
const auto* src_int4_ptr = reinterpret_cast<const int4*>(src_ptr);
const auto* dst_int4_ptr = reinterpret_cast<int4*>(dst_p2p_ptr);
UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, dst_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global);
}
// Increase counter after finishing
__syncwarp();
lane_id == 0 ? atomic_add_release_global(atomic_finish_counter_per_expert + dst_expert_idx, 1) : 0;
}
}
} else if (warp_id == num_warps - 1) {
EP_DEVICE_ASSERT(num_sms > 1);
if (sm_id == 0) {
// The first SM is also responsible for checking QPs
EP_DEVICE_ASSERT(ibgda_get_state()->num_rc_per_pe >= num_local_experts);
// The first SM is also responsible for cleaning the next buffer
#pragma unroll
for (int i = lane_id; i < num_next_clean_int; i += 32)
next_clean[i] = 0;
// Notify before executing `int_p`
__syncwarp();
#pragma unroll
for (int i = lane_id; i < num_experts; i += 32)
atomic_add_release_global(atomic_finish_counter_per_expert + i, FINISHED_SUM_TAG);
}
// This SM should be responsible for some destination experts, read `topk_idx` for them
int expert_count[kNumMaxWarpGroups] = {0};
const auto expert_begin_idx = sm_id * num_warp_groups;
const auto expert_end_idx = min(expert_begin_idx + num_warp_groups, num_experts);
// Per lane count
#pragma unroll 8
for (int i = lane_id; i < num_tokens * num_topk; i += 32) {
auto idx = static_cast<int>(__ldg(topk_idx + i));
if (idx >= expert_begin_idx and idx < expert_end_idx)
expert_count[idx - expert_begin_idx] ++;
}
// Warp reduce
#pragma unroll
for (int i = expert_begin_idx; i < expert_end_idx; ++ i) {
auto sum = warp_reduce_sum(expert_count[i - expert_begin_idx]);
if (lane_id == 0) {
shared_num_tokens_sent_per_expert[i - expert_begin_idx] = sum;
atomic_add_release_global(atomic_finish_counter_per_expert + i, FINISHED_SUM_TAG - sum);
}
}
}
__syncthreads();
// Issue count sends
if (responsible_expert_idx < num_experts and sub_warp_id == 0 and lane_id == 0) {
const auto dst_rank = responsible_expert_idx / num_local_experts;
const auto dst_expert_local_idx = responsible_expert_idx % num_local_experts;
const auto num_tokens_sent = shared_num_tokens_sent_per_expert[responsible_expert_idx - sm_id * num_warp_groups];
// Wait until the local sends are issued, then send the expert counts
while (ld_acquire_global(atomic_finish_counter_per_expert + responsible_expert_idx) != FINISHED_SUM_TAG * 2);
auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_count + dst_expert_local_idx * num_ranks + rank);
auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
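// NOTES: the count is sent as `-num_tokens_sent - 1`, so the receiver can distinguish "zero tokens" from "not yet arrived" (which reads as 0)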
if (dst_p2p_ptr == 0) {
nvshmemi_ibgda_amo_nonfetch_add(reinterpret_cast<int*>(dst_ptr), -num_tokens_sent - 1, dst_rank, dst_expert_local_idx);
} else {
st_release_sys_global(reinterpret_cast<int*>(dst_p2p_ptr), -num_tokens_sent - 1);
}
// Clean workspace for next use
atomic_counter_per_expert[responsible_expert_idx] = 0;
atomic_finish_counter_per_expert[responsible_expert_idx] = 0;
// Clean `packed_recv_count`
if (dst_rank == 0)
packed_recv_count[dst_expert_local_idx] = 0;
}
__syncwarp();
// Receiving phase
LOW_LATENCY_DISPATCH_RECV:
if ((phases & LOW_LATENCY_RECV_PHASE) == 0)
return;
// For send-and-recv kernels, we need a grid sync for making `packed_recv_count` visible
if (phases & LOW_LATENCY_SEND_PHASE)
cg::this_grid().sync();
// Receiving and packing
if (responsible_expert_idx < num_experts) {
const auto src_rank = responsible_expert_idx / num_local_experts;
const auto local_expert_idx = responsible_expert_idx % num_local_experts;
const auto rdma_recv_x_uint8 = static_cast<uint8_t*>(rdma_recv_x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
src_rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg;
const auto recv_x_int4 = static_cast<int4*>(packed_recv_x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * hidden_int4;
const auto recv_src_info = packed_recv_src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank;
const auto recv_range = packed_recv_layout_range + local_expert_idx * num_ranks;
const auto num_aligned_scales = align<int>(num_scales, sizeof(float) / sizeof(scale_t));
const auto recv_x_scales = static_cast<scale_t*>(packed_recv_x_scales) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_aligned_scales;
// Shared between sub-warps in warp groups
__shared__ int shared_num_recv_tokens[kNumMaxWarpGroups], shared_recv_token_begin_idx[kNumMaxWarpGroups];
// Wait for tokens to arrive
// NOTES: using sub-warp 1 to overlap with sub-warp 0
int num_recv_tokens, recv_token_begin_idx;
EP_DEVICE_ASSERT(num_warps_per_group > 1 and num_warp_groups < 15);
if (sub_warp_id == 1 and lane_id == 0) {
while ((num_recv_tokens = ld_acquire_sys_global(rdma_recv_count + local_expert_idx * num_ranks + src_rank)) == 0);
num_recv_tokens = -num_recv_tokens - 1;
recv_token_begin_idx = atomicAdd(packed_recv_count + local_expert_idx, num_recv_tokens);
shared_num_recv_tokens[warp_group_id] = num_recv_tokens;
shared_recv_token_begin_idx[warp_group_id] = recv_token_begin_idx;
recv_range[src_rank] = pack2<int, int64_t>(num_recv_tokens, recv_token_begin_idx);
if (cumulative_local_expert_recv_stats != nullptr)
atomicAdd(cumulative_local_expert_recv_stats + local_expert_idx, num_recv_tokens);
}
asm volatile("bar.sync %0, %1;" :: "r"(warp_group_id + 2), "r"(num_warps_per_group * 32));
num_recv_tokens = shared_num_recv_tokens[warp_group_id];
recv_token_begin_idx = shared_recv_token_begin_idx[warp_group_id];
// Copy tokens
EP_DEVICE_ASSERT(num_scales <= 64);
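// NOTES: each warp copies at most 64 scales: every lane handles positions `lane_id` and `lane_id + 32`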
for (int i = sub_warp_id; i < num_recv_tokens; i += num_warps_per_group) {
// Copy source info
const auto src_src_idx = reinterpret_cast<int*>(rdma_recv_x_uint8 + i * num_bytes_per_msg);
if (lane_id == 0)
recv_src_info[recv_token_begin_idx + i] = ld_nc_global(src_src_idx);
__syncwarp();
// Copy data
// NOTES: only 2 load iterations for 7K hidden with 7 unrolls
const auto src_data = reinterpret_cast<int4*>(reinterpret_cast<uint8_t*>(src_src_idx) + sizeof(int4));
const auto dst_data = recv_x_int4 + (recv_token_begin_idx + i) * hidden_int4;
UNROLLED_WARP_COPY(7, lane_id, hidden_int4, dst_data, src_data, ld_nc_global, st_na_global);
// Copy scales
if constexpr (kUseFP8) {
// Equivalent CuTe layout:
// (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1))
const auto src_scales = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(src_data) + hidden_bytes);
const auto num_elems_per_pack = static_cast<int>(sizeof(packed_t) / sizeof(scale_t));
const auto token_idx = recv_token_begin_idx + i;
const auto token_stride = num_elems_per_pack;
const auto pack_stride = num_ranks * num_max_dispatch_tokens_per_rank * num_elems_per_pack;
if (lane_id < num_scales) {
const auto pack_idx = lane_id / num_elems_per_pack;
const auto elem_idx = lane_id % num_elems_per_pack;
auto scale = extract_required_scale_format<kUseUE8M0>(ld_nc_global(src_scales + lane_id));
recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
}
if (lane_id + 32 < num_scales) {
const auto pack_idx = (lane_id + 32) / num_elems_per_pack;
const auto elem_idx = (lane_id + 32) % num_elems_per_pack;
auto scale = extract_required_scale_format<kUseUE8M0>(ld_nc_global(src_scales + lane_id + 32));
recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale;
}
}
}
}
}
void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
int* packed_recv_src_info, int64_t* packed_recv_layout_range,
int* packed_recv_count,
int* cumulative_local_expert_recv_stats,
void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
const void* x, const int64_t* topk_idx,
int* next_clean, int num_next_clean_int,
int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
bool use_fp8, bool round_scale, bool use_ue8m0,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases) {
constexpr int kNumMaxTopK = 9;
const int num_warp_groups = ceil_div(num_experts, num_device_sms);
const int num_warps_per_group = 32 / num_warp_groups;
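// NOTES: each block runs `num_warp_groups * num_warps_per_group` warps (at most 32 in total);
// every warp group serves one expert, so `num_sms` blocks cover all experts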
EP_HOST_ASSERT(num_warp_groups > 0 and num_warps_per_group > 0);
EP_HOST_ASSERT(kNumMaxTopK + 1 <= num_warp_groups * num_warps_per_group);
const auto num_warps = num_warp_groups * num_warps_per_group;
const auto num_sms = ceil_div(num_experts, num_warp_groups);
EP_HOST_ASSERT(num_topk <= kNumMaxTopK);
// Workspace checks
auto atomic_counter_per_expert = static_cast<int*>(workspace);
auto atomic_finish_counter_per_expert = atomic_counter_per_expert + num_experts;
EP_HOST_ASSERT(num_experts * sizeof(int) * 2 <= NUM_WORKSPACE_BYTES);
// FP8 checks
if (use_ue8m0)
EP_HOST_ASSERT(round_scale and "UE8M0 SF requires `round_scale=True`");
#define DISPATCH_LAUNCH_CASE(hidden) { \
auto dispatch_func = dispatch<false, false, hidden>; \
if (use_fp8 and not use_ue8m0) \
dispatch_func = dispatch<true, false, hidden>; \
if (use_fp8 and use_ue8m0) \
dispatch_func = dispatch<true, true, hidden>; \
LAUNCH_KERNEL(&cfg, dispatch_func, \
packed_recv_x, packed_recv_x_scales, \
packed_recv_src_info, packed_recv_layout_range, \
packed_recv_count, \
cumulative_local_expert_recv_stats, \
rdma_recv_x, rdma_recv_count, rdma_x, \
x, topk_idx, \
atomic_counter_per_expert, atomic_finish_counter_per_expert, \
next_clean, num_next_clean_int, \
num_tokens, num_max_dispatch_tokens_per_rank, \
num_topk, num_experts, rank, num_ranks, \
num_warp_groups, num_warps_per_group, \
round_scale, phases); } break
SETUP_LAUNCH_CONFIG(num_sms, num_warps * 32, stream);
SWITCH_HIDDEN(DISPATCH_LAUNCH_CASE);
#undef DISPATCH_LAUNCH_CASE
}
template <int kHidden, int kNumMaxTopk>
__global__ __launch_bounds__(1024, 1) void
combine(void* combined_x,
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
const void* x, const int64_t* topk_idx, const float* topk_weights,
const int* src_info, const int64_t* layout_range,
int* next_clean, int num_next_clean_int,
int* atomic_clean_flag,
int num_combined_tokens, int hidden, int num_topk,
int num_max_dispatch_tokens_per_rank,
int num_experts, int rank, int num_ranks,
int num_warp_groups, int num_warps_per_group,
int phases, bool zero_copy) {
const auto sm_id = static_cast<int>(blockIdx.x);
const auto num_sms = static_cast<int>(gridDim.x);
const auto thread_id = static_cast<int>(threadIdx.x);
const auto num_threads = static_cast<int>(blockDim.x);
const auto warp_id = thread_id / 32, lane_id = get_lane_id();
const auto num_local_experts = num_experts / num_ranks;
const auto warp_group_id = warp_id / num_warps_per_group;
const auto sub_warp_id = warp_id % num_warps_per_group;
const auto responsible_expert_idx = sm_id * num_warp_groups + warp_group_id;
// Data type stuff
constexpr int kNumElemsPerInt4 = sizeof(int4) / sizeof(nv_bfloat16);
const size_t hidden_bf16_int4 = kHidden / kNumElemsPerInt4;
// Message package
constexpr size_t num_bytes_per_slot = kHidden * sizeof(nv_bfloat16);
EP_STATIC_ASSERT(num_bytes_per_slot % sizeof(int4) == 0, "Invalid vectorization");
// Sending phase
if ((phases & LOW_LATENCY_SEND_PHASE) == 0)
goto LOW_LATENCY_COMBINE_RECV;
// Clean up next buffer
if (sm_id == 0 and warp_group_id == 0 and sub_warp_id == 0) {
#pragma unroll
for (int i = lane_id; i < num_next_clean_int; i += 32)
next_clean[i] = 0;
// Notify before executing `int_p`
__syncwarp();
if (lane_id == 0)
atomic_add_release_global(atomic_clean_flag, num_experts);
}
// Issue IBGDA sends
if (responsible_expert_idx < num_experts) {
const auto dst_rank = responsible_expert_idx / num_local_experts;
const auto local_expert_idx = responsible_expert_idx % num_local_experts;
const auto global_expert_idx = rank * num_local_experts + local_expert_idx;
const auto layout = __ldg(layout_range + local_expert_idx * num_ranks + dst_rank);
const auto local_x = static_cast<const int4*>(x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * hidden_bf16_int4;
const auto local_src_info = src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank;
const auto rdma_send_x_vec = static_cast<uint8_t*>(rdma_send_x) +
local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_slot;
// Unpack layout
int offset, num_tokens_to_send;
unpack2(layout, num_tokens_to_send, offset);
// Issue IBGDA send
for (int token_idx = offset + sub_warp_id; token_idx < offset + num_tokens_to_send; token_idx += num_warps_per_group) {
const auto x_int4 = local_x + token_idx * hidden_bf16_int4;
const auto rdma_send_type_row = reinterpret_cast<int*>(rdma_send_x_vec + token_idx * num_bytes_per_slot);
const auto rdma_send_x_vec_row = reinterpret_cast<uint8_t*>(rdma_send_type_row);
// Copy directly to local rank, or copy to buffer and issue RDMA
auto src_idx = __ldg(local_src_info + token_idx);
const auto buf_ptr = reinterpret_cast<int64_t>(rdma_send_x_vec_row);
const auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_x) + (global_expert_idx * num_max_dispatch_tokens_per_rank + src_idx) * num_bytes_per_slot;
const auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
if (dst_p2p_ptr == 0) {
const auto buf_int4_ptr = reinterpret_cast<int4*>(buf_ptr);
if (not zero_copy)
UNROLLED_WARP_COPY(7, lane_id, hidden_bf16_int4, buf_int4_ptr, x_int4, ld_nc_global, st_na_global);
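// NOTES: `token_idx - offset` serves as the message index, so `ibgda_submit_requests` can batch the doorbell posts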
nvshmemi_ibgda_put_nbi_warp(dst_ptr, buf_ptr, hidden * sizeof(nv_bfloat16), dst_rank, local_expert_idx, lane_id, token_idx - offset);
} else {
const auto dst_int4_ptr = reinterpret_cast<int4*>(dst_p2p_ptr);
UNROLLED_WARP_COPY(7, lane_id, hidden_bf16_int4, dst_int4_ptr, x_int4, ld_nc_global, st_na_global);
}
}
// Put the finishing flag
EP_DEVICE_ASSERT(num_warps_per_group > 1 and num_warp_groups < 16);
asm volatile("bar.sync %0, %1;" :: "r"(warp_group_id + 1), "r"(num_warps_per_group * 32));
if (sub_warp_id == 1 and lane_id == 0) {
while (ld_acquire_global(atomic_clean_flag) == 0);
auto dst_ptr = reinterpret_cast<uint64_t>(rdma_recv_flag + global_expert_idx);
auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank);
if (dst_p2p_ptr == 0) {
nvshmemi_ibgda_amo_nonfetch_add(reinterpret_cast<int*>(dst_ptr), 1, dst_rank, local_expert_idx);
} else {
st_release_sys_global(reinterpret_cast<int*>(dst_p2p_ptr), 1);
}
atomic_add_release_global(atomic_clean_flag, -1);
}
__syncwarp();
}
// Receiving phase
LOW_LATENCY_COMBINE_RECV:
if ((phases & LOW_LATENCY_RECV_PHASE) == 0)
return;
// Wait for all ranks to arrive
if (responsible_expert_idx < num_experts) {
EP_DEVICE_ASSERT(num_warps_per_group > 1);
if (sub_warp_id == 0 and lane_id == 0) {
while (ld_acquire_sys_global(rdma_recv_flag + responsible_expert_idx) == 0);
}
}
cg::this_grid().sync();
// Reduce tokens
EP_DEVICE_ASSERT(num_topk <= 32 and hidden_bf16_int4 <= num_threads);
EP_STATIC_ASSERT(kHidden % (32 * kNumElemsPerInt4) == 0, "Invalid vectorization");
if (thread_id < hidden_bf16_int4) {
for (int token_idx = sm_id; token_idx < num_combined_tokens; token_idx += num_sms) {
// Read top-k indices and weights
int reg_topk_idx[kNumMaxTopk];
float reg_topk_weights[kNumMaxTopk];
#pragma unroll
for (int i = 0; i < num_topk; ++ i) {
reg_topk_idx[i] = static_cast<int>(__ldg(topk_idx + token_idx * num_topk + i));
reg_topk_weights[i] = __ldg(topk_weights + token_idx * num_topk + i);
}
float combined_values[kNumElemsPerInt4] = {0.0f};
#pragma unroll
for (int i = 0; i < num_topk; ++ i) if (reg_topk_idx[i] >= 0) {
// Read from sources
auto rdma_buffer_type = reinterpret_cast<const int*>(static_cast<uint8_t*>(rdma_recv_x) + (reg_topk_idx[i] * num_max_dispatch_tokens_per_rank + token_idx) * num_bytes_per_slot);
auto rdma_buffer_row = reinterpret_cast<const uint8_t*>(rdma_buffer_type);
// Reduce
auto x_vec = ld_nc_global(reinterpret_cast<const int4*>(rdma_buffer_row) + thread_id);
const auto x_bf16 = reinterpret_cast<nv_bfloat16*>(&x_vec);
#pragma unroll
for (int j = 0; j < kNumElemsPerInt4; ++ j)
combined_values[j] += static_cast<float>(x_bf16[j]) * reg_topk_weights[i];
}
// Write results
int4& combined_int4 = *reinterpret_cast<int4*>(combined_values);
auto combined_bf16 = reinterpret_cast<nv_bfloat16*>(&combined_values);
#pragma unroll
for (int j = 0; j < kNumElemsPerInt4; ++ j)
combined_bf16[j] = static_cast<nv_bfloat16>(combined_values[j]);
(static_cast<int4*>(combined_x) + token_idx * hidden_bf16_int4)[thread_id] = combined_int4;
}
}
}
void combine(void* combined_x,
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
const void* x, const int64_t* topk_idx, const float* topk_weights,
const int* src_info, const int64_t* layout_range,
int* next_clean, int num_next_clean_int,
int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
void* workspace, int num_device_sms,
cudaStream_t stream, int phases, bool zero_copy) {
constexpr int kNumMaxTopk = 9;
const int num_warp_groups = ceil_div(num_experts, num_device_sms);
const int num_warps_per_group = 32 / num_warp_groups;
EP_HOST_ASSERT(num_warp_groups > 0 and num_warps_per_group > 0);
const auto num_warps = num_warp_groups * num_warps_per_group;
const auto num_sms = ceil_div(num_experts, num_warp_groups);
// Check workspace
auto atomic_clean_flag = static_cast<int*>(workspace);
EP_HOST_ASSERT(sizeof(int) <= NUM_WORKSPACE_BYTES);
EP_HOST_ASSERT(num_topk <= kNumMaxTopk);
#define COMBINE_LAUNCH_CASE(hidden) { \
auto combine_func = combine<hidden, kNumMaxTopk>; \
LAUNCH_KERNEL(&cfg, combine_func, \
combined_x, \
rdma_recv_x, rdma_recv_flag, rdma_send_x, \
x, topk_idx, topk_weights, src_info, layout_range, \
next_clean, num_next_clean_int, \
atomic_clean_flag, \
num_combined_tokens, hidden, num_topk, \
num_max_dispatch_tokens_per_rank, \
num_experts, rank, num_ranks, \
num_warp_groups, num_warps_per_group, \
phases, zero_copy); } break
SETUP_LAUNCH_CONFIG(num_sms, num_warps * 32, stream);
SWITCH_HIDDEN(COMBINE_LAUNCH_CASE);
#undef COMBINE_LAUNCH_CASE
}
} // namespace internode_ll
} // namespace deep_ep

View File

@ -0,0 +1,935 @@
#include "configs.cuh"
#include "buffer.cuh"
#include "exception.cuh"
#include "launch.cuh"
#include "utils.cuh"
namespace deep_ep {
namespace intranode {
template<int kNumRanks>
__global__ void
notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
int num_tokens, int num_channels, const bool* is_token_in_rank, int* channel_prefix_matrix,
int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank) {
auto sm_id = static_cast<int>(blockIdx.x);
auto thread_id = static_cast<int>(threadIdx.x), num_threads = static_cast<int>(blockDim.x);
auto lane_id = thread_id % 32, warp_id = thread_id / 32, num_warps = num_threads / 32;
if (sm_id == 0) {
// Barrier first
barrier_block<kNumRanks, true>(barrier_signal_ptrs, rank);
int *per_rank_buffer, *per_expert_buffer;
if (thread_id < kNumRanks) {
per_rank_buffer = static_cast<int*>(buffer_ptrs[thread_id]);
per_expert_buffer = per_rank_buffer + kNumRanks * kNumRanks;
}
// After this loop:
// - `per_rank_buffer[rank][i, j]` means the number of tokens from rank i to rank j
// - `per_expert_buffer[rank][i, j]` means the number of tokens from rank i to local expert j
int num_experts_per_rank = num_experts / kNumRanks;
if (thread_id < kNumRanks) {
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i)
per_rank_buffer[rank * kNumRanks + i] = num_tokens_per_rank[i];
#pragma unroll
for (int i = 0; i < num_experts_per_rank; ++ i)
per_expert_buffer[rank * num_experts_per_rank + i] = num_tokens_per_expert[thread_id * num_experts_per_rank + i];
}
// Wait for all ranks to be finished
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
// Sum per-rank counts and return to CPU
// Also pre-compute the prefix sum for data sending
auto local_per_rank_buffer = static_cast<int*>(buffer_ptrs[rank]);
if (thread_id < kNumRanks) {
#pragma unroll
for (int i = 1; i < kNumRanks; ++ i)
local_per_rank_buffer[i * kNumRanks + thread_id] += local_per_rank_buffer[(i - 1) * kNumRanks + thread_id];
if (thread_id == rank)
*moe_recv_counter_mapped = local_per_rank_buffer[(kNumRanks - 1) * kNumRanks + rank];
}
// Sum per-experts counts and return to CPU
auto local_per_expert_buffer = local_per_rank_buffer + kNumRanks * kNumRanks;
if (thread_id < num_experts_per_rank) {
int sum = 0;
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i)
sum += local_per_expert_buffer[i * num_experts_per_rank + thread_id];
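// Round the summed count up to a multiple of `expert_alignment`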
sum = (sum + expert_alignment - 1) / expert_alignment * expert_alignment;
moe_recv_expert_counter_mapped[thread_id] = sum;
}
__syncthreads();
// Copy rank size prefix matrix to another tensor
#pragma unroll
for (int i = thread_id; i < kNumRanks * kNumRanks; i += num_threads)
rank_prefix_matrix_copy[i] = local_per_rank_buffer[i];
// Extra memset for later communication queue
#pragma unroll
for (int i = thread_id; i < num_memset_int; i += num_threads)
local_per_expert_buffer[i] = 0;
// Barrier
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
} else {
int dst_rank = sm_id - 1;
for (int channel_id = warp_id; channel_id < num_channels; channel_id += num_warps) {
int token_start_idx, token_end_idx;
get_channel_task_range(num_tokens, num_channels, channel_id, token_start_idx, token_end_idx);
// Iterate over tokens
int count = 0;
for (int64_t i = token_start_idx + lane_id; i < token_end_idx; i += 32)
count += is_token_in_rank[i * kNumRanks + dst_rank];
count = warp_reduce_sum(count);
if (lane_id == 0)
channel_prefix_matrix[dst_rank * num_channels + channel_id] = count;
}
__syncthreads();
// Pre-compute prefix sum for all channels
if (thread_id == 0) {
#pragma unroll
for (int i = 1; i < num_channels; ++ i)
channel_prefix_matrix[dst_rank * num_channels + i] += channel_prefix_matrix[dst_rank * num_channels + i - 1];
}
}
}
void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
int num_tokens, const bool* is_token_in_rank, int* channel_prefix_matrix,
int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank,
cudaStream_t stream, int num_channels) {
#define NOTIFY_DISPATCH_LAUNCH_CASE(ranks) \
LAUNCH_KERNEL(&cfg, notify_dispatch<ranks>, \
num_tokens_per_rank, moe_recv_counter_mapped, \
num_tokens_per_expert, moe_recv_expert_counter_mapped, num_experts, \
num_tokens, num_channels, is_token_in_rank, channel_prefix_matrix, \
rank_prefix_matrix_copy, num_memset_int, expert_alignment, \
buffer_ptrs, barrier_signal_ptrs, rank); \
break
constexpr int kNumThreads = 128;
EP_HOST_ASSERT(num_experts % num_ranks == 0);
EP_HOST_ASSERT(num_experts / num_ranks <= kNumThreads and num_ranks <= kNumThreads);
SETUP_LAUNCH_CONFIG(1 + num_ranks, kNumThreads, stream);
SWITCH_RANKS(NOTIFY_DISPATCH_LAUNCH_CASE);
#undef NOTIFY_DISPATCH_LAUNCH_CASE
}
template<int kNumRanks>
__global__ void
cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
void** buffer_ptrs, int** barrier_signal_ptrs, int rank) {
// A simplified version for cached handles
barrier_block<kNumRanks, true>(barrier_signal_ptrs, rank);
// Copy and clean
auto thread_id = static_cast<int>(threadIdx.x), num_threads = static_cast<int>(blockDim.x);
auto ptr = static_cast<int*>(buffer_ptrs[rank]);
#pragma unroll
for (int i = thread_id; i < kNumRanks * kNumRanks; i += num_threads)
ptr[i] = rank_prefix_matrix[i];
#pragma unroll
for (int i = thread_id; i < num_memset_int; i += num_threads)
ptr[kNumRanks * kNumRanks + i] = 0;
// Barrier after cleaning
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
}
void cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
void** buffer_ptrs, int** barrier_signal_ptrs,
int rank, int num_ranks, cudaStream_t stream) {
#define CACHED_NOTIFY_DISPATCH_LAUNCH_CASE(ranks) \
LAUNCH_KERNEL(&cfg, cached_notify_dispatch<ranks>, \
rank_prefix_matrix, num_memset_int, buffer_ptrs, barrier_signal_ptrs, rank); \
break
SETUP_LAUNCH_CONFIG(1, 128, stream);
SWITCH_RANKS(CACHED_NOTIFY_DISPATCH_LAUNCH_CASE);
#undef CACHED_NOTIFY_DISPATCH_LAUNCH_CASE
}
template <int kNumRanks, int kNumThreads, int kNumTMABytesPerWarp>
__global__ void __launch_bounds__(kNumThreads, 1)
dispatch(int4* recv_x, float* recv_x_scales, int* recv_src_idx, int64_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
int* send_head, const int4* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
const bool* is_token_in_rank, const int* channel_prefix_matrix,
int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
int scale_token_stride, int scale_hidden_stride,
void** buffer_ptrs, int rank,
int num_max_send_tokens, int num_recv_buffer_tokens) {
const auto num_sms = static_cast<int>(gridDim.x), sm_id = static_cast<int>(blockIdx.x);
const auto thread_id = static_cast<int>(threadIdx.x), lane_id = get_lane_id();
const bool is_sender = sm_id % 2 == 0;
EP_DEVICE_ASSERT(num_sms % 2 == 0);
// Several warps are responsible for a single rank
const auto num_threads_per_rank = kNumThreads / kNumRanks;
const auto num_channels = num_sms / 2;
const auto responsible_rank = (static_cast<int>(thread_id)) / num_threads_per_rank;
// Even-numbered blocks for sending, odd-numbered blocks for receiving.
const auto responsible_channel = sm_id / 2;
int num_experts_per_rank = num_experts / kNumRanks;
EP_DEVICE_ASSERT(num_experts_per_rank > 0 or num_topk == 0);
EP_DEVICE_ASSERT(num_topk <= 32);
EP_DEVICE_ASSERT((topk_idx == nullptr) == (topk_weights == nullptr));
EP_DEVICE_ASSERT((recv_topk_idx == nullptr) == (recv_topk_weights == nullptr));
// Calculate pointers by the specific layout
// `rank_prefix_matrix`: kNumRanks * kNumRanks * sizeof(int)
auto ptr = reinterpret_cast<void*>(static_cast<int8_t*>(buffer_ptrs[is_sender ? responsible_rank : rank]) + kNumRanks * kNumRanks * sizeof(int));
int target_rank = is_sender ? rank : responsible_rank;
auto num_channels_total = num_channels * kNumRanks;
auto channel_rank_offset = responsible_channel * kNumRanks + target_rank;
// Channel buffer metadata
// Senders are responsible for tails, and receivers are responsible for heads
// Stored on the receiver side
// The retired signals are actually boolean flags, but to align with 16 bytes, we make it `int64_t`
// `start_offset`: kNumChannels * kNumRanks * sizeof(int)
// `end_offset`: kNumChannels * kNumRanks * sizeof(int)
// `head_idx`: kNumChannels * kNumRanks * sizeof(int)
// `tail_idx`: kNumChannels * kNumRanks * sizeof(int)
auto channel_start_offset = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_end_offset = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_head_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_tail_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
// Channel data buffers, stored on the receiver side
// `x_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * hidden_int4 * sizeof(int4)
// `src_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * sizeof(int)
// `topk_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(int64_t)
// `topk_weights_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(float)
// `x_scales_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_scales * sizeof(float)
auto channel_x_buffers = Buffer<int4>(ptr, num_channels_total * num_recv_buffer_tokens * hidden_int4, channel_rank_offset * num_recv_buffer_tokens * hidden_int4);
auto channel_src_idx_buffers = Buffer<int>(ptr, num_channels_total * num_recv_buffer_tokens, channel_rank_offset * num_recv_buffer_tokens);
auto channel_topk_idx_buffers = Buffer<int64_t>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
auto channel_topk_weights_buffers = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
auto channel_x_scales_buffers = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_scales, channel_rank_offset * num_recv_buffer_tokens * num_scales);
// TMA stuff
#ifndef DISABLE_SM90_FEATURES
extern __shared__ __align__(1024) uint8_t smem_buffer[];
auto half_hidden_int4 = hidden_int4 / 2;
auto half_hidden_bytes = half_hidden_int4 * static_cast<int>(sizeof(int4));
auto tma_buffer = smem_buffer + (thread_id / 32) * kNumTMABytesPerWarp;
auto tma_mbarrier = reinterpret_cast<uint64_t*>(tma_buffer + half_hidden_bytes);
uint32_t tma_phase = 0;
if (lane_id == 0) {
mbarrier_init(tma_mbarrier, 1);
fence_view_async_shared();
fence_barrier_init();
EP_DEVICE_ASSERT(hidden_int4 % 2 == 0 and half_hidden_bytes + sizeof(uint64_t) <= kNumTMABytesPerWarp);
}
__syncwarp();
#endif
if (is_sender) {
// Workers for sending
constexpr int num_send_warps = kNumThreads / 32;
constexpr int num_send_warps_per_rank = num_send_warps / kNumRanks;
const auto send_thread_id = thread_id;
const auto send_warp_id_in_rank = send_thread_id % num_threads_per_rank / 32;
EP_DEVICE_ASSERT(kNumRanks <= 32);
EP_DEVICE_ASSERT(num_send_warps % kNumRanks == 0);
// Send offset by `-value - 1`, e.g. 0 -> -1, 1 -> -2
// NOTES: this is for distinguishing zero tokens
if (lane_id == 0 and send_warp_id_in_rank == 0) {
int value = responsible_channel > 0 ? channel_prefix_matrix[responsible_rank * num_channels + responsible_channel - 1] : 0;
st_relaxed_sys_global(channel_start_offset.buffer(), -value - 1);
value = channel_prefix_matrix[responsible_rank * num_channels + responsible_channel];
st_relaxed_sys_global(channel_end_offset.buffer(), -value - 1);
}
__syncwarp();
// Get tasks
int token_start_idx, token_end_idx;
get_channel_task_range(num_tokens, num_channels, responsible_channel, token_start_idx, token_end_idx);
// Iterate over all tokens and send by chunks
int cached_channel_tail_idx = 0;
for (int64_t token_idx = token_start_idx; token_idx < token_end_idx; ) {
// Check destination queue emptiness, or wait for a buffer to be released (rare cases)
// NOTES: the head index received by different warps may not be the same
auto start_time = clock64();
while (lane_id == 0) {
// NOTES: we only consider the worst case, because counting the real numbers is time-consuming
int num_used_slots = cached_channel_tail_idx - ld_volatile_global(channel_head_idx.buffer());
if (num_recv_buffer_tokens - num_used_slots >= num_max_send_tokens)
break;
// Rare cases to loop again
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for dispatch senders, rank %d, responsible_channel = %d\n", rank, responsible_channel);
trap();
}
}
__syncwarp();
int chunk_token_idx = 0;
while (chunk_token_idx < num_max_send_tokens and token_idx < token_end_idx) {
// NOTES: for the same token, the warp assigned to save `send_head` may be different from the warp assigned to send the following data
if (lane_id == 0 and token_idx % num_send_warps_per_rank == send_warp_id_in_rank)
send_head[token_idx * kNumRanks + responsible_rank] = is_token_in_rank[token_idx * kNumRanks + responsible_rank] ? cached_channel_tail_idx : -1;
// Skip if not selected
if (not is_token_in_rank[token_idx * kNumRanks + responsible_rank]) {
token_idx ++;
continue;
}
// Get an empty slot
int dst_slot_idx = (cached_channel_tail_idx ++) % num_recv_buffer_tokens;
if (cached_channel_tail_idx % num_send_warps_per_rank == send_warp_id_in_rank) {
// Copy data
auto shifted_channel_x_buffers = channel_x_buffers.buffer() + dst_slot_idx * hidden_int4;
auto shifted_x = x + token_idx * hidden_int4;
UNROLLED_WARP_COPY(5, lane_id, hidden_int4, shifted_channel_x_buffers, shifted_x, __ldg, st_na_global);
// Copy source index
if (lane_id == 0)
channel_src_idx_buffers[dst_slot_idx] = static_cast<int>(token_idx);
// Copy `topk_idx` and `topk_weights` with transformed index
if (lane_id < num_topk) {
// Top-k index
int recv_expert_begin = responsible_rank * num_experts_per_rank, recv_expert_end = (responsible_rank + 1) * num_experts_per_rank;
auto idx_value = __ldg(topk_idx + token_idx * num_topk + lane_id);
idx_value = (idx_value >= recv_expert_begin and idx_value < recv_expert_end) ? idx_value - recv_expert_begin : -1;
channel_topk_idx_buffers[dst_slot_idx * num_topk + lane_id] = idx_value;
// Top-k weights
auto weight_value = __ldg(topk_weights + token_idx * num_topk + lane_id);
weight_value = (idx_value >= 0) ? weight_value : 0.0f;
channel_topk_weights_buffers[dst_slot_idx * num_topk + lane_id] = weight_value;
}
// Copy `x_scales`
#pragma unroll
for (int i = lane_id; i < num_scales; i += 32) {
auto offset = token_idx * scale_token_stride + i * scale_hidden_stride;
channel_x_scales_buffers[dst_slot_idx * num_scales + i] = __ldg(x_scales + offset);
}
}
// Move token index
chunk_token_idx ++, token_idx ++;
}
// Move tail index
// NOTES: here all warps should share the same new tail
asm volatile("bar.sync %0, %1;" :: "r"(responsible_rank), "r"(num_threads_per_rank));
if (send_warp_id_in_rank == 0 and lane_id == 0)
st_release_sys_global(channel_tail_idx.buffer(), cached_channel_tail_idx);
}
} else {
// Workers for receiving and copying into buffer
constexpr int num_recv_warps = kNumThreads / 32;
constexpr int num_recv_warps_per_rank = num_recv_warps / kNumRanks;
const auto recv_thread_id = thread_id;
const auto recv_thread_id_in_rank = recv_thread_id % num_threads_per_rank;
const auto recv_warp_id_in_rank = recv_thread_id_in_rank / 32;
EP_DEVICE_ASSERT(kNumRanks <= 32);
EP_DEVICE_ASSERT(recv_thread_id >= 0 and num_recv_warps % kNumRanks == 0);
// Calculate offset first
auto rank_prefix_matrix = static_cast<int*>(buffer_ptrs[rank]);
int rank_offset = responsible_rank > 0 ? rank_prefix_matrix[(responsible_rank - 1) * kNumRanks + rank] : 0;
// Receive channel offset
int total_offset, num_tokens_to_recv;
while (lane_id == 0 and (total_offset = ld_volatile_global(channel_start_offset.buffer())) == 0);
while (lane_id == 0 and (num_tokens_to_recv = ld_volatile_global(channel_end_offset.buffer())) == 0);
if (lane_id == 0) {
total_offset = -total_offset - 1, num_tokens_to_recv = -num_tokens_to_recv - 1;
if (recv_warp_id_in_rank == 0)
recv_channel_offset[responsible_rank * num_channels + responsible_channel] = total_offset;
num_tokens_to_recv -= total_offset;
}
total_offset = __shfl_sync(0xffffffff, total_offset, 0);
total_offset += rank_offset;
num_tokens_to_recv = __shfl_sync(0xffffffff, num_tokens_to_recv, 0);
// Shared tail indices for different warps
__shared__ volatile int shared_channel_tail_idx[kNumRanks];
auto start_time = clock64();
int cached_channel_head_idx = 0, cached_channel_tail_idx = 0;
while (num_tokens_to_recv > 0) {
// NOTES: unlike the sender, the receiver must ensure that the tail indices held by different warps are the same
while (recv_thread_id_in_rank == 0) {
cached_channel_tail_idx = ld_acquire_sys_global(channel_tail_idx.buffer());
// Ready to copy
if (cached_channel_head_idx != cached_channel_tail_idx) {
shared_channel_tail_idx[responsible_rank] = cached_channel_tail_idx;
break;
}
// Timeout check
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for dispatch receivers, rank %d, responsible_channel = %d, tokens remained: %d\n", rank, responsible_channel, num_tokens_to_recv);
trap();
}
}
// Synchronize queue tail
asm volatile("bar.sync %0, %1;" :: "r"(responsible_rank), "r"(num_threads_per_rank));
cached_channel_tail_idx = shared_channel_tail_idx[responsible_rank];
// Copy data
int num_recv_tokens = cached_channel_tail_idx - cached_channel_head_idx;
for (int chunk_idx = recv_warp_id_in_rank; chunk_idx < num_recv_tokens; chunk_idx += num_recv_warps_per_rank) {
int token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens;
auto shifted_buffer_x_int4 = channel_x_buffers.buffer() + token_idx_in_buffer * hidden_int4;
auto shifted_recv_x_int4 = recv_x + static_cast<int64_t>(total_offset + chunk_idx) * hidden_int4;
#ifndef DISABLE_SM90_FEATURES
#pragma unroll
for (int i = 0; i < 2; ++ i) if (lane_id == 0) {
tma_store_wait();
tma_load_1d(tma_buffer, shifted_buffer_x_int4 + i * half_hidden_int4, tma_mbarrier, half_hidden_bytes);
mbarrier_arrive_and_expect_tx(tma_mbarrier, half_hidden_bytes);
mbarrier_wait(tma_mbarrier, tma_phase);
tma_store_1d(tma_buffer, shifted_recv_x_int4 + i * half_hidden_int4, half_hidden_bytes, false);
}
__syncwarp();
#else
UNROLLED_WARP_COPY(5, lane_id, hidden_int4, shifted_recv_x_int4, shifted_buffer_x_int4,
ld_nc_global, st_na_global);
#endif
}
// Copy `src_idx`
#pragma unroll 4
for (int chunk_idx = cached_channel_head_idx + recv_thread_id_in_rank; chunk_idx < cached_channel_tail_idx; chunk_idx += 32 * num_recv_warps_per_rank)
recv_src_idx[total_offset + chunk_idx - cached_channel_head_idx] = ld_nc_global(channel_src_idx_buffers.buffer() + chunk_idx % num_recv_buffer_tokens);
// Copy `topk_idx` and `topk_weights`
#pragma unroll 4
for (int idx = recv_thread_id_in_rank; idx < num_recv_tokens * num_topk; idx += 32 * num_recv_warps_per_rank) {
int chunk_idx = idx / num_topk, token_topk_idx = idx % num_topk;
int token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens;
auto recv_idx = static_cast<int64_t>(total_offset + chunk_idx) * num_topk + token_topk_idx;
auto buffer_idx = token_idx_in_buffer * num_topk + token_topk_idx;
recv_topk_idx[recv_idx] = ld_nc_global(channel_topk_idx_buffers.buffer() + buffer_idx);
recv_topk_weights[recv_idx] = ld_nc_global(channel_topk_weights_buffers.buffer() + buffer_idx);
}
// Copy `x_scales`
#pragma unroll 4
for (int i = recv_thread_id_in_rank; i < num_recv_tokens * num_scales; i += 32 * num_recv_warps_per_rank) {
int chunk_idx = i / num_scales, scales_idx = i % num_scales;
int token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens;
recv_x_scales[static_cast<int64_t>(total_offset + chunk_idx) * num_scales + scales_idx] =
ld_nc_global(channel_x_scales_buffers.buffer() + token_idx_in_buffer * num_scales + scales_idx);
}
// Move queue
cached_channel_head_idx += num_recv_tokens;
total_offset += num_recv_tokens;
asm volatile("bar.sync %0, %1;" :: "r"(responsible_rank), "r"(num_threads_per_rank));
if (recv_warp_id_in_rank == num_recv_warps_per_rank - 1 and lane_id == 0)
st_relaxed_sys_global(channel_head_idx.buffer(), cached_channel_head_idx);
// Exit
num_tokens_to_recv -= num_recv_tokens;
}
// Make TMA store visible to the next kernel
#ifndef DISABLE_SM90_FEATURES
if (lane_id == 0)
tma_store_wait();
#endif
}
// Clean unused `recv_topk_idx` as -1
if (num_worst_tokens > 0) {
auto rank_prefix_matrix = static_cast<int*>(buffer_ptrs[rank]);
const auto num_recv_tokens = rank_prefix_matrix[(kNumRanks - 1) * kNumRanks + rank];
const auto clean_start = num_recv_tokens * num_topk + sm_id * kNumThreads;
const auto clean_end = num_worst_tokens * num_topk;
const auto clean_stride = num_sms * kNumThreads;
#pragma unroll
for (int i = clean_start + thread_id; i < clean_end; i += clean_stride)
recv_topk_idx[i] = -1;
}
}
void dispatch(void* recv_x, float* recv_x_scales, int* recv_src_idx, int64_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
int* send_head, const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
const bool* is_token_in_rank, const int* channel_prefix_matrix,
int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
int scale_token_stride, int scale_hidden_stride,
void** buffer_ptrs, int rank, int num_ranks,
cudaStream_t stream, int num_sms, int num_max_send_tokens, int num_recv_buffer_tokens) {
constexpr int kNumThreads = 768;
constexpr int kNumTMABytesPerWarp = 8192;
#ifndef DISABLE_SM90_FEATURES
constexpr int smem_size = kNumTMABytesPerWarp * (kNumThreads / 32);
#endif
// Make sure never OOB
EP_HOST_ASSERT(static_cast<int64_t>(num_scales) * scale_hidden_stride < std::numeric_limits<int>::max());
#define DISPATCH_LAUNCH_CASE(ranks) { \
auto kernel = dispatch<ranks, kNumThreads, kNumTMABytesPerWarp>; \
SET_SHARED_MEMORY_FOR_TMA(kernel); \
LAUNCH_KERNEL(&cfg, kernel, \
reinterpret_cast<int4*>(recv_x), recv_x_scales, recv_src_idx, recv_topk_idx, recv_topk_weights, recv_channel_offset, \
send_head, reinterpret_cast<const int4*>(x), x_scales, topk_idx, topk_weights, \
is_token_in_rank, channel_prefix_matrix, \
num_tokens, num_worst_tokens, hidden_int4, num_topk, num_experts, num_scales, \
scale_token_stride, scale_hidden_stride, \
buffer_ptrs, rank, \
num_max_send_tokens, num_recv_buffer_tokens); \
} break
// Even-numbered blocks for sending, odd-numbered blocks for receiving.
EP_HOST_ASSERT(num_sms % 2 == 0);
SETUP_LAUNCH_CONFIG(num_sms, kNumThreads, stream);
SWITCH_RANKS(DISPATCH_LAUNCH_CASE);
#undef DISPATCH_LAUNCH_CASE
}
template<int kNumRanks>
__global__ void
cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels, int num_recv_tokens, int num_memset_int,
int** barrier_signal_ptrs, int rank) {
const auto sm_id = static_cast<int>(blockIdx.x);
if (sm_id == 0) {
// Barrier before cleaning
barrier_block<kNumRanks, true>(barrier_signal_ptrs, rank);
// Clean
auto thread_id = static_cast<int>(threadIdx.x), num_threads = static_cast<int>(blockDim.x);
auto ptr = static_cast<int*>(buffer_ptrs[rank]);
#pragma unroll
for (int i = thread_id; i < num_memset_int; i += num_threads)
ptr[i] = 0;
// Barrier after cleaning
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
} else {
const auto channel_id = sm_id - 1;
const auto thread_id = static_cast<int>(threadIdx.x);
const auto rank_id = thread_id / 32;
const auto lane_id = thread_id % 32;
if (rank_id >= kNumRanks)
return;
int token_start_idx, token_end_idx;
get_channel_task_range(num_recv_tokens, num_channels, channel_id, token_start_idx, token_end_idx);
// NOTES: `1 << 25` is a heuristic large number
int last_head = 1 << 25;
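// NOTES: scanning backwards from the tail, tokens that were never dispatched to this rank (head < 0) are rewritten as `-last_head - 1`, where `last_head` is the head of the nearest later token that was sent, so the combine receiver can skip them while still advancing the queue head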
#pragma unroll
for (int token_idx_tail = token_end_idx - 1; token_idx_tail >= token_start_idx; token_idx_tail -= 32) {
int token_idx = token_idx_tail - lane_id, expected_head = 0;
auto current_head = (token_idx >= token_start_idx) ? __ldg(send_head + token_idx * kNumRanks + rank_id) : -1;
for (int i = 0; i < min(32, token_idx_tail - token_start_idx + 1); ++ i) {
const int head = __shfl_sync(0xffffffff, current_head, i);
if (head < 0) {
if (lane_id == i)
expected_head = -last_head - 1;
} else {
last_head = head;
}
}
if (current_head < 0 and token_idx >= token_start_idx)
send_head[token_idx * kNumRanks + rank_id] = expected_head;
}
}
}
void cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels,
int num_recv_tokens, int num_memset_int,
int** barrier_signal_ptrs, int rank, int num_ranks,
cudaStream_t stream) {
#define CACHED_NOTIFY_COMBINE(ranks) \
LAUNCH_KERNEL(&cfg, cached_notify_combine<ranks>, \
buffer_ptrs, send_head, num_channels, num_recv_tokens, num_memset_int, barrier_signal_ptrs, rank); \
break
const int num_threads = std::max(128, 32 * num_ranks);
EP_HOST_ASSERT(num_ranks <= num_threads);
EP_HOST_ASSERT(num_threads <= 1024);
EP_HOST_ASSERT(1 + num_channels <= num_channels * 2);
SETUP_LAUNCH_CONFIG(1 + num_channels, num_threads, stream);
SWITCH_RANKS(CACHED_NOTIFY_COMBINE);
#undef CACHED_NOTIFY_COMBINE
}
template<typename dtype_t, int kNumRanks, int kNumThreads, int kNumTMABytesPerWarp>
__global__ void __launch_bounds__(kNumThreads, 1)
combine(dtype_t* recv_x, float* recv_topk_weights,
const dtype_t* x, const float* topk_weights,
const dtype_t* bias_0, const dtype_t* bias_1,
const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
void** buffer_ptrs, int rank,
int num_max_send_tokens, int num_recv_buffer_tokens) {
const auto num_sms = static_cast<int>(gridDim.x);
const auto thread_id = static_cast<int>(threadIdx.x);
const auto sm_id = static_cast<int>(blockIdx.x), lane_id = get_lane_id();
const auto num_channels = num_sms / 2;
const bool is_sender = sm_id % 2 == 0;
const int responsible_channel = sm_id / 2;
EP_DEVICE_ASSERT(num_topk <= 32);
constexpr int kDtypePerInt4 = sizeof(int4) / sizeof(dtype_t);
int hidden_int4 = hidden * sizeof(dtype_t) / sizeof(int4);
auto x_int4 = reinterpret_cast<const int4*>(x);
auto bias_0_int4 = reinterpret_cast<const int4*>(bias_0);
auto bias_1_int4 = reinterpret_cast<const int4*>(bias_1);
auto recv_int4 = reinterpret_cast<int4*>(recv_x);
// TMA stuffs
#ifndef DISABLE_SM90_FEATURES
extern __shared__ __align__(1024) uint8_t smem_buffer[];
auto tma_buffer = smem_buffer + (thread_id / 32) * kNumTMABytesPerWarp;
#endif
if (is_sender) {
// Workers for sending
// Several warps are responsible for a single rank
constexpr int num_send_warps_per_rank = (kNumThreads / 32) / kNumRanks;
constexpr int num_send_warps = num_send_warps_per_rank * kNumRanks;
const auto num_threads_per_rank = num_send_warps_per_rank * 32;
const auto send_thread_id = thread_id;
const auto send_warp_id = send_thread_id / 32;
const auto send_rank_id = (responsible_channel + send_warp_id) % kNumRanks;
const auto send_warp_id_in_rank = send_warp_id / kNumRanks;
EP_STATIC_ASSERT(num_send_warps * 32 == kNumThreads, "Invalid warp count");
// Calculate pointers by the specific layout
auto ptr = reinterpret_cast<void*>(static_cast<int8_t*>(buffer_ptrs[send_rank_id]));
auto num_channels_total = num_channels * kNumRanks;
auto channel_rank_offset = responsible_channel * kNumRanks + rank;
// Channel metadata
// `head_idx`: kNumChannels * kNumRanks * sizeof(int)
// `tail_idx`: kNumChannels * kNumRanks * sizeof(int)
// `x_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * hidden_int4 * sizeof(int4)
// `src_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * sizeof(int)
// `topk_weights_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(float)
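// NOTES: the `Buffer` constructions below are assumed to carve these regions out of `ptr` in exactly the order listed above, each advancing the pointer past its own region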
auto channel_head_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_tail_idx = Buffer<int>(ptr, num_channels_total, channel_rank_offset);
auto channel_x_buffers = Buffer<int4>(ptr, num_channels_total * num_recv_buffer_tokens * hidden_int4, channel_rank_offset * num_recv_buffer_tokens * hidden_int4);
auto channel_src_idx_buffers = Buffer<int>(ptr, num_channels_total * num_recv_buffer_tokens, channel_rank_offset * num_recv_buffer_tokens);
auto channel_topk_weights_buffers = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
// Get tasks
// NOTES: `channel_offset` is already shifted
int rank_offset = send_rank_id > 0 ? rank_prefix_matrix[(send_rank_id - 1) * kNumRanks + rank] : 0;
int num_rank_tokens = rank_prefix_matrix[send_rank_id * kNumRanks + rank] - rank_offset;
int channel_offset = channel_prefix_matrix[send_rank_id * num_channels + responsible_channel];
int num_channel_tokens = (responsible_channel == num_channels - 1 ? num_rank_tokens : channel_prefix_matrix[send_rank_id * num_channels + responsible_channel + 1]) - channel_offset;
int token_start_idx = rank_offset + channel_offset, token_end_idx = rank_offset + channel_offset + num_channel_tokens;
// Iterate over all tokens and send by chunks
int current_channel_tail_idx = 0;
for (int64_t token_idx = token_start_idx; token_idx < token_end_idx; ) {
// Check that the destination queue has enough free slots, or wait for buffers to be released (rare cases)
auto start_time = clock64();
int num_round_tokens = min(num_max_send_tokens, token_end_idx - static_cast<int>(token_idx));
while (lane_id == 0) {
// NOTES: we only consider the worst case, since counting the exact number of used slots is time-consuming
int num_used_slots = current_channel_tail_idx - ld_volatile_global(channel_head_idx.buffer());
if (num_recv_buffer_tokens - num_used_slots >= num_round_tokens)
break;
// Rare cases to loop again
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for combine senders, rank %d, responsible_channel = %d\n", rank, responsible_channel);
trap();
}
}
__syncwarp();
// Send by chunk
#pragma unroll
for (int i = send_warp_id_in_rank; i < num_round_tokens; i += num_send_warps_per_rank) {
// Get an empty slot
int dst_slot_idx = (current_channel_tail_idx + i) % num_recv_buffer_tokens;
// Copy data
auto shifted_x_buffers = channel_x_buffers.buffer() + dst_slot_idx * hidden_int4;
auto shifted_x = x_int4 + (token_idx + i) * hidden_int4;
UNROLLED_WARP_COPY(4, lane_id, hidden_int4, shifted_x_buffers, shifted_x, ld_nc_global, st_na_global);
// Send source index
if (lane_id == 0)
channel_src_idx_buffers[dst_slot_idx] = __ldg(src_idx + token_idx + i);
// Send `topk_weights`
if (num_topk > 0 and lane_id < num_topk)
channel_topk_weights_buffers[dst_slot_idx * num_topk + lane_id] = __ldg(topk_weights + (token_idx + i) * num_topk + lane_id);
}
token_idx += num_round_tokens;
current_channel_tail_idx += num_round_tokens;
// Move tail index
asm volatile("bar.sync %0, %1;" :: "r"(send_rank_id), "r"(num_threads_per_rank));
if (lane_id == 0 and send_warp_id_in_rank == 0)
st_release_sys_global(channel_tail_idx.buffer(), current_channel_tail_idx);
}
} else {
// Workers for receiving
// One warp for moving the queue head, others for reduction
constexpr int num_recv_warps = kNumThreads / 32;
const auto recv_warp_id = thread_id / 32;
EP_DEVICE_ASSERT(kNumRanks <= 32 and kNumThreads > 32);
EP_DEVICE_ASSERT(thread_id >= 0 and kNumThreads % 32 == 0);
// Shared head, tail and retired flags for receiver warps
__shared__ volatile int warp_channel_head_idx[num_recv_warps][kNumRanks];
__shared__ volatile int channel_tail_idx[kNumRanks];
__shared__ volatile bool warp_retired[num_recv_warps];
if (thread_id < num_recv_warps)
warp_retired[thread_id] = false;
if (lane_id < kNumRanks)
warp_channel_head_idx[recv_warp_id][lane_id] = 0;
if (thread_id < kNumRanks)
channel_tail_idx[thread_id] = 0;
asm volatile("bar.sync 0, %0;" :: "r"(kNumThreads));
if (thread_id < 32) {
int* channel_head_idx_ptr = static_cast<int*>(buffer_ptrs[rank]) + responsible_channel * kNumRanks + lane_id;
int* channel_tail_idx_ptr = channel_head_idx_ptr + num_channels * kNumRanks;
// Queue head updater
int last_head = 0;
while (lane_id < kNumRanks) {
// Check retired
bool retired = true;
#pragma unroll
for (int i = 1; i < num_recv_warps; ++ i)
retired = retired and warp_retired[i];
if (retired)
break;
// Update queue tail
channel_tail_idx[lane_id] = ld_acquire_sys_global(channel_tail_idx_ptr);
// Update minimum head
int min_head = std::numeric_limits<int>::max();
#pragma unroll
for (int i = 1; i < num_recv_warps; ++ i) if (not warp_retired[i])
min_head = min(min_head, warp_channel_head_idx[i][lane_id]);
if (min_head != std::numeric_limits<int>::max() and min_head > last_head)
st_relaxed_sys_global(channel_head_idx_ptr, last_head = min_head);
}
} else {
// Receivers
// Channel metadata
// All lanes use the data buffers, but only the lane matching each rank uses `head/tail/src_idx`
Buffer<int4> channel_x_buffers[kNumRanks];
Buffer<float> channel_topk_weights_buffers[kNumRanks];
// Calculate pointers by the specific layout
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i) {
auto channel_rank_offset = responsible_channel * kNumRanks + i;
auto num_channels_total = num_channels * kNumRanks;
// `head_idx` & `tail_idx`: kNumChannels * kNumRanks * sizeof(int)
auto ptr = reinterpret_cast<void*>(static_cast<int8_t*>(buffer_ptrs[rank]) + 2 * num_channels * kNumRanks * sizeof(int));
// `x_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * hidden_int4 * sizeof(int4)
channel_x_buffers[i] = Buffer<int4>(ptr, num_channels_total * num_recv_buffer_tokens * hidden_int4, channel_rank_offset * num_recv_buffer_tokens * hidden_int4);
// `src_idx_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * sizeof(int)
ptr = reinterpret_cast<void*>(static_cast<int8_t*>(ptr) + num_channels_total * num_recv_buffer_tokens * sizeof(int));
// `topk_weights_buffers`: kNumChannels * kNumRanks * num_recv_buffer_tokens * num_topk * sizeof(float)
channel_topk_weights_buffers[i] = Buffer<float>(ptr, num_channels_total * num_recv_buffer_tokens * num_topk, channel_rank_offset * num_recv_buffer_tokens * num_topk);
}
// The same tokens as the dispatch process
int token_start_idx, token_end_idx;
get_channel_task_range(num_recv_tokens, num_channels, responsible_channel, token_start_idx, token_end_idx);
// Iterate over all tokens and combine
for (int64_t token_idx = token_start_idx + recv_warp_id - 1; token_idx < token_end_idx; token_idx += num_recv_warps - 1) {
// Read expected head
int expected_head = -1;
if (lane_id < kNumRanks)
expected_head = ld_nc_global(send_head + token_idx * kNumRanks + lane_id);
auto start_time = clock64();
while (__any_sync(0xffffffff, channel_tail_idx[lane_id] <= expected_head and expected_head >= 0)) {
// Timeout check
if (clock64() - start_time > NUM_TIMEOUT_CYCLES) {
printf("DeepEP timeout for combine receivers, rank %d, responsible_channel = %d, expect = %d\n", rank, responsible_channel, expected_head);
trap();
}
}
__syncwarp();
// Broadcast current heads
int num_topk_ranks = 0, topk_ranks[kNumRanks], slot_indices[kNumRanks];
#pragma unroll
for (int i = 0; i < kNumRanks; ++ i) {
auto expected_head_i = __shfl_sync(0xffffffff, expected_head, i);
if (expected_head_i >= 0) {
slot_indices[num_topk_ranks] = expected_head_i % num_recv_buffer_tokens;
topk_ranks[num_topk_ranks ++] = i;
}
}
// Wait shared memory release
#ifndef DISABLE_SM90_FEATURES
if (lane_id == 0)
tma_store_wait();
__syncwarp();
#endif
// Reduce data with pipeline
constexpr int kNumStages = 8;
EP_STATIC_ASSERT(kNumStages * 32 * sizeof(int4) <= kNumTMABytesPerWarp, "Invalid count");
#pragma unroll
for (int i = lane_id; i < hidden_int4; i += 32) {
// Read bias
// TODO: make it as a template
int4 bias_0_value_int4 = bias_0_int4 != nullptr ? __ldg(bias_0_int4 + token_idx * hidden_int4 + i) : make_int4(0, 0, 0, 0);
int4 bias_1_value_int4 = bias_1_int4 != nullptr ? __ldg(bias_1_int4 + token_idx * hidden_int4 + i) : make_int4(0, 0, 0, 0);
// Read buffers
int4 recv_value_int4[kNumRanks];
#pragma unroll
for (int j = 0; j < num_topk_ranks; ++ j)
recv_value_int4[j] = ld_nc_global(channel_x_buffers[topk_ranks[j]].buffer() + slot_indices[j] * hidden_int4 + i);
// Reduce bias
float values[kDtypePerInt4];
auto bias_0_values = reinterpret_cast<const dtype_t*>(&bias_0_value_int4);
auto bias_1_values = reinterpret_cast<const dtype_t*>(&bias_1_value_int4);
#pragma unroll
for (int j = 0; j < kDtypePerInt4; ++ j)
values[j] = static_cast<float>(bias_0_values[j]) + static_cast<float>(bias_1_values[j]);
// Reduce all-to-all results
#pragma unroll
for (int j = 0; j < num_topk_ranks; ++ j) {
auto recv_value_dtypes = reinterpret_cast<const dtype_t*>(&recv_value_int4[j]);
#pragma unroll
for (int k = 0; k < kDtypePerInt4; ++ k)
values[k] += static_cast<float>(recv_value_dtypes[k]);
}
// Cast back to `dtype_t`
int4 out_int4;
auto out_dtypes = reinterpret_cast<dtype_t*>(&out_int4);
#pragma unroll
for (int j = 0; j < kDtypePerInt4; ++ j)
out_dtypes[j] = static_cast<dtype_t>(values[j]);
#ifndef DISABLE_SM90_FEATURES
// Wait TMA arrival
if (lane_id == 0)
tma_store_wait<kNumStages - 1>();
__syncwarp();
// Write into TMA buffer
auto tma_stage_idx = (i / 32) % kNumStages;
reinterpret_cast<int4*>(tma_buffer)[tma_stage_idx * 32 + lane_id] = out_int4;
// Issue TMA
tma_store_fence();
__syncwarp();
if (lane_id == 0) {
auto tma_bytes = min(32, hidden_int4 - i) * static_cast<int>(sizeof(int4));
tma_store_1d(reinterpret_cast<int4*>(tma_buffer) + tma_stage_idx * 32,
recv_int4 + token_idx * hidden_int4 + i, tma_bytes, false);
}
__syncwarp();
#else
recv_int4[token_idx * hidden_int4 + i] = out_int4;
#endif
}
// Reduce `topk_weights`
if (lane_id < num_topk) {
float value = 0;
#pragma unroll
for (int i = 0; i < num_topk_ranks; ++ i)
value += ld_nc_global(channel_topk_weights_buffers[topk_ranks[i]].buffer() + slot_indices[i] * num_topk + lane_id);
recv_topk_weights[token_idx * num_topk + lane_id] = value;
}
// Update head
if (lane_id < kNumRanks)
warp_channel_head_idx[recv_warp_id][lane_id] = (expected_head < 0) ? -expected_head - 1 : expected_head + 1;
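// NOTES: a negative `expected_head` is the placeholder written by `cached_notify_combine`; decoding it with `-expected_head - 1` recovers the next head this rank will reach, so the head-updater warp can advance past tokens that were never dispatched here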
}
// Retired
__syncwarp();
if (lane_id == 0)
warp_retired[recv_warp_id] = true;
// Make TMA store visible to the next kernel
#ifndef DISABLE_SM90_FEATURES
if (lane_id == 0)
tma_store_wait();
#endif
}
}
}
void combine(cudaDataType_t type,
void* recv_x, float* recv_topk_weights,
const void* x, const float* topk_weights,
const void* bias_0, const void* bias_1,
const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
void** buffer_ptrs, int rank, int num_ranks,
cudaStream_t stream, int num_sms,
int num_max_send_tokens, int num_recv_buffer_tokens) {
constexpr int kNumThreads = 768;
constexpr int kNumTMABytesPerWarp = 4096;
#ifndef DISABLE_SM90_FEATURES
constexpr int smem_size = kNumTMABytesPerWarp * (kNumThreads / 32);
#endif
#define COMBINE_LAUNCH_CASE(dtype, ranks) { \
auto kernel = combine<dtype, ranks, kNumThreads, kNumTMABytesPerWarp>; \
SET_SHARED_MEMORY_FOR_TMA(kernel); \
LAUNCH_KERNEL(&cfg, kernel, \
reinterpret_cast<dtype*>(recv_x), recv_topk_weights, \
reinterpret_cast<const dtype*>(x), topk_weights, \
reinterpret_cast<const dtype*>(bias_0), reinterpret_cast<const dtype*>(bias_1), \
src_idx, rank_prefix_matrix, channel_prefix_matrix, \
send_head, num_tokens, num_recv_tokens, hidden, num_topk, \
buffer_ptrs, rank, \
num_max_send_tokens, num_recv_buffer_tokens); } \
break
#define COMBINE_DTYPE_LAUNCH_CASE(dtype) SWITCH_RANKS_WITH_DTYPE(dtype, COMBINE_LAUNCH_CASE); break
// Even-numbered blocks for sending, odd-numbered blocks for receiving
EP_HOST_ASSERT(num_sms % 2 == 0);
EP_HOST_ASSERT(kNumThreads >= num_ranks * 32);
SETUP_LAUNCH_CONFIG(num_sms, kNumThreads, stream);
SWITCH_TYPES(COMBINE_DTYPE_LAUNCH_CASE);
#undef COMBINE_DTYPE_LAUNCH_CASE
#undef COMBINE_LAUNCH_CASE
}
} // namespace intranode
} // namespace deep_ep

View File

@ -0,0 +1,89 @@
#pragma once
#include "configs.cuh"
#include "exception.cuh"
#ifndef SETUP_LAUNCH_CONFIG
#ifndef DISABLE_SM90_FEATURES
#define SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream) \
cudaLaunchConfig_t cfg = {(num_sms), (num_threads), 0, stream, nullptr, 0}; \
cudaLaunchAttribute attr[1]; \
attr[0].id = cudaLaunchAttributeCooperative; \
attr[0].val.cooperative = 1; \
cfg.attrs = attr; \
cfg.numAttrs = 1
#else
#define SETUP_LAUNCH_CONFIG(sms, threads, stream) \
int __num_sms = (sms); \
int __num_threads = (threads); \
auto __stream = (stream)
#endif
#endif
#ifndef LAUNCH_KERNEL
#ifndef DISABLE_SM90_FEATURES
#define LAUNCH_KERNEL(config, kernel, ...) CUDA_CHECK(cudaLaunchKernelEx(config, kernel, ##__VA_ARGS__))
#else
#define LAUNCH_KERNEL(config, kernel, ...) \
do { \
kernel<<<__num_sms, __num_threads, 0, __stream>>>(__VA_ARGS__); \
cudaError_t e = cudaGetLastError(); \
if (e != cudaSuccess) { \
EPException cuda_exception("CUDA", __FILE__, __LINE__, cudaGetErrorString(e)); \
fprintf(stderr, "%s\n", cuda_exception.what()); \
throw cuda_exception; \
} \
} while (0)
#endif
#endif
#ifndef SET_SHARED_MEMORY_FOR_TMA
#ifndef DISABLE_SM90_FEATURES
#define SET_SHARED_MEMORY_FOR_TMA(kernel) \
EP_HOST_ASSERT(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size) == cudaSuccess); \
cfg.dynamicSmemBytes = smem_size;
#else
#define SET_SHARED_MEMORY_FOR_TMA(kernel) void()
#endif
#endif
#define SWITCH_RANKS(case_macro) \
switch (num_ranks) { \
case 2: case_macro(2); \
case 4: case_macro(4); \
case 8: case_macro(8); \
default: EP_HOST_ASSERT(false and "Unsupported ranks"); \
} while (false)
#define SWITCH_RDMA_RANKS(case_macro) \
switch (num_ranks / NUM_MAX_NVL_PEERS) { \
case 2: case_macro(2); \
case 4: case_macro(4); \
case 8: case_macro(8); \
case 16: case_macro(16); \
default: EP_HOST_ASSERT(false and "Unsupported RDMA ranks"); \
} while (false)
#define SWITCH_RANKS_WITH_DTYPE(dtype, case_macro) \
switch (num_ranks) { \
case 2: case_macro(dtype, 2); \
case 4: case_macro(dtype, 4); \
case 8: case_macro(dtype, 8); \
default: EP_HOST_ASSERT(false && "Unsupported ranks"); \
} while (false)
#define SWITCH_TYPES(case_macro) \
switch (type) { \
case CUDA_R_16BF: case_macro(nv_bfloat16); \
default: EP_HOST_ASSERT(false && "Unsupported type"); \
} while (false)
#define SWITCH_HIDDEN(case_macro) \
switch (hidden) { \
case 2048: case_macro(2048); \
case 2560: case_macro(2560); \
case 4096: case_macro(4096); \
case 5120: case_macro(5120); \
case 7168: case_macro(7168); \
default: EP_HOST_ASSERT(false && "Unsupported hidden"); \
} while (false)
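// Usage sketch (illustrative only, mirroring the callers in this repo; `my_kernel` and its arguments
// are placeholders): define a per-rank case macro, set up the launch config, and let the switch
// instantiate the matching template:
//   #define MY_LAUNCH_CASE(ranks) LAUNCH_KERNEL(&cfg, my_kernel<ranks>, args...); break
//   SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream);
//   SWITCH_RANKS(MY_LAUNCH_CASE);
//   #undef MY_LAUNCH_CASE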

View File

@ -0,0 +1,136 @@
#include "configs.cuh"
#include "exception.cuh"
#include "launch.cuh"
namespace deep_ep {
namespace layout {
template <int kNumThreads, int kNumExpertsPerSM, int kNumRanksPerSM>
__global__ void __launch_bounds__(kNumThreads, 1)
get_dispatch_layout(const int64_t* topk_idx,
int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
int* num_tokens_per_expert, bool* is_token_in_rank,
int num_tokens, int num_topk, int num_ranks, int num_experts) {
auto sm_id = static_cast<int>(blockIdx.x);
auto thread_id = static_cast<int>(threadIdx.x);
// Count expert statistics
__shared__ int num_tokens_per_expert_per_thread[kNumThreads][kNumExpertsPerSM];
int expert_begin_idx = sm_id * kNumExpertsPerSM, expert_end_idx = min(expert_begin_idx + kNumExpertsPerSM, num_experts);
if (expert_begin_idx < expert_end_idx) {
// Per-thread count
#pragma unroll
for (int i = 0; i < kNumExpertsPerSM; ++ i)
num_tokens_per_expert_per_thread[thread_id][i] = 0;
#pragma unroll
for (int i = thread_id; i < num_tokens; i += kNumThreads) {
auto shifted_topk_idx = topk_idx + i * num_topk;
#pragma unroll
for (int j = 0, expert_idx; j < num_topk; ++ j) {
expert_idx = static_cast<int>(shifted_topk_idx[j]);
if (expert_begin_idx <= expert_idx and expert_idx < expert_end_idx)
++ num_tokens_per_expert_per_thread[thread_id][expert_idx - expert_begin_idx];
}
}
__syncthreads();
// Sum up
EP_STATIC_ASSERT(kNumExpertsPerSM <= kNumThreads, "Too many experts per SM");
if (expert_begin_idx + thread_id < expert_end_idx) {
int sum = 0;
#pragma unroll
for (int i = 0; i < kNumThreads; ++ i)
sum += num_tokens_per_expert_per_thread[i][thread_id];
num_tokens_per_expert[expert_begin_idx + thread_id] = sum;
}
return;
}
if (num_tokens_per_rdma_rank != nullptr)
EP_DEVICE_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0 and num_ranks > NUM_MAX_NVL_PEERS);
// Count rank statistics
constexpr int kNumRDMARanksPerSM = kNumRanksPerSM / NUM_MAX_NVL_PEERS;
__shared__ int num_tokens_per_rank_per_thread[kNumThreads][kNumRanksPerSM];
__shared__ int num_tokens_per_rdma_rank_per_thread[kNumThreads][kNumRDMARanksPerSM];
auto sm_begin = (num_experts + kNumExpertsPerSM - 1) / kNumExpertsPerSM;
int rank_begin_idx = (sm_id - sm_begin) * kNumRanksPerSM, rank_end_idx = min(rank_begin_idx + kNumRanksPerSM, num_ranks);
int rdma_rank_begin_idx = rank_begin_idx / NUM_MAX_NVL_PEERS, rdma_rank_end_idx = rank_end_idx / NUM_MAX_NVL_PEERS;
if (rank_begin_idx < rank_end_idx) {
const auto num_expert_per_rank = num_experts / num_ranks;
auto expert_begin = rank_begin_idx * num_expert_per_rank;
auto expert_end = rank_end_idx * num_expert_per_rank;
// Per-thread count
#pragma unroll
for (int i = 0; i < kNumRanksPerSM; ++ i)
num_tokens_per_rank_per_thread[thread_id][i] = 0;
#pragma unroll
for (int i = 0; i < kNumRDMARanksPerSM; ++ i)
num_tokens_per_rdma_rank_per_thread[thread_id][i] = 0;
#pragma unroll
for (int i = thread_id; i < num_tokens; i += kNumThreads) {
auto shifted_topk_idx = topk_idx + i * num_topk;
int is_in_rank[kNumRanksPerSM] = {0}, is_in_rdma_rank[kNumRDMARanksPerSM] = {0};
#pragma unroll
for (int j = 0, expert_idx, rank_idx; j < num_topk; ++j) {
expert_idx = static_cast<int>(shifted_topk_idx[j]);
if (expert_begin <= expert_idx and expert_idx < expert_end) {
// Count single rank
rank_idx = expert_idx / num_expert_per_rank - rank_begin_idx;
is_in_rank[rank_idx] ++, is_in_rdma_rank[rank_idx / NUM_MAX_NVL_PEERS] ++;
}
}
auto shifted_is_token_in_rank = is_token_in_rank + i * num_ranks;
#pragma unroll
for (int j = 0; j + rank_begin_idx < rank_end_idx; ++ j) {
shifted_is_token_in_rank[j + rank_begin_idx] = (is_in_rank[j] > 0);
num_tokens_per_rank_per_thread[thread_id][j] += (is_in_rank[j] > 0);
}
#pragma unroll
for (int j = 0; j + rdma_rank_begin_idx < rdma_rank_end_idx; ++ j)
num_tokens_per_rdma_rank_per_thread[thread_id][j] += (is_in_rdma_rank[j] > 0);
}
__syncthreads();
// Sum up
EP_STATIC_ASSERT(kNumRanksPerSM <= kNumThreads, "Too many ranks per SM");
if (rank_begin_idx + thread_id < rank_end_idx) {
int sum = 0;
#pragma unroll
for (int i = 0; i < kNumThreads; ++ i)
sum += num_tokens_per_rank_per_thread[i][thread_id];
num_tokens_per_rank[rank_begin_idx + thread_id] = sum;
}
if (num_tokens_per_rdma_rank != nullptr and rdma_rank_begin_idx + thread_id < rdma_rank_end_idx) {
int sum = 0;
#pragma unroll
for (int i = 0; i < kNumThreads; ++ i)
sum += num_tokens_per_rdma_rank_per_thread[i][thread_id];
num_tokens_per_rdma_rank[rdma_rank_begin_idx + thread_id] = sum;
}
}
}
void get_dispatch_layout(const int64_t* topk_idx,
int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
int* num_tokens_per_expert, bool* is_token_in_rank,
int num_tokens, int num_topk, int num_ranks, int num_experts,
cudaStream_t stream) {
constexpr int kNumThreads = 256, kNumExpertsPerSM = 32, kNumRanksPerSM = 8;
int num_sms = ((num_experts + kNumExpertsPerSM - 1) / kNumExpertsPerSM) + (num_ranks + kNumRanksPerSM - 1) / kNumRanksPerSM;
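// NOTES: the first `ceil(num_experts / kNumExpertsPerSM)` blocks count per-expert tokens; the remaining blocks count per-rank (and per-RDMA-rank) tokens, matching `sm_begin` inside the kernel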
EP_STATIC_ASSERT(kNumExpertsPerSM % NUM_MAX_NVL_PEERS == 0, "Invalid number of experts per SM");
SETUP_LAUNCH_CONFIG(num_sms, kNumThreads, stream);
LAUNCH_KERNEL(&cfg, (get_dispatch_layout<kNumThreads, kNumExpertsPerSM, kNumRanksPerSM>),
topk_idx, num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank,
num_tokens, num_topk, num_ranks, num_experts);
}
} // namespace layout
} // namespace deep_ep

View File

@ -0,0 +1,92 @@
#include <vector>
#include <cstring>
#include "configs.cuh"
#include "exception.cuh"
#include "launch.cuh"
#include "utils.cuh"
#ifndef DISABLE_NVSHMEM
#include "ibgda_device.cuh"
#endif
namespace deep_ep {
namespace intranode {
template<int kNumRanks>
__global__ void barrier(int** barrier_signal_ptrs, int rank) {
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
}
void barrier(int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream) {
#define BARRIER_LAUNCH_CASE(ranks) \
LAUNCH_KERNEL(&cfg, barrier<ranks>, barrier_signal_ptrs, rank); \
break
SETUP_LAUNCH_CONFIG(1, 32, stream);
SWITCH_RANKS(BARRIER_LAUNCH_CASE);
#undef BARRIER_LAUNCH_CASE
}
} // namespace intranode
namespace internode {
#ifndef DISABLE_NVSHMEM
nvshmem_team_t cpu_rdma_team = NVSHMEM_TEAM_INVALID;
nvshmem_team_config_t cpu_rdma_team_config;
std::vector<uint8_t> get_unique_id() {
nvshmemx_uniqueid_t unique_id;
nvshmemx_get_uniqueid(&unique_id);
std::vector<uint8_t> result(sizeof(nvshmemx_uniqueid_t));
std::memcpy(result.data(), &unique_id, sizeof(nvshmemx_uniqueid_t));
return result;
}
int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks, bool low_latency_mode) {
nvshmemx_uniqueid_t root_unique_id;
nvshmemx_init_attr_t attr;
std::memcpy(&root_unique_id, root_unique_id_val.data(), sizeof(nvshmemx_uniqueid_t));
nvshmemx_set_attr_uniqueid_args(rank, num_ranks, &root_unique_id, &attr);
nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr);
// Create sub-RDMA teams
// NOTES: if `num_ranks <= NUM_MAX_NVL_PEERS` then only low-latency kernels are used
if (low_latency_mode and num_ranks > NUM_MAX_NVL_PEERS) {
EP_HOST_ASSERT(cpu_rdma_team == NVSHMEM_TEAM_INVALID);
EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(nvshmem_team_split_strided(NVSHMEM_TEAM_WORLD, rank % NUM_MAX_NVL_PEERS, NUM_MAX_NVL_PEERS,
num_ranks / NUM_MAX_NVL_PEERS, &cpu_rdma_team_config, 0, &cpu_rdma_team) == 0);
EP_HOST_ASSERT(cpu_rdma_team != NVSHMEM_TEAM_INVALID);
}
nvshmem_barrier_all();
return nvshmem_my_pe();
}
void* alloc(size_t size, size_t alignment) {
return nvshmem_align(alignment, size);
}
void free(void* ptr) {
nvshmem_free(ptr);
}
void barrier() {
nvshmem_barrier_all();
}
void finalize() {
if (cpu_rdma_team != NVSHMEM_TEAM_INVALID) {
nvshmem_team_destroy(cpu_rdma_team);
cpu_rdma_team = NVSHMEM_TEAM_INVALID;
}
nvshmem_finalize();
}
#endif
} // namespace internode
} // namespace deep_ep

View File

@ -0,0 +1,496 @@
#pragma once
#include "exception.cuh"
#define UNROLLED_WARP_COPY(UNROLL_FACTOR, LANE_ID, N, DST, SRC, LD_FUNC, ST_FUNC) \
{ \
constexpr int kLoopStride = 32 * (UNROLL_FACTOR); \
typename std::remove_reference<decltype(LD_FUNC((SRC) + 0))>::type unrolled_values[(UNROLL_FACTOR)]; \
auto __src = (SRC); \
auto __dst = (DST); \
for (int __i = (LANE_ID); __i < ((N) / kLoopStride) * kLoopStride; __i += kLoopStride) { \
_Pragma("unroll") \
for (int __j = 0; __j < (UNROLL_FACTOR); ++ __j) \
unrolled_values[__j] = LD_FUNC(__src + __i + __j * 32); \
_Pragma("unroll") \
for (int __j = 0; __j < (UNROLL_FACTOR); ++ __j) \
ST_FUNC(__dst + __i + __j * 32, unrolled_values[__j]); \
} \
for (int __i = ((N) / kLoopStride) * kLoopStride + (LANE_ID); __i < (N); __i += 32) \
ST_FUNC(__dst + __i, LD_FUNC(__src + __i)); \
}
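// Usage sketch (illustrative): a full warp copies `n` elements from `src` to `dst`, issuing
// `UNROLL_FACTOR` loads and stores per lane per iteration, e.g.
//   UNROLLED_WARP_COPY(5, lane_id, n, dst, src, ld_nc_global, st_na_global);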
namespace deep_ep {
template <int kBytes>
struct VecInt {};
template<> struct VecInt<1> { using vec_t = int8_t; };
template<> struct VecInt<2> { using vec_t = int16_t; };
template<> struct VecInt<4> { using vec_t = int; };
template<> struct VecInt<8> { using vec_t = int64_t; };
template<> struct VecInt<16> { using vec_t = int4; };
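// NOTES: `VecInt<kBytes>::vec_t` maps a byte width to the integer type that the generic `ld_nc_global` / `st_na_global` wrappers below use to reinterpret arbitrary dtypes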
__device__ __forceinline__ void trap() {
asm("trap;");
}
__device__ __forceinline__ void memory_fence() {
asm volatile("fence.acq_rel.sys;":: : "memory");
}
__device__ __forceinline__ void memory_fence_gpu() {
asm volatile("fence.acq_rel.gpu;":: : "memory");
}
__device__ __forceinline__ void memory_fence_cta() {
asm volatile("fence.acq_rel.cta;":: : "memory");
}
__device__ __forceinline__ void st_relaxed_sys_global(const int *ptr, int val) {
asm volatile("st.relaxed.sys.global.s32 [%0], %1;"::"l"(ptr), "r"(val) : "memory");
}
__device__ __forceinline__ void st_release_sys_global(const int *ptr, int val) {
asm volatile("st.release.sys.global.s32 [%0], %1;"::"l"(ptr), "r"(val) : "memory");
}
__device__ __forceinline__ void st_release_cta(const int *ptr, int val) {
asm volatile("st.release.cta.s32 [%0], %1;"::"l"(ptr), "r"(val) : "memory");
}
__device__ __forceinline__ int ld_acquire_sys_global(const int *ptr) {
int ret;
asm volatile("ld.acquire.sys.global.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(const uint64_t *ptr) {
uint64_t ret;
asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int ld_acquire_global(const int *ptr) {
int ret;
asm volatile("ld.acquire.gpu.global.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int atomic_add_release_sys_global(const int* ptr, int value) {
int ret;
asm volatile("atom.add.release.sys.global.s32 %0, [%1], %2;" : "=r"(ret) : "l"(ptr), "r"(value));
return ret;
}
__device__ __forceinline__ int atomic_add_release_global(const int* ptr, int value) {
int ret;
asm volatile("atom.add.release.gpu.global.s32 %0, [%1], %2;" : "=r"(ret) : "l"(ptr), "r"(value));
return ret;
}
__device__ __forceinline__ int ld_acquire_cta(const int *ptr) {
int ret;
asm volatile("ld.acquire.cta.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ uint8_t ld_na_relaxed(const uint8_t *ptr) {
uint16_t ret;
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b8 %0, [%1];" : "=h"(ret) : "l"(ptr));
return static_cast<uint8_t>(ret);
}
__device__ __forceinline__ uint16_t ld_na_relaxed(const uint16_t *ptr) {
uint16_t ret;
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b16 %0, [%1];" : "=h"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ uint32_t ld_na_relaxed(const uint32_t *ptr) {
uint32_t ret;
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ uint64_t ld_na_relaxed(const uint64_t *ptr) {
uint64_t ret;
asm volatile("ld.relaxed.gpu.global.L1::no_allocate.b64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int ld_volatile_global(const int *ptr) {
int ret;
asm volatile("ld.volatile.global.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ float ld_volatile_global(const float *ptr) {
float ret;
asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int64_t ld_volatile_global(const int64_t *ptr) {
int64_t ret;
asm volatile("ld.volatile.global.s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
__device__ __forceinline__ int64_t ld_volatile_global(const uint64_t *ptr) {
int64_t ret;
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
#define LD_NC_FUNC "ld.global.nc.L1::no_allocate.L2::256B"
#else
#define LD_NC_FUNC "ld.volatile.global.L2::256B"
#endif
// `ld.global.nc.L1::no_allocate` will be translated into `LDG.E.NA.[width].CONSTANT` in SASS
template <typename dtype_t>
__device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
auto ret = ld_nc_global(reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(ptr));
return *reinterpret_cast<dtype_t*>(&ret);
}
template <>
__device__ __forceinline__ uint8_t ld_nc_global(const uint8_t *ptr) {
uint16_t ret;
// NOTES: we must use `uint16_t` as inline ASM does not support 8-bit constraint letter (`h` below means unsigned 16-bit)
asm volatile(LD_NC_FUNC ".u8 %0, [%1];" : "=h"(ret) : "l"(ptr));
return static_cast<uint8_t>(ret);
}
template <>
__device__ __forceinline__ int ld_nc_global(const int *ptr) {
int ret;
asm volatile(LD_NC_FUNC ".s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
return ret;
}
template <>
__device__ __forceinline__ int64_t ld_nc_global(const int64_t *ptr) {
int64_t ret;
asm volatile(LD_NC_FUNC ".s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
return ret;
}
template <>
__device__ __forceinline__ float ld_nc_global(const float *ptr) {
float ret;
asm volatile(LD_NC_FUNC ".f32 %0, [%1];" : "=f"(ret) : "l"(ptr));
return ret;
}
template <>
__device__ __forceinline__ int2 ld_nc_global(const int2 *ptr) {
int2 ret;
asm volatile(LD_NC_FUNC ".v2.s32 {%0, %1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : "l"(ptr));
return ret;
}
template <>
__device__ __forceinline__ int4 ld_nc_global(const int4 *ptr) {
int4 ret;
asm volatile(LD_NC_FUNC ".v4.s32 {%0, %1, %2, %3}, [%4];"
: "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : "l"(ptr));
return ret;
}
__device__ __forceinline__ void st_na_relaxed(const uint8_t *ptr, uint8_t val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b8 [%0], %1;" : : "l"(ptr), "h"(static_cast<uint16_t>(val)));
}
__device__ __forceinline__ void st_na_relaxed(const uint16_t *ptr, uint16_t val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b16 [%0], %1;" : : "l"(ptr), "h"(val));
}
__device__ __forceinline__ void st_na_relaxed(const uint32_t *ptr, uint32_t val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
}
__device__ __forceinline__ void st_na_relaxed(const int *ptr, int val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
}
__device__ __forceinline__ void st_na_relaxed(const int4 *ptr, int4 val) {
asm volatile("st.relaxed.gpu.global.L1::no_allocate.v4.s32 [%0], {%1, %2, %3, %4};"
: : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
}
__device__ __forceinline__ void st_na_release(const int *ptr, int val) {
asm volatile("st.release.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
}
__device__ __forceinline__ void st_na_release(const uint32_t *ptr, uint32_t val) {
asm volatile("st.release.gpu.global.L1::no_allocate.b32 [%0], %1;" : : "l"(ptr), "r"(val));
}
__device__ __forceinline__ void st_na_release(const uint64_t *ptr, uint64_t val) {
asm volatile("st.release.gpu.global.L1::no_allocate.b64 [%0], %1;" : : "l"(ptr), "l"(val));
}
// `st.global.L1::no_allocate` will be translated into `ST.E.NA.[width]` in SASS
#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
#define ST_NA_FUNC "st.global.L1::no_allocate"
#else
#define ST_NA_FUNC "st.global"
#endif
template <typename dtype_t>
__device__ __forceinline__ void st_na_global(const dtype_t *ptr, const dtype_t& value) {
st_na_global(reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(ptr),
*reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(&value));
}
template <>
__device__ __forceinline__ void st_na_global(const int *ptr, const int& value) {
asm volatile(ST_NA_FUNC ".s32 [%0], %1;" ::"l"(ptr), "r"(value));
}
template <>
__device__ __forceinline__ void st_na_global(const int64_t *ptr, const int64_t& value) {
asm volatile(ST_NA_FUNC ".s64 [%0], %1;" ::"l"(ptr), "l"(value));
}
template <>
__device__ __forceinline__ void st_na_global(const float *ptr, const float& value) {
asm volatile(ST_NA_FUNC ".f32 [%0], %1;" ::"l"(ptr), "f"(value));
}
template <>
__device__ __forceinline__ void st_na_global(const int4 *ptr, const int4& value) {
asm volatile(ST_NA_FUNC ".v4.s32 [%0], {%1, %2, %3, %4};"
::"l"(ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w));
}
// TMA PTX instructions
#ifndef DISABLE_SM90_FEATURES
__device__ __forceinline__ void fence_view_async_shared() {
asm volatile("fence.proxy.async.shared::cta; \n" :: );
}
__device__ __forceinline__ void fence_barrier_init() {
asm volatile("fence.mbarrier_init.release.cluster; \n" :: );
}
__device__ __forceinline__ void mbarrier_init(uint64_t* mbar_ptr, uint32_t arrive_count) {
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
asm volatile("mbarrier.init.shared::cta.b64 [%1], %0;" :: "r"(arrive_count), "r"(mbar_int_ptr));
}
__device__ __forceinline__ void mbarrier_wait(uint64_t* mbar_ptr, uint32_t& phase) {
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
asm volatile("{\n\t"
".reg .pred P1; \n\t"
"LAB_WAIT: \n\t"
"mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1, %2; \n\t"
"@P1 bra DONE; \n\t"
"bra LAB_WAIT; \n\t"
"DONE: \n\t"
"}" :: "r"(mbar_int_ptr), "r"(phase), "r"(0x989680));
phase ^= 1;
}
__device__ __forceinline__ void mbarrier_arrive_and_expect_tx(uint64_t* mbar_ptr, int num_bytes) {
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
asm volatile("mbarrier.arrive.expect_tx.shared::cta.b64 _, [%1], %0; \n\t" :: "r"(num_bytes), "r"(mbar_int_ptr));
}
__device__ __forceinline__ void tma_store_fence() {
asm volatile ("fence.proxy.async.shared::cta;");
}
constexpr uint64_t kEvictFirst = 0x12f0000000000000;
constexpr uint64_t kEvictNormal = 0x1000000000000000;
__device__ __forceinline__ void tma_load_1d(const void* smem_ptr, const void* gmem_ptr, uint64_t* mbar_ptr, int num_bytes,
bool evict_first = true) {
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
asm volatile("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%0], [%1], %2, [%3], %4;\n"
:: "r"(smem_int_ptr), "l"(gmem_ptr), "r"(num_bytes), "r"(mbar_int_ptr), "l"(cache_hint) : "memory");
}
__device__ __forceinline__ void tma_store_1d(const void* smem_ptr, const void* gmem_ptr, int num_bytes,
bool evict_first = true) {
auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
asm volatile("cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%0], [%1], %2, %3;\n"
:: "l"(gmem_ptr), "r"(smem_int_ptr), "r"(num_bytes), "l"(cache_hint) : "memory");
asm volatile("cp.async.bulk.commit_group;");
}
template <int N = 0>
__device__ __forceinline__ void tma_store_wait() {
asm volatile("cp.async.bulk.wait_group.read %0;" :: "n"(N) : "memory");
}
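// Typical global-to-global round trip through shared memory (illustrative, see the dispatch receiver):
//   tma_store_wait();                             // wait for previously issued bulk stores
//   tma_load_1d(smem, gmem_src, mbar, bytes);     // bulk load into shared memory
//   mbarrier_arrive_and_expect_tx(mbar, bytes);   // arm the mbarrier with the expected byte count
//   mbarrier_wait(mbar, phase);                   // wait until the load completes
//   tma_store_1d(smem, gmem_dst, bytes, false);   // bulk store back to global memory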
#endif
template <typename dtype_t>
__host__ __device__ dtype_t ceil_div(dtype_t a, dtype_t b) {
return (a + b - 1) / b;
}
template <typename dtype_t>
__host__ __device__ dtype_t align(dtype_t a, dtype_t b) {
return ceil_div<dtype_t>(a, b) * b;
}
__forceinline__ __device__ void get_channel_task_range(int num_tokens, int num_sms, int sm_id,
int& token_start_idx, int& token_end_idx) {
int num_tokens_per_sm = ceil_div(num_tokens, num_sms);
token_start_idx = min(num_tokens_per_sm * sm_id, num_tokens);
token_end_idx = min(token_start_idx + num_tokens_per_sm, num_tokens);
}
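// e.g. num_tokens = 10 and num_sms = 4 give per-SM chunks of 3: [0, 3), [3, 6), [6, 9), [9, 10)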
template <typename dtype_a_t, typename dtype_b_t>
__device__ __forceinline__ dtype_b_t pack2(const dtype_a_t& x, const dtype_a_t& y) {
EP_STATIC_ASSERT(sizeof(dtype_a_t) * 2 == sizeof(dtype_b_t), "Invalid dtypes");
dtype_b_t packed;
auto unpacked_ptr = reinterpret_cast<dtype_a_t*>(&packed);
unpacked_ptr[0] = x, unpacked_ptr[1] = y;
return packed;
}
template <typename dtype_a_t, typename dtype_b_t>
__device__ __forceinline__ void unpack2(const dtype_b_t& packed, dtype_a_t& x, dtype_a_t& y) {
EP_STATIC_ASSERT(sizeof(dtype_a_t) * 2 == sizeof(dtype_b_t), "Invalid dtypes");
auto unpacked_ptr = reinterpret_cast<const dtype_a_t*>(&packed);
x = unpacked_ptr[0], y = unpacked_ptr[1];
}
template <typename dtype_t>
__device__ __forceinline__ dtype_t broadcast(dtype_t& ptr, int src_lane_idx) {
EP_STATIC_ASSERT(sizeof(dtype_t) % sizeof(int) == 0, "");
auto send_int_values = reinterpret_cast<int*>(&ptr);
int recv_int_values[sizeof(dtype_t) / sizeof(int)];
#pragma unroll
for (int i = 0; i < sizeof(dtype_t) / sizeof(int); ++ i)
recv_int_values[i] = __shfl_sync(0xffffffff, send_int_values[i], src_lane_idx);
return *reinterpret_cast<dtype_t*>(recv_int_values);
}
__forceinline__ __device__ int warp_reduce_sum(int value) {
value += __shfl_xor_sync(0xffffffff, value, 16);
value += __shfl_xor_sync(0xffffffff, value, 8);
value += __shfl_xor_sync(0xffffffff, value, 4);
value += __shfl_xor_sync(0xffffffff, value, 2);
value += __shfl_xor_sync(0xffffffff, value, 1);
return value;
}
__forceinline__ __device__ float half_warp_reduce_max(float value) {
auto mask = __activemask();
// The mask is expected to be either `0xffffffff` (full warp) or `0xffff` (half warp)
value = max(value, __shfl_xor_sync(mask, value, 8));
value = max(value, __shfl_xor_sync(mask, value, 4));
value = max(value, __shfl_xor_sync(mask, value, 2));
value = max(value, __shfl_xor_sync(mask, value, 1));
return value;
}
__forceinline__ __device__ int get_lane_id() {
int lane_id;
asm("mov.s32 %0, %laneid;" : "=r"(lane_id));
return lane_id;
}
constexpr float kFP8Margin = 1e-4;
constexpr float kFinfoAmaxE4M3 = 448.0f;
constexpr float kFinfoAmaxInvE4M3 = 1 / 448.0f;
__forceinline__ __device__ float fast_pow2(int x) {
// We can ensure `-126 <= x and x <= 127`
uint32_t bits_x = (x + 127) << 23;
return *reinterpret_cast<float*>(&bits_x);
}
__forceinline__ __device__ int fast_log2_ceil(float x) {
auto bits_x = *reinterpret_cast<uint32_t*>(&x);
auto exp_x = (bits_x >> 23) & 0xff;
auto man_bits = bits_x & ((1 << 23) - 1);
return exp_x - 127 + (man_bits != 0);
}
__forceinline__ __device__ void calculate_fp8_scales(float amax, float& scale, float& scale_inv, bool round_scale) {
if (round_scale) {
auto exp_scale_inv = fast_log2_ceil(amax * kFinfoAmaxInvE4M3);
scale = fast_pow2(-exp_scale_inv);
scale_inv = fast_pow2(exp_scale_inv);
} else {
scale_inv = amax * kFinfoAmaxInvE4M3;
scale = kFinfoAmaxE4M3 / amax;
}
}
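// e.g. with `round_scale` and amax = 1000: 1000 / 448 ≈ 2.23, `fast_log2_ceil` gives 2, so scale = 2^-2 = 0.25 and scale_inv = 4, a power-of-two pair that `extract_required_scale_format<true>` can store losslessly as UE8M0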
template <bool kIsUE8M0, typename out_dtype_t = std::conditional_t<kIsUE8M0, uint8_t, float>>
__forceinline__ __device__ out_dtype_t extract_required_scale_format(float value) {
if constexpr (kIsUE8M0) {
return static_cast<uint8_t>((*reinterpret_cast<uint32_t*>(&value)) >> 23);
} else {
return value;
}
}
template <int kNumRanks, bool kSyncOnly = false>
__forceinline__ __device__ void
barrier_block(int** barrier_signal_ptrs, int rank) {
auto thread_id = static_cast<int>(threadIdx.x);
// For non-sync-only cases, the memory operations by other threads in the block must be visible to the `sys` scope
if constexpr (not kSyncOnly) {
memory_fence();
__syncthreads();
}
// Add to this rank's own signals, subtract from the other ranks' signals
if (thread_id < kNumRanks) {
atomicAdd_system(barrier_signal_ptrs[rank] + thread_id, FINISHED_SUM_TAG);
atomicSub_system(barrier_signal_ptrs[thread_id] + rank, FINISHED_SUM_TAG);
}
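// NOTES: slot `t` of this rank's signal array drops back to <= 0 only after peer rank `t` has issued its matching subtract, so waiting below for all slots to be non-positive implements a full inter-rank barrier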
EP_DEVICE_ASSERT(kNumRanks <= blockDim.x);
// Check timeout
auto start_time = clock64();
while (true) {
auto value = thread_id < kNumRanks ? ld_volatile_global(barrier_signal_ptrs[rank] + thread_id) : 0;
if (__all_sync(0xffffffff, value <= 0))
break;
if (clock64() - start_time > NUM_TIMEOUT_CYCLES and get_lane_id() == 0) {
printf("DeepEP timeout check failed: rank = %d, thread = %d)\n", rank, thread_id);
trap();
}
}
__syncthreads();
}
__forceinline__ __device__ int atomic_cas_cta_acquire(int* addr, int x, int y) {
int ret;
asm volatile("atom.acquire.cta.shared::cta.cas.b32 %0, [%1], %2, %3;" : "=r"(ret) : "l"(addr), "r"(x), "r"(y) : "memory");
return ret;
}
__forceinline__ __device__ int atomic_exch_cta_release(int* addr, int x) {
int ret;
asm volatile("atom.release.cta.shared::cta.exch.b32 %0, [%1], %2;" : "=r"(ret) : "l"(addr), "r"(x) : "memory");
return ret;
}
__forceinline__ __device__ void acquire_lock(int* mutex) {
// To make later memory operations valid, we must use `acquire` for memory semantics
while (atomic_cas_cta_acquire(mutex, 0, 1) != 0);
}
__forceinline__ __device__ void release_lock(int* mutex) {
// To make previous memory operations visible to other threads, we must use `release` for memory semantics
atomic_exch_cta_release(mutex, 0);
}
} // namespace deep_ep

View File

@ -0,0 +1,7 @@
import torch
from .utils import EventOverlap
from .buffer import Buffer
# noinspection PyUnresolvedReferences
from deep_ep_cpp import Config

617
DeepEP/deep_ep/buffer.py Normal file
View File

@ -0,0 +1,617 @@
import os
import torch
import torch.distributed as dist
from typing import Callable, List, Tuple, Optional, Union
# noinspection PyUnresolvedReferences
import deep_ep_cpp
# noinspection PyUnresolvedReferences
from deep_ep_cpp import Config, EventHandle
from .utils import EventOverlap, check_nvlink_connections
class Buffer:
"""
The core expert-parallel (EP) communication buffer for Mixture-of-Experts (MoE) models, which supports:
- high-throughput intranode all-to-all (dispatch and combine, using NVLink)
- high-throughput internode all-to-all (dispatch and combine, using RDMA and NVLink)
- low-latency all-to-all (dispatch and combine, using RDMA)
Attributes:
num_sms: the number of SMs used in high-throughput kernels.
rank: the local rank number.
group_size: the number of ranks in the group.
group: the communication group.
num_nvl_bytes: the buffer size for intranode NVLink communication.
num_rdma_bytes: the buffer size for internode (also for intranode with low-latency mode) RDMA communication.
runtime: the C++ runtime.
"""
num_sms: int = 20
def __init__(self, group: dist.ProcessGroup,
num_nvl_bytes: int = 0, num_rdma_bytes: int = 0,
low_latency_mode: bool = False, num_qps_per_rank: int = 24,
allow_nvlink_for_low_latency_mode: bool = True,
allow_mnnvl: bool = False) -> None:
"""
Initialize the communication buffer.
Arguments:
group: the communication group.
num_nvl_bytes: the buffer size for intranode NVLink communication.
num_rdma_bytes: the buffer size for internode (also for intranode with low-latency mode) RDMA communication.
low_latency_mode: whether to enable low-latency mode.
num_qps_per_rank: the number of QPs for RDMA; low-latency mode requires this to equal
the number of local experts.
allow_nvlink_for_low_latency_mode: whether to allow NVLink traffic in low-latency mode; note that
this is somewhat incompatible with hook-based overlapping.
Warning: PCIe connections may lead to errors due to memory-ordering issues;
please make sure all connections are via NVLink.
allow_mnnvl: whether to allow multi-node NVLink (MNNVL).
"""
check_nvlink_connections(group)
# Initialize the CPP runtime
self.rank = group.rank()
self.group_size = group.size()
self.group = group
self.num_nvl_bytes = num_nvl_bytes
self.num_rdma_bytes = num_rdma_bytes
self.low_latency_mode = low_latency_mode
self.runtime = deep_ep_cpp.Buffer(self.rank, self.group_size, num_nvl_bytes, num_rdma_bytes, low_latency_mode)
# Synchronize device IDs
device_ids = [None, ] * self.group_size
local_device_id = self.runtime.get_local_device_id()
dist.all_gather_object(device_ids, local_device_id, group)
# Synchronize IPC handles
ipc_handles = [None, ] * self.group_size
local_ipc_handle = self.runtime.get_local_ipc_handle()
dist.all_gather_object(ipc_handles, local_ipc_handle, group)
# Synchronize NVSHMEM unique IDs
root_unique_id = None
if self.runtime.get_num_rdma_ranks() > 1 or low_latency_mode:
# Enable IBGDA
assert num_qps_per_rank > 0
os.environ['NVSHMEM_DISABLE_P2P'] = '0' if allow_nvlink_for_low_latency_mode else '1'
os.environ['NVSHMEM_IB_ENABLE_IBGDA'] = '1'
os.environ['NVSHMEM_IBGDA_NIC_HANDLER'] = 'gpu'
os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}'
# Make sure the QP depth is always larger than the number of in-flight WRs, so that we can skip the WQ slot check
os.environ['NVSHMEM_QP_DEPTH'] = '1024'
# Reduce GPU memory usage
# 6 default teams + 1 extra team
os.environ['NVSHMEM_MAX_TEAMS'] = '7'
# Disable NVLink SHArP
os.environ['NVSHMEM_DISABLE_NVLS'] = '1'
# NOTES: NVSHMEM initialization requires at least 256 MiB
os.environ['NVSHMEM_CUMEM_GRANULARITY'] = f'{2 ** 29}'
if not allow_mnnvl:
# Disable multi-node NVLink detection
os.environ['NVSHMEM_DISABLE_MNNVL'] = '1'
# Synchronize using the root ID
nvshmem_unique_ids = [None, ] * self.group_size
if (low_latency_mode and self.rank == 0) or (not low_latency_mode and self.runtime.get_rdma_rank() == 0):
root_unique_id = self.runtime.get_local_nvshmem_unique_id()
dist.all_gather_object(nvshmem_unique_ids, root_unique_id, group)
root_unique_id = nvshmem_unique_ids[0 if low_latency_mode else self.runtime.get_root_rdma_rank(True)]
# Make CPP runtime available
self.runtime.sync(device_ids, ipc_handles, root_unique_id)
assert self.runtime.is_available()
@staticmethod
def is_sm90_compiled():
return deep_ep_cpp.is_sm90_compiled()
@staticmethod
def set_num_sms(new_num_sms: int) -> None:
"""
Set the number of SMs to use in high-throughput kernels.
Arguments:
new_num_sms: the new number to be set.
"""
assert new_num_sms % 2 == 0, 'The SM count must be even'
Buffer.num_sms = new_num_sms
@staticmethod
def capture() -> EventOverlap:
"""
Capture a CUDA event on the current stream, i.e. `torch.cuda.current_stream()`.
Returns:
event: the captured event.
"""
return EventOverlap(EventHandle())
@staticmethod
def get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank: int, hidden: int, num_ranks: int, num_experts: int) -> int:
"""
Get a minimum size requirement for the RDMA buffer. The size calculation will be done with BF16.
Arguments:
num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
hidden: the hidden dimension of each token.
num_ranks: the number of EP group ranks.
num_experts: the number of all experts.
Returns:
size: the RDMA buffer size recommended.
"""
return deep_ep_cpp.get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts)
def get_comm_stream(self) -> torch.Stream:
"""
Get the communication stream.
Returns:
stream: the communication stream.
"""
ts: torch.Stream = self.runtime.get_comm_stream()
return torch.cuda.Stream(stream_id=ts.stream_id, device_index=ts.device_index, device_type=ts.device_type)
def get_local_buffer_tensor(self, dtype: torch.dtype, size: Optional[torch.Size] = None,
offset: int = 0, use_rdma_buffer: bool = False) -> torch.Tensor:
"""
Get the raw buffer (slice supported) as a PyTorch tensor.
Arguments:
dtype: the data type (PyTorch `dtype`) for the tensor.
size: the slice size (by elements) to get from the buffer.
offset: the offset of the beginning element.
use_rdma_buffer: whether to return the RDMA buffer.
"""
tensor = self.runtime.get_local_buffer_tensor(dtype, offset, use_rdma_buffer)
if size is None:
return tensor
assert tensor.numel() >= size.numel()
return tensor[:size.numel()].view(size)
@staticmethod
def _unpack_bias(bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]):
bias_0, bias_1 = None, None
if isinstance(bias, torch.Tensor):
bias_0 = bias
elif isinstance(bias, tuple):
assert len(bias) == 2
bias_0, bias_1 = bias
return bias_0, bias_1
@staticmethod
def get_dispatch_config(num_ranks: int) -> Config:
"""
Get a recommended dispatch config.
Argument:
num_ranks: the number of ranks.
Returns:
config: the recommended config.
"""
# TODO: automatically tune
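# NOTES: judging from the tuning loops in the tests, the positional `Config` arguments appear to be
# (num_sms, nvl_chunk_size, nvl_buffer_size, rdma_chunk_size, rdma_buffer_size)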
config_map = {
2: Config(Buffer.num_sms, 24, 256, 6, 128),
4: Config(Buffer.num_sms, 6, 256, 6, 128),
8: Config(Buffer.num_sms, 6, 256, 6, 128),
16: Config(Buffer.num_sms, 16, 288, 20, 128),
24: Config(Buffer.num_sms, 8, 288, 32, 128),
32: Config(Buffer.num_sms, 8, 288, 32, 128),
64: Config(Buffer.num_sms, 20, 288, 28, 128),
128: Config(Buffer.num_sms, 20, 560, 32, 128),
144: Config(Buffer.num_sms, 32, 720, 12, 128),
160: Config(Buffer.num_sms, 28, 720, 12, 128),
}
assert num_ranks in config_map, f'Unsupported number of EP ranks: {num_ranks}'
return config_map[num_ranks]
@staticmethod
def get_combine_config(num_ranks: int) -> Config:
"""
Get a recommended combine config.
Argument:
num_ranks: the number of ranks.
Returns:
config: the recommended config.
"""
# TODO: automatically tune
config_map = {
2: Config(Buffer.num_sms, 10, 256, 6, 128),
4: Config(Buffer.num_sms, 9, 256, 6, 128),
8: Config(Buffer.num_sms, 4, 256, 6, 128),
16: Config(Buffer.num_sms, 2, 288, 28, 128),
24: Config(Buffer.num_sms, 1, 288, 20, 128),
32: Config(Buffer.num_sms, 1, 288, 20, 128),
64: Config(Buffer.num_sms, 1, 288, 20, 128),
128: Config(Buffer.num_sms, 1, 560, 12, 128),
144: Config(Buffer.num_sms, 2, 720, 8, 128),
160: Config(Buffer.num_sms, 2, 720, 8, 128),
}
assert num_ranks in config_map, f'Unsupported number of EP ranks: {num_ranks}'
return config_map[num_ranks]
# noinspection PyTypeChecker
def get_dispatch_layout(self, topk_idx: torch.Tensor, num_experts: int,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, EventOverlap]:
"""
Calculate the layout required for later communication.
Arguments:
topk_idx: `[num_tokens, num_topk]`, dtype must be `torch.int64`, the expert indices selected by each token,
`-1` means no selections.
num_experts: the number of experts.
previous_event: the event to wait before actually executing the kernel.
async_finish: the current stream will not wait for the communication kernels to be finished if set.
allocate_on_comm_stream: whether to put the allocated tensors' ownership on the communication stream.
Returns:
num_tokens_per_rank: `[num_ranks]` with `torch.int`, the number of tokens to be sent to each rank.
num_tokens_per_rdma_rank: `[num_rdma_ranks]` with `torch.int`, the number of tokens to be sent to each RDMA
rank (i.e. the ranks with the same GPU index); `None` is returned for intranode settings.
num_tokens_per_expert: `[num_experts]` with `torch.int`, the number of tokens to be sent to each expert.
is_token_in_rank: `[num_tokens, num_ranks]` with `torch.bool`, whether a token is sent to a rank.
event: the event after executing the kernel (valid only if `async_finish` is set).
"""
num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, event = \
self.runtime.get_dispatch_layout(topk_idx, num_experts, getattr(previous_event, 'event', None),
async_finish, allocate_on_comm_stream)
return num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, EventOverlap(event)
# noinspection PyTypeChecker
def dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
handle: Optional[Tuple] = None,
num_tokens_per_rank: Optional[torch.Tensor] = None, num_tokens_per_rdma_rank: Optional[torch.Tensor] = None,
is_token_in_rank: Optional[torch.Tensor] = None, num_tokens_per_expert: Optional[torch.Tensor] = None,
topk_idx: Optional[torch.Tensor] = None, topk_weights: Optional[torch.Tensor] = None,
expert_alignment: int = 1, num_worst_tokens: int = 0,
config: Optional[Config] = None,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], Optional[torch.Tensor],
Optional[torch.Tensor], List[int], Tuple, EventOverlap]:
"""
Dispatch tokens to different ranks; both intranode and internode settings are supported.
Intranode kernels require all ranks to be visible via NVLink.
Internode kernels require the ranks within a node to be visible via NVLink, and the ranks with the same GPU
index across nodes to be visible via RDMA.
Arguments:
x: `torch.Tensor` or tuple of `torch.Tensor`, for the first type, the shape must be `[num_tokens, hidden]`,
and type must be `torch.bfloat16`; for the second type, the first element of the tuple must be shaped as
`[num_tokens, hidden]` with type `torch.float8_e4m3fn`, the second must be `[num_tokens, hidden // 128]`
(requiring `hidden` to be divisible by 128) with type `torch.float`.
handle: an optional communication handle, if set, the CPU will reuse the layout information to save some time.
num_tokens_per_rank: `[num_ranks]` with `torch.int`, the number of tokens to be sent to each rank.
num_tokens_per_rdma_rank: `[num_rdma_ranks]` with `torch.int`, the number of tokens to be sent to each RDMA
rank (i.e. the ranks with the same GPU index); may be `None` for intranode settings.
is_token_in_rank: `[num_tokens, num_ranks]` with `torch.bool`, whether a token is sent to a rank.
num_tokens_per_expert: `[num_experts]` with `torch.int`, the number of tokens to be sent to each expert.
topk_idx: `[num_tokens, num_topk]` with `torch.int64`, the expert indices selected by each token,
`-1` means no selections.
topk_weights: `[num_tokens, num_topk]` with `torch.float`, the expert weights of each token to dispatch.
expert_alignment: align the number of tokens received by each local expert to this variable.
num_worst_tokens: the worst-case number of tokens to receive; if specified, there will be no CPU sync, and
the kernel will be CUDA-graph compatible. Please also notice that this flag is for intranode dispatch only.
config: the performance tuning config.
previous_event: the event to wait before actually executing the kernel.
async_finish: the current stream will not wait for the communication kernels to be finished if set.
allocate_on_comm_stream: whether to put the allocated tensors' ownership on the communication stream.
Returns:
recv_x: received tokens, with the same type (and tuple structure) as the input `x`, but with the number of
tokens equal to the received token count.
recv_topk_idx: received expert indices.
recv_topk_weights: received expert weights.
num_recv_tokens_per_expert_list: Python list shaped `[num_local_experts]`, the received token count by
each local expert, aligned to the input `expert_alignment`. If `num_worst_tokens` is specified, the list
will be empty.
handle: the returned communication handle.
event: the event after executing the kernel (valid only if `async_finish` is set).
"""
# Default config
config = self.get_dispatch_config(self.group_size) if config is None else config
# Internode
if self.runtime.get_num_rdma_ranks() > 1:
assert num_worst_tokens == 0, 'Internode dispatch does not support `num_worst_tokens > 0`'
return self.internode_dispatch(x, handle, num_tokens_per_rank, num_tokens_per_rdma_rank, is_token_in_rank, num_tokens_per_expert,
topk_idx, topk_weights, expert_alignment, config, previous_event, async_finish, allocate_on_comm_stream)
# Launch the kernel with cached or non-cached mode
x, x_scales = x if isinstance(x, tuple) else (x, None)
if handle is not None:
assert topk_idx is None and topk_weights is None
rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head = handle
num_recv_tokens = recv_src_idx.size(0)
recv_x, recv_x_scales, _, _, _, _, _, _, _, _, event = self.runtime.intranode_dispatch(
x, x_scales, None, None,
None, is_token_in_rank, None, num_recv_tokens, rank_prefix_matrix, channel_prefix_matrix,
expert_alignment, num_worst_tokens, config,
getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, None, None, None, None, EventOverlap(event)
else:
assert num_tokens_per_rank is not None and is_token_in_rank is not None and num_tokens_per_expert is not None
recv_x, recv_x_scales, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, send_head, event = \
self.runtime.intranode_dispatch(x, x_scales, topk_idx, topk_weights,
num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, 0, None, None,
expert_alignment, num_worst_tokens, config,
getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
handle = (rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head)
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, EventOverlap(event)
# noinspection PyTypeChecker
def combine(self, x: torch.Tensor, handle: Tuple,
topk_weights: Optional[torch.Tensor] = None,
bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] = None,
config: Optional[Config] = None,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[torch.Tensor, Optional[torch.Tensor], EventOverlap]:
"""
Combine (reduce) tokens (addition **without** weights) from different ranks; both intranode and internode
settings are supported.
Intranode kernels require all ranks to be visible via NVLink.
Internode kernels require the ranks within a node to be visible via NVLink, and the ranks with the same GPU
index across nodes to be visible via RDMA.
Arguments:
x: `[num_tokens, hidden]` with `torch.bfloat16`, the tokens to send back to their original ranks for reduction.
handle: a required communication handle, obtained from the `dispatch` function.
topk_weights: `[num_tokens, num_topk]` with `torch.float`, the tokens' top-k weights to reduce onto their original ranks.
config: the performance tuning config.
previous_event: the event to wait before actually executing the kernel.
async_finish: the current stream will not wait for the communication kernels to be finished if set.
allocate_on_comm_stream: whether to put the allocated tensors' ownership on the communication stream.
Returns:
recv_x: the reduced tokens from their dispatched ranks.
recv_topk_weights: the reduced top-k weights from their dispatched ranks.
event: the event after executing the kernel (valid only if `async_finish` is set).
"""
# Default config
config = self.get_combine_config(self.group_size) if config is None else config
# Internode
if self.runtime.get_num_rdma_ranks() > 1:
return self.internode_combine(x, handle, topk_weights, bias, config, previous_event, async_finish, allocate_on_comm_stream)
# NOTES: the second `_` is for the sending side, so we should use the third one
rank_prefix_matrix, _, channel_prefix_matrix, src_idx, is_recv_token_in_rank, send_head = handle
bias_0, bias_1 = Buffer._unpack_bias(bias)
# Launch the kernel
recv_x, recv_topk_weights, event = self.runtime.intranode_combine(
x, topk_weights, bias_0, bias_1,
src_idx, rank_prefix_matrix, channel_prefix_matrix, send_head, config,
getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
return recv_x, recv_topk_weights, EventOverlap(event)
# noinspection PyTypeChecker
def internode_dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
handle: Optional[Tuple] = None,
num_tokens_per_rank: Optional[torch.Tensor] = None, num_tokens_per_rdma_rank: Optional[torch.Tensor] = None,
is_token_in_rank: Optional[torch.Tensor] = None, num_tokens_per_expert: Optional[torch.Tensor] = None,
topk_idx: Optional[torch.Tensor] = None, topk_weights: Optional[torch.Tensor] = None, expert_alignment: int = 1,
config: Optional[Config] = None,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], Optional[torch.Tensor],
Optional[torch.Tensor], List[int], Tuple, EventOverlap]:
"""
Internode dispatch implementation; for more details, please refer to the `dispatch` docs.
Normally, you should not call this function directly.
"""
assert config is not None
# Launch the kernel with cached or non-cached mode
x, x_scales = x if isinstance(x, tuple) else (x, None)
if handle is not None:
assert topk_idx is None and topk_weights is None
is_token_in_rank, \
rdma_channel_prefix_matrix, gbl_channel_prefix_matrix, \
recv_rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, recv_gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum, \
recv_src_meta, send_rdma_head, send_nvl_head = handle
num_recv_tokens = recv_src_meta.size(0)
num_rdma_recv_tokens = send_nvl_head.size(0)
recv_x, recv_x_scales, _, _, _, _, _, _, _, _, _, _, _, _, event = self.runtime.internode_dispatch(
x, x_scales, topk_idx, topk_weights,
None, None, is_token_in_rank, None,
num_recv_tokens, num_rdma_recv_tokens,
rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum,
expert_alignment, config, getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, None, None, None, None, EventOverlap(event)
else:
assert num_tokens_per_rank is not None and is_token_in_rank is not None and num_tokens_per_expert is not None
recv_x, recv_x_scales, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, \
rdma_channel_prefix_matrix, gbl_channel_prefix_matrix, \
recv_rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, \
recv_gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum, \
recv_src_meta, send_rdma_head, send_nvl_head, event = self.runtime.internode_dispatch(
x, x_scales, topk_idx, topk_weights,
num_tokens_per_rank, num_tokens_per_rdma_rank, is_token_in_rank, num_tokens_per_expert,
0, 0, None, None, None, None,
expert_alignment, config, getattr(previous_event, 'event', None), async_finish, allocate_on_comm_stream)
handle = (is_token_in_rank,
rdma_channel_prefix_matrix, gbl_channel_prefix_matrix,
recv_rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, recv_gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum,
recv_src_meta, send_rdma_head, send_nvl_head)
return (recv_x, recv_x_scales) if x_scales is not None else recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, EventOverlap(event)
# noinspection PyTypeChecker
def internode_combine(self, x: torch.Tensor, handle: Union[tuple, list],
topk_weights: Optional[torch.Tensor] = None,
bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] = None,
config: Optional[Config] = None,
previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
allocate_on_comm_stream: bool = False) -> \
Tuple[torch.Tensor, Optional[torch.Tensor], EventOverlap]:
"""
Internode combine implementation; for more details, please refer to the `combine` docs.
Normally, you should not call this function directly.
"""
assert config is not None
# Unpack handle and bias
is_combined_token_in_rank, \
_, _, \
rdma_channel_prefix_matrix, rdma_rank_prefix_sum, gbl_channel_prefix_matrix, gbl_rank_prefix_sum, \
src_meta, send_rdma_head, send_nvl_head = handle
bias_0, bias_1 = Buffer._unpack_bias(bias)
# Launch the kernel
combined_x, combined_topk_weights, event = self.runtime.internode_combine(
x, topk_weights, bias_0, bias_1,
src_meta, is_combined_token_in_rank,
rdma_channel_prefix_matrix, rdma_rank_prefix_sum, gbl_channel_prefix_matrix,
send_rdma_head, send_nvl_head, config, getattr(previous_event, 'event', None),
async_finish, allocate_on_comm_stream)
return combined_x, combined_topk_weights, EventOverlap(event)
def clean_low_latency_buffer(self, num_max_dispatch_tokens_per_rank: int, hidden: int, num_experts: int) -> None:
"""
Low-latency kernels require part of the buffer to be zero-initialized, so it is vital to clean the buffer
if it has become dirty.
For example, after running the normal dispatch/combine, you must call this function before executing any
low-latency kernel.
Arguments:
num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
hidden: the hidden dimension of each token.
num_experts: the number of all experts.
"""
self.runtime.clean_low_latency_buffer(num_max_dispatch_tokens_per_rank, hidden, num_experts)
# noinspection PyTypeChecker
def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
num_max_dispatch_tokens_per_rank: int, num_experts: int,
cumulative_local_expert_recv_stats: Optional[torch.Tensor] = None,
use_fp8: bool = True, round_scale: bool = False, use_ue8m0: bool = False,
async_finish: bool = False, return_recv_hook: bool = False) -> \
Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor, Tuple, EventOverlap, Callable]:
"""
A low-latency implementation for dispatching with IBGDA.
This kernel requires all ranks (whether intranode or internode) to be visible via RDMA
(specifically, IBGDA must be enabled).
Warning: as there are only two buffers and the returned tensors reuse them, you cannot hold more than two
low-latency kernels' result tensors at a single moment.
Arguments:
x: `torch.Tensor` with `torch.bfloat16`, shaped as `[num_tokens, hidden]`, only several hidden shapes are
supported. The number of tokens to be dispatched must be less than `num_max_dispatch_tokens_per_rank`.
topk_idx: `torch.Tensor` with `torch.int64`, shaped as `[num_tokens, num_topk]`, only several top-k shapes
are supported. `-1` indices (not selecting any expert) are supported.
num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
num_experts: the number of all experts.
cumulative_local_expert_recv_stats: a cumulative expert count tensor for statistics, which should have shape
`[num_local_experts]` and dtype `torch.int`. This is useful for EP load-balance monitoring in online serving.
use_fp8: whether to enable FP8 casting; if enabled, the received data will be a tuple of an FP8 tensor and its scaling factors.
round_scale: whether to round the scaling factors to powers of 2.
use_ue8m0: whether to use UE8M0 as the scaling-factor format (available only with `round_scale=True`).
async_finish: the current stream will not wait for the communication kernels to be finished if set.
return_recv_hook: return a receiving hook if set. If set, the kernel only issues the RDMA requests,
**without actually receiving the data**; you must call the returned hook to make sure the data has arrived.
If you do not set this flag, the kernel itself ensures the data's arrival.
Returns:
recv_x: a tensor or tuple with received tokens for each expert.
With `use_fp8=True`: the first element is a `torch.Tensor` shaped as
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.float8_e4m3fn`.
The second tensor is the corresponding scales for the first element with shape
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden // 128]` with `torch.float`,
if `use_ue8m0=False`. With `use_ue8m0=True`, the second one is packed and shaped as
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden // 512]` with type `torch.int`.
Notice that the last two dimensions of the scaling tensors are column-major for TMA compatibility.
With `use_fp8=False`, the result would be a tensor shaped as
`[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.bfloat16`.
Moreover, not all tokens are valid; only some of the `num_max_dispatch_tokens_per_rank * num_ranks` slots are,
as we do not synchronize the CPU-side received count with the GPU (syncing would also break CUDA-graph compatibility).
recv_count: a tensor shaped `[num_local_experts]` with type `torch.int`, indicating how many tokens each
expert receives. As mentioned before, not all tokens are valid in `recv_x`.
handle: the communication handle to be used in the `low_latency_combine` function.
event: the event after executing the kernel (valid only if `async_finish` is set).
hook: the receiving hook function (valid only if `return_recv_hook` is set).
"""
packed_recv_x, packed_recv_x_scales, packed_recv_count, packed_recv_src_info, packed_recv_layout_range, event, hook = \
self.runtime.low_latency_dispatch(x, topk_idx,
cumulative_local_expert_recv_stats,
num_max_dispatch_tokens_per_rank, num_experts,
use_fp8, round_scale, use_ue8m0,
async_finish, return_recv_hook)
handle = (packed_recv_src_info, packed_recv_layout_range, num_max_dispatch_tokens_per_rank, x.size(1), num_experts)
tensors_to_record = (x, topk_idx,
packed_recv_x, packed_recv_x_scales, packed_recv_count,
packed_recv_src_info, packed_recv_layout_range,
cumulative_local_expert_recv_stats)
return (packed_recv_x, packed_recv_x_scales) if use_fp8 else packed_recv_x, packed_recv_count, handle, \
EventOverlap(event, tensors_to_record if async_finish else None), hook
# noinspection PyTypeChecker
def low_latency_combine(self, x: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor,
handle: tuple, zero_copy: bool = False, async_finish: bool = False,
return_recv_hook: bool = False, out: Optional[torch.Tensor] = None) -> \
Tuple[torch.Tensor, EventOverlap, Callable]:
"""
A low-latency implementation for combining tokens (reduce **with weights**) with IBGDA.
This kernel requires all ranks (whether intranode or internode) to be visible via RDMA
(specifically, IBGDA must be enabled).
Warning: as there are only two buffers and the returned tensors reuse them, you cannot hold more than two
low-latency kernels' result tensors at a single moment.
Arguments:
x: `[num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, hidden]` with `torch.bfloat16`,
the locally computed tokens to be sent back to their original ranks and reduced.
topk_idx: `[num_combined_tokens, num_topk]` with `torch.int64`, the expert indices selected by the dispatched
tokens. `-1` indices (not selecting any expert) are supported. Note that `num_combined_tokens` equals
the number of dispatched tokens.
topk_weights: `[num_combined_tokens, num_topk]` with `torch.float`, the expert weights selected by the dispatched
tokens. The received tokens will be reduced with the weights in this tensor.
handle: the communication handle given by the `dispatch` function.
zero_copy: whether the tensor has already been copied into the RDMA buffer; this should be used together
with `get_next_low_latency_combine_buffer`.
async_finish: the current stream will not wait for the communication kernels to be finished if set.
return_recv_hook: return a receiving hook if set. If set, the kernel only issues the RDMA requests,
**without actually receiving the data**; you must call the returned hook to make sure the data has arrived.
If you do not set this flag, the kernel itself ensures the data's arrival.
out: the in-place output tensor, if set, the kernel will write the result to this tensor and return it directly.
Returns:
combined_x: the reduced token tensor, with shape `[num_combined_tokens, hidden]` and type `torch.bfloat16`.
event: the event after executing the kernel (valid only if `async_finish` is set).
hook: the receiving hook function (valid only if `return_recv_hook` is set).
"""
src_info, layout_range, num_max_dispatch_tokens_per_rank, hidden, num_experts = handle
combined_x, event, hook = self.runtime.low_latency_combine(x, topk_idx, topk_weights, src_info, layout_range,
num_max_dispatch_tokens_per_rank, num_experts,
zero_copy, async_finish, return_recv_hook, out)
tensors_to_record = (x, topk_idx, topk_weights, src_info, layout_range, combined_x)
return combined_x, EventOverlap(event, tensors_to_record if async_finish else None), hook
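# NOTES: an illustrative sketch of the low-latency flow, based only on the signatures above; the
# expert-compute step (`run_local_experts`) and the tensor names are assumptions:
#   recv, recv_count, handle, event, hook = buffer.low_latency_dispatch(
#       x, topk_idx, num_max_dispatch_tokens_per_rank, num_experts, return_recv_hook=True)
#   hook()  # with `return_recv_hook=True`, data is only guaranteed to have arrived after this call
#   out = run_local_experts(recv, recv_count)  # hypothetical per-expert compute
#   combined_x, event, hook = buffer.low_latency_combine(out, topk_idx, topk_weights, handle)
# Remember to call `clean_low_latency_buffer` first if normal dispatch/combine kernels ran before this.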
def get_next_low_latency_combine_buffer(self, handle: object):
"""
Get the raw registered RDMA buffer tensor for the next low-latency combine, so that the next combine kernel can skip the copy.
Arguments:
handle: the communication handle given by the `dispatch` function.
Returns:
buffer: the raw RDMA low-latency buffer as a BF16 PyTorch tensor with shape
`[num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, hidden]`; you should fill this buffer
yourself.
"""
src_info, layout_range, num_max_dispatch_tokens_per_rank, hidden, num_experts = handle
return self.runtime.get_next_low_latency_combine_buffer(num_max_dispatch_tokens_per_rank, hidden, num_experts)

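The class above is easiest to digest as a single end-to-end flow. Below is a condensed usage sketch of the high-throughput path (layout, dispatch, expert compute, combine), mirroring the tests later in this commit; the buffer construction, tensor shapes, and the placeholder expert computation are illustrative assumptions rather than part of the library.

```python
import torch
import torch.distributed as dist
import deep_ep

def moe_all_to_all(buffer: deep_ep.Buffer, x: torch.Tensor, topk_idx: torch.Tensor,
                   topk_weights: torch.Tensor, num_experts: int) -> torch.Tensor:
    # 1) Compute the routing layout (token counts per rank/expert and the membership mask).
    num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, _ = \
        buffer.get_dispatch_layout(topk_idx, num_experts)
    # 2) Dispatch tokens to the ranks owning their selected experts.
    recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, _ = \
        buffer.dispatch(x, num_tokens_per_rank=num_tokens_per_rank,
                        num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
                        is_token_in_rank=is_token_in_rank,
                        num_tokens_per_expert=num_tokens_per_expert,
                        topk_idx=topk_idx, topk_weights=topk_weights)
    # 3) Run the local experts (placeholder: identity compute).
    expert_out = recv_x
    # 4) Combine (reduce) the expert outputs back onto the tokens' original ranks.
    combined_x, _, _ = buffer.combine(expert_out, handle, topk_weights=recv_topk_weights)
    return combined_x

# Typical construction, assuming `dist.init_process_group` has already been called:
#   buffer = deep_ep.Buffer(dist.group.WORLD, num_nvl_bytes=int(1e9))
```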
101
DeepEP/deep_ep/utils.py Normal file
View File

@ -0,0 +1,101 @@
import os
import subprocess
import torch
import torch.distributed as dist
from typing import Any, Optional, Tuple
# noinspection PyUnresolvedReferences
from deep_ep_cpp import Config, EventHandle
class EventOverlap:
"""
A wrapper class to manage CUDA events, and to make overlapping more convenient.
Attributes:
event: the CUDA event captured.
extra_tensors: an easier way to simulate PyTorch tensors' `record_stream`, which may be useful with CUDA graphs.
"""
def __init__(self, event: Optional[EventHandle] = None,
extra_tensors: Optional[Tuple[torch.Tensor]] = None) -> None:
"""
Initialize the class.
Arguments:
event: the CUDA event captured.
extra_tensors: an easier way to simulate PyTorch tensors' `record_stream`, which may be useful with CUDA graphs.
"""
self.event = event
# NOTES: we use extra tensors to achieve stream recording, otherwise,
# stream recording will be incompatible with CUDA graph.
self.extra_tensors = extra_tensors
def current_stream_wait(self) -> None:
"""
The current stream `torch.cuda.current_stream()` waits for the event to be finished.
"""
assert self.event is not None
self.event.current_stream_wait()
def __enter__(self) -> Any:
"""
Utility for overlapping and Python `with` syntax.
You can overlap the kernels on the current stream with the following example:
```python
event_overlap = event_after_all_to_all_kernels()
with event_overlap:
do_something_on_current_stream()
# After exiting the `with` scope, the current stream will wait for the event to finish.
```
"""
return self
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
"""
Utility for overlapping and Python `with` syntax.
Please follow the example in the `__enter__` function.
"""
if self.event is not None:
self.event.current_stream_wait()
def check_nvlink_connections(group: dist.ProcessGroup):
"""
Check NVLink connection between every pair of GPUs.
Arguments:
group: the communication group.
"""
# Check NVLink connection
# NOTES: some A100 PCIe GPUs only have pairwise NVLink connections, so only EP2 is usable
# TODO: check all cases, all local-node GPUs in the group should be connected via NVLink
if 'PCIE' in torch.cuda.get_device_name():
assert group.size() <= 2, 'PCIe GPUs only have pairwise NVLink connections'
# noinspection PyUnresolvedReferences
import pynvml
pynvml.nvmlInit()
# noinspection PyTypeChecker
devices = os.environ.get('CUDA_VISIBLE_DEVICES', '0,1,2,3,4,5,6,7').strip(',').split(',')
physical_device_idx = int(devices[torch.cuda.current_device()])
physical_device_indices = [0, ] * group.size()
dist.all_gather_object(physical_device_indices, physical_device_idx, group)
# Check whether they are all connected via NVLink
# Reference: https://github.com/vllm-project/vllm/blob/b8e809a057765c574726a6077fd124db5077ce1f/vllm/platforms/cuda.py#L438
handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_indices]
for i, handle in enumerate(handles):
for j, peer_handle in enumerate(handles):
if i >= j:
continue
status = pynvml.nvmlDeviceGetP2PStatus(handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
assert status == pynvml.NVML_P2P_STATUS_OK,\
f'GPU {physical_device_indices[i]} and GPU {physical_device_indices[j]} are not connected via NVLink'
# Close NVML
pynvml.nvmlShutdown()

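`EventOverlap` is what makes `async_finish` useful in practice. A small hedged sketch of the overlapping pattern, following the tests below, is shown here; `buffer`, `dispatch_args`, and `other_work()` are assumed to exist.

```python
# Record an event behind whatever is already queued, so the dispatch kernels wait on it.
previous_event = buffer.capture()
recv_x, _, _, _, handle, event = buffer.dispatch(**dispatch_args,
                                                 previous_event=previous_event,
                                                 async_finish=True)
# The communication runs on the communication stream; overlap compute on the current stream here.
with event:
    other_work()
# Leaving the `with` block makes the current stream wait for the communication to finish.
```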
12
DeepEP/install.sh Executable file
View File

@ -0,0 +1,12 @@
# Change current directory into project root
original_dir=$(pwd)
script_dir=$(dirname "$0")
cd "$script_dir"
# Remove old dist file, build, and install
rm -rf dist
python setup.py bdist_wheel
pip install dist/*.whl
# Return to the user's original directory
cd "$original_dir"

107
DeepEP/setup.py Normal file
View File

@ -0,0 +1,107 @@
import os
import subprocess
import setuptools
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
if __name__ == '__main__':
nvshmem_dir = os.getenv('NVSHMEM_DIR', None)
disable_nvshmem = nvshmem_dir is None
if disable_nvshmem:
print('Warning: `NVSHMEM_DIR` is not specified, all internode and low-latency features are disabled\n')
else:
assert os.path.exists(nvshmem_dir), f'Failed to find NVSHMEM: {nvshmem_dir}'
cxx_flags = ['-O3', '-Wno-deprecated-declarations', '-Wno-unused-variable',
'-Wno-sign-compare', '-Wno-reorder', '-Wno-attributes']
nvcc_flags = ['-O3', '-Xcompiler', '-O3']
sources = ['csrc/deep_ep.cpp', 'csrc/kernels/runtime.cu', 'csrc/kernels/layout.cu', 'csrc/kernels/intranode.cu']
include_dirs = ['csrc/']
library_dirs = []
nvcc_dlink = []
extra_link_args = []
# NVSHMEM flags
if disable_nvshmem:
cxx_flags.append('-DDISABLE_NVSHMEM')
nvcc_flags.append('-DDISABLE_NVSHMEM')
else:
sources.extend(['csrc/kernels/internode.cu', 'csrc/kernels/internode_ll.cu'])
include_dirs.extend([f'{nvshmem_dir}/include'])
library_dirs.extend([f'{nvshmem_dir}/lib'])
nvcc_dlink.extend(['-dlink', f'-L{nvshmem_dir}/lib', '-lnvshmem'])
extra_link_args.extend(['-l:libnvshmem.a', '-l:nvshmem_bootstrap_uid.so', f'-Wl,-rpath,{nvshmem_dir}/lib'])
if int(os.getenv('DISABLE_SM90_FEATURES', 0)):
# Prefer A100
os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '8.0')
# Disable some SM90 features: FP8, launch methods, and TMA
cxx_flags.append('-DDISABLE_SM90_FEATURES')
nvcc_flags.append('-DDISABLE_SM90_FEATURES')
# Disable internode and low-latency kernels
assert disable_nvshmem
else:
# Prefer H800 series
os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '9.0')
# CUDA 12 flags
nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])
# Disable LD/ST tricks, as some CUDA versions do not support `.L1::no_allocate`
if os.environ['TORCH_CUDA_ARCH_LIST'].strip() != '9.0':
assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1
os.environ['DISABLE_AGGRESSIVE_PTX_INSTRS'] = '1'
# Disable aggressive PTX instructions
if int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', '0')):
cxx_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
nvcc_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
# Put them together
extra_compile_args = {
'cxx': cxx_flags,
'nvcc': nvcc_flags,
}
if len(nvcc_dlink) > 0:
extra_compile_args['nvcc_dlink'] = nvcc_dlink
# Summary
print(f'Build summary:')
print(f' > Sources: {sources}')
print(f' > Includes: {include_dirs}')
print(f' > Libraries: {library_dirs}')
print(f' > Compilation flags: {extra_compile_args}')
print(f' > Link flags: {extra_link_args}')
print(f' > Arch list: {os.environ["TORCH_CUDA_ARCH_LIST"]}')
print(f' > NVSHMEM path: {nvshmem_dir}')
print()
# noinspection PyBroadException
try:
cmd = ['git', 'rev-parse', '--short', 'HEAD']
revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
except Exception as _:
revision = ''
setuptools.setup(
name='deep_ep',
version='1.1.0' + revision,
packages=setuptools.find_packages(
include=['deep_ep']
),
ext_modules=[
CUDAExtension(
name='deep_ep_cpp',
include_dirs=include_dirs,
library_dirs=library_dirs,
sources=sources,
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args
)
],
cmdclass={
'build_ext': BuildExtension
}
)

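The build is driven entirely by environment variables read in `setup.py` above. A hedged sketch of invoking it from Python is below; the NVSHMEM path is an illustrative assumption.

```python
import os
import subprocess

env = dict(os.environ)
env.setdefault('NVSHMEM_DIR', '/opt/nvshmem')      # enables the internode and low-latency kernels
env.setdefault('TORCH_CUDA_ARCH_LIST', '9.0')      # the H800-series default used by setup.py
# env['DISABLE_SM90_FEATURES'] = '1'               # A100 builds; requires NVSHMEM to stay disabled
subprocess.check_call(['python', 'setup.py', 'bdist_wheel'], env=env)
```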
254
DeepEP/tests/test_internode.py Normal file
View File

@ -0,0 +1,254 @@
import os
import time
import torch
import torch.distributed as dist
# noinspection PyUnresolvedReferences
import deep_ep
from utils import init_dist, bench, calc_diff, create_grouped_scores, inplace_unique, per_token_cast_to_fp8, per_token_cast_back
# Test compatibility with low latency functions
import test_low_latency
def test_main(num_sms: int, local_rank: int, num_local_ranks: int, num_ranks: int, num_nodes: int, rank: int, buffer: deep_ep.Buffer, group: dist.ProcessGroup):
# Settings
num_tokens, hidden, num_topk_groups, num_topk, num_experts = 4096, 7168, min(num_nodes, 4), 8, (256 // num_ranks) * num_ranks
assert num_experts % num_ranks == 0 and num_local_ranks == 8
if local_rank == 0:
print(f'[config] num_tokens={num_tokens}, hidden={hidden}, num_topk_groups={num_topk_groups}, num_topk={num_topk}', flush=True)
# Random data
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * rank
x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
x_e4m3 = per_token_cast_to_fp8(x)
x_e4m3 = (x_e4m3[0], x_e4m3[1].T.contiguous().T)
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
group_scores = scores.view(num_tokens, num_nodes, -1).amax(dim=-1)
group_idx = torch.topk(group_scores, k=num_topk_groups, dim=-1, sorted=False).indices
masked_scores = create_grouped_scores(scores, group_idx, num_nodes)
topk_idx = torch.topk(masked_scores, num_topk, dim=-1, largest=True, sorted=False)[1]
topk_weights = torch.ones((num_tokens, num_topk), dtype=torch.float32, device='cuda') * rank
topk_weights_pure_rand = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda')
rank_idx = topk_idx // (num_experts // num_ranks)
rank_idx.masked_fill_(topk_idx == -1, -1)
inplace_unique(rank_idx, num_ranks)
rdma_rank_idx = rank_idx // num_local_ranks
rdma_rank_idx.masked_fill_(rank_idx == -1, -1)
inplace_unique(rdma_rank_idx, num_nodes)
# RDMA dispatch counts
rdma_idx = topk_idx // (num_experts // num_nodes)
rdma_idx.masked_fill_(topk_idx == -1, -1)
inplace_unique(rdma_idx, num_nodes)
num_rdma_token_sent = rdma_idx.ne(-1).sum().item()
# Expert meta
num_tokens_per_expert = torch.zeros((num_experts, ), dtype=torch.int, device='cuda')
for i in range(num_experts):
num_tokens_per_expert[i] = (topk_idx == i).sum()
gbl_num_tokens_per_expert = num_tokens_per_expert.clone()
dist.all_reduce(gbl_num_tokens_per_expert, group=group)
# Rank layout meta
num_tokens_per_rank = torch.empty((num_ranks, ), dtype=torch.int, device='cuda')
num_tokens_per_rdma_rank = torch.empty((num_nodes, ), dtype=torch.int, device='cuda')
token_idx_in_rank = torch.full((num_ranks, num_tokens), -1, dtype=torch.long, device='cuda')
for i in range(num_ranks):
num_tokens_per_rank[i] = (rank_idx == i).sum()
token_sel = (rank_idx == i).max(dim=-1)[0]
count = token_sel.sum().item()
tokens = torch.sort(token_sel.to(torch.int), descending=True)[1]
tokens[:count] = torch.sort(tokens[:count])[0]
token_idx_in_rank[i][tokens[:count]] = torch.arange(count, dtype=torch.long, device='cuda')
for i in range(num_nodes):
num_tokens_per_rdma_rank[i] = (rdma_rank_idx == i).sum()
token_idx_in_rank = token_idx_in_rank.T.contiguous().to(torch.int)
is_token_in_rank = token_idx_in_rank >= 0
gbl_num_tokens_per_rank = num_tokens_per_rank.clone()
dist.all_reduce(gbl_num_tokens_per_rank, group=group)
ref_num_tokens_per_rank, ref_num_tokens_per_rdma_rank, ref_num_tokens_per_expert, ref_is_token_in_rank, _ = \
buffer.get_dispatch_layout(topk_idx, num_experts)
assert torch.allclose(ref_num_tokens_per_rank, num_tokens_per_rank)
assert torch.allclose(ref_num_tokens_per_rdma_rank, num_tokens_per_rdma_rank)
assert torch.allclose(ref_num_tokens_per_expert, num_tokens_per_expert)
assert torch.allclose(ref_is_token_in_rank, is_token_in_rank)
t = bench(lambda: buffer.get_dispatch_layout(topk_idx, num_experts))[0]
if local_rank == 0:
print(f'[layout] Kernel performance: {t * 1000:.3f} ms', flush=True)
print('', flush=True)
group.barrier()
time.sleep(1)
# Config
rdma_buffer_size, nvl_buffer_size = 128, (720 if num_ranks in (144, 160) else 512)
config = deep_ep.Config(num_sms, 8, nvl_buffer_size, 16, rdma_buffer_size)
# Test dispatch
# noinspection PyShadowingNames
def check_data(check_x, recv_gbl_rank_prefix_sum):
assert torch.allclose(check_x.amin(dim=1), check_x.amax(dim=1))
check_start = 0
for i in range(num_ranks):
check_end = recv_gbl_rank_prefix_sum[i].item()
assert (check_x[check_start:check_end, :].int() - i).sum().item() == 0
check_start = check_end
for previous_mode in (False, True):
for async_mode in (False, True):
for current_x in (x_pure_rand, x, x_e4m3):
for with_topk in (False, True):
if local_rank == 0:
print(f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, {"with" if with_topk else "without"} top-k (async={async_mode}, previous={previous_mode}) ...', flush=True, end='')
dispatch_args = {'x': current_x, 'num_tokens_per_rank': num_tokens_per_rank, 'num_tokens_per_rdma_rank': num_tokens_per_rdma_rank, 'is_token_in_rank': is_token_in_rank,
'num_tokens_per_expert': num_tokens_per_expert, 'config': config, 'async_finish': async_mode}
if with_topk:
dispatch_args.update({'topk_idx': topk_idx, 'topk_weights': topk_weights_pure_rand if current_x is x_pure_rand else topk_weights})
if previous_mode:
dispatch_args.update({'previous_event': buffer.capture()})
recv_x, recv_topk_idx, recv_topk_weights, recv_num_tokens_per_expert_list, handle, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
# Checks
recv_gbl_rank_prefix_sum = handle[-4]
assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(0), f'{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}'
assert gbl_num_tokens_per_expert.view(num_ranks, -1)[rank].tolist() == recv_num_tokens_per_expert_list
if current_x is not x_pure_rand:
check_data(recv_x, recv_gbl_rank_prefix_sum)
if with_topk:
# Check `topk_idx`
assert (recv_topk_idx.eq(-1) | ((recv_topk_idx >= 0) & (recv_topk_idx < (num_experts // num_ranks)))).sum().item() == recv_topk_idx.numel()
for i, count in enumerate(recv_num_tokens_per_expert_list):
assert recv_topk_idx.eq(i).sum().item() == count
# Check `topk_weights`
if current_x is not x_pure_rand:
recv_topk_weights[recv_topk_idx.eq(-1)] = recv_topk_weights.amax(dim=1, keepdim=True).expand_as(recv_topk_weights)[recv_topk_idx.eq(-1)]
check_data(recv_topk_weights, recv_gbl_rank_prefix_sum)
# Test cached dispatch (must be without the top-k stuff)
if not with_topk:
dispatch_args = {'x': current_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
if previous_mode:
dispatch_args.update({'previous_event': buffer.capture()})
recv_x, _, _, _, _, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
if current_x is not x_pure_rand:
check_data(recv_x, recv_gbl_rank_prefix_sum)
# Test combine
bias_0 = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
bias_1 = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
combine_args = {'x': recv_x, 'bias': (bias_0, bias_1), 'handle': handle, 'config': config, 'async_finish': async_mode}
if with_topk:
combine_args.update({'topk_weights': recv_topk_weights})
if previous_mode:
combine_args.update({'previous_event': buffer.capture()})
combined_x, combined_topk_weights, event = buffer.combine(**combine_args)
event.current_stream_wait() if async_mode else ()
check_x = (combined_x.float() - bias_0.float() - bias_1.float()) / is_token_in_rank.sum(dim=1).unsqueeze(1)
ref_x = x_pure_rand if current_x is x_pure_rand else x
assert calc_diff(check_x, ref_x) < 5e-6
if with_topk:
check_topk_weights = combined_topk_weights if (current_x is x_pure_rand) else (combined_topk_weights / is_token_in_rank.sum(dim=1).unsqueeze(1))
ref_topk_weights = topk_weights_pure_rand if current_x is x_pure_rand else topk_weights
assert calc_diff(check_topk_weights, ref_topk_weights) < 1e-9
# For later tuning
dispatch_bf16_rdma_send_bytes = num_rdma_token_sent * hidden * 2
dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2
combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes
combine_bf16_rdma_recv_bytes = dispatch_bf16_rdma_send_bytes
if local_rank == 0:
print(' passed', flush=True)
if local_rank == 0:
print('', flush=True)
# Tune dispatch performance
best_dispatch_results = None
fp8_factor = (1 + 4 / 128) / 2
for current_x in (x_e4m3, x):
best_time, best_results = 1e10, None
rdma_send_bytes = (dispatch_bf16_rdma_send_bytes * fp8_factor) if isinstance(current_x, tuple) else dispatch_bf16_rdma_send_bytes
nvl_recv_bytes = (dispatch_bf16_nvl_recv_bytes * fp8_factor) if isinstance(current_x, tuple) else dispatch_bf16_nvl_recv_bytes
for nvl_chunk_size in range(4, 33, 4):
for rdma_chunk_size in range(4, 33, 4):
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size, rdma_chunk_size, rdma_buffer_size)
tune_args = {'x': current_x, 'handle': handle, 'config': config}
t = bench(lambda: buffer.dispatch(**tune_args))[0]
if t < best_time:
best_time, best_results = t, (num_sms, nvl_chunk_size, rdma_chunk_size)
if local_rank == 0:
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {rdma_send_bytes / 1e9 / t:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL) ', flush=True)
if local_rank == 0:
print(f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {rdma_send_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL)', flush=True)
print('', flush=True)
if isinstance(current_x, tuple):
# Gather the best FP8 config from rank 0
best_dispatch_results = torch.tensor([best_results[0], best_results[1], best_results[2]], dtype=torch.int32, device='cuda')
all_best_fp8_results_list = [torch.zeros_like(best_dispatch_results) for _ in range(torch.distributed.get_world_size())]
dist.all_gather(all_best_fp8_results_list, best_dispatch_results, group=group)
best_dispatch_results = all_best_fp8_results_list[0].tolist()
dispatch_config = deep_ep.Config(best_dispatch_results[0], best_dispatch_results[1], nvl_buffer_size, best_dispatch_results[2], rdma_buffer_size)
dispatch_args = {'x': x, 'num_tokens_per_rank': num_tokens_per_rank, 'num_tokens_per_rdma_rank': num_tokens_per_rdma_rank,
'is_token_in_rank': is_token_in_rank, 'num_tokens_per_expert': num_tokens_per_expert,
'config': dispatch_config if dispatch_config is not None else config}
recv_x, _, _, _, handle, _ = buffer.dispatch(**dispatch_args)
# Tune combine performance
best_time, best_results = 1e10, None
for nvl_chunk_size in range(1, 5, 1):
for rdma_chunk_size in range(8, 33, 4):
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size, rdma_chunk_size, rdma_buffer_size)
tune_args = {'x': recv_x, 'handle': handle, 'config': config}
t = bench(lambda: buffer.combine(**tune_args))[0]
if local_rank == 0:
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {combine_bf16_rdma_recv_bytes / 1e9 / t:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL) ', flush=True)
if t < best_time:
best_time, best_results = t, (num_sms, nvl_chunk_size, rdma_chunk_size)
if local_rank == 0:
print(f'[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {combine_bf16_rdma_recv_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL)', flush=True)
print('', flush=True)
# noinspection PyUnboundLocalVariable
def test_loop(local_rank: int, num_local_ranks: int):
num_nodes = int(os.getenv('WORLD_SIZE', 1))
rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
test_ll_compatibility = bool(int(os.getenv('EP_TEST_LL_COMPATIBILITY', '0')))
if test_ll_compatibility:
ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
num_sms = 24
num_qps_per_rank = max(num_sms, ll_num_experts // num_ranks if test_ll_compatibility else 0)
buffer = deep_ep.Buffer(group, int(1e9), int(1e9), low_latency_mode=test_ll_compatibility,
num_qps_per_rank=num_qps_per_rank)
assert num_local_ranks == 8 and num_ranks > 8
torch.manual_seed(rank)
for i in (num_sms, ):
test_main(i, local_rank, num_local_ranks, num_ranks, num_nodes, rank, buffer, group)
if local_rank == 0:
print('', flush=True)
# Test compatibility with low latency functions
if test_ll_compatibility:
buffer.clean_low_latency_buffer(ll_num_tokens, ll_hidden, ll_num_experts)
test_low_latency.test_main(ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk, rank, num_ranks, group, buffer, seed=1)
# Destroy the communication group
dist.barrier()
dist.destroy_process_group()
if __name__ == '__main__':
num_processes = 8
torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)

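The routing metadata that the test builds by hand (and cross-checks against `get_dispatch_layout`) is easier to see on a toy case; the sizes below are made up for readability and are not the test's settings.

```python
import torch

num_experts, num_ranks = 8, 4                        # toy sizes; experts 0-1 live on rank 0, 2-3 on rank 1, ...
topk_idx = torch.tensor([[0, 3, 5, -1]])             # one token, four top-k slots, one slot unused
rank_idx = topk_idx // (num_experts // num_ranks)    # map each expert index to its owning rank
rank_idx.masked_fill_(topk_idx == -1, -1)            # keep the "no selection" marker
is_token_in_rank = torch.zeros(1, num_ranks, dtype=torch.bool)
is_token_in_rank[0, rank_idx[rank_idx >= 0]] = True  # the token is routed to ranks 0, 1 and 2
print(rank_idx.tolist(), is_token_in_rank.tolist())  # [[0, 1, 2, -1]] [[True, True, True, False]]
```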
256
DeepEP/tests/test_intranode.py Normal file
View File

@ -0,0 +1,256 @@
import time
import torch
import torch.distributed as dist
# noinspection PyUnresolvedReferences
import deep_ep
from utils import init_dist, bench, calc_diff, inplace_unique, per_token_cast_to_fp8, per_token_cast_back
# Test compatibility with low latency functions
import test_low_latency
def test_main(num_sms: int, local_rank: int, num_ranks: int, rank: int, buffer: deep_ep.Buffer, group: dist.ProcessGroup):
# Settings
num_tokens, hidden, num_topk, num_experts = 4096, 7168, 8, (256 // num_ranks) * num_ranks
assert num_experts % num_ranks == 0
if local_rank == 0:
print(f'[config] num_tokens={num_tokens}, hidden={hidden}, num_topk={num_topk}', flush=True)
# Random data
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * rank
x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
x_e4m3 = per_token_cast_to_fp8(x) if deep_ep.Buffer.is_sm90_compiled() else None
x_e4m3 = (x_e4m3[0], x_e4m3[1].T.contiguous().T) if x_e4m3 is not None else None
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)[1]
topk_weights = torch.ones((num_tokens, num_topk), dtype=torch.float32, device='cuda') * rank
topk_weights_pure_rand = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda')
rank_idx = topk_idx // (num_experts // num_ranks)
rank_idx.masked_fill_(topk_idx == -1, -1)
inplace_unique(rank_idx, num_ranks)
# Expert meta
num_tokens_per_expert = torch.zeros((num_experts, ), dtype=torch.int, device='cuda')
for i in range(num_experts):
num_tokens_per_expert[i] = (topk_idx == i).sum()
gbl_num_tokens_per_expert = num_tokens_per_expert.clone()
dist.all_reduce(gbl_num_tokens_per_expert, group=group)
# Rank layout meta
num_tokens_per_rank = torch.empty((num_ranks, ), dtype=torch.int, device='cuda')
token_idx_in_rank = torch.full((num_ranks, num_tokens), -1, dtype=torch.long, device='cuda')
for i in range(num_ranks):
num_tokens_per_rank[i] = (rank_idx == i).sum()
token_sel = (rank_idx == i).max(dim=-1)[0]
count = token_sel.sum().item()
tokens = torch.sort(token_sel.to(torch.int), descending=True)[1]
tokens[:count] = torch.sort(tokens[:count])[0]
token_idx_in_rank[i][tokens[:count]] = torch.arange(count, dtype=torch.long, device='cuda')
token_idx_in_rank = token_idx_in_rank.T.contiguous().to(torch.int)
is_token_in_rank = token_idx_in_rank >= 0
gbl_num_tokens_per_rank = num_tokens_per_rank.clone()
dist.all_reduce(gbl_num_tokens_per_rank, group=group)
ref_num_tokens_per_rank, _, ref_num_tokens_per_expert, ref_is_token_in_rank, _ = \
buffer.get_dispatch_layout(topk_idx, num_experts)
assert torch.allclose(ref_num_tokens_per_rank, num_tokens_per_rank)
assert torch.allclose(ref_num_tokens_per_expert, num_tokens_per_expert)
assert torch.allclose(ref_is_token_in_rank, is_token_in_rank)
t = bench(lambda: buffer.get_dispatch_layout(topk_idx, num_experts))[0]
if local_rank == 0:
print(f'[layout] Kernel performance: {t * 1000:.3f} ms', flush=True)
print('', flush=True)
group.barrier()
time.sleep(1)
# Config
nvl_buffer_size = 256
config = deep_ep.Config(num_sms, 8, nvl_buffer_size)
# Test dispatch
# noinspection PyShadowingNames
def check_data(check_x, rank_prefix_matrix):
assert torch.allclose(check_x.amin(dim=1), check_x.amax(dim=1))
check_start = 0
for i in range(num_ranks):
check_end = rank_prefix_matrix[i][rank].item()
assert (check_x[check_start:check_end, :].int() - i).sum().item() == 0
check_start = check_end
for previous_mode in (False, True):
for async_mode in (False, True):
for current_x in filter(lambda elem: elem is not None, (x_pure_rand, x, x_e4m3)):
for with_topk in (False, True):
if local_rank == 0:
print(f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, {"with" if with_topk else "without"} top-k (async={async_mode}, previous={previous_mode}) ...', flush=True, end='')
dispatch_args = {'x': current_x, 'num_tokens_per_rank': num_tokens_per_rank, 'is_token_in_rank': is_token_in_rank,
'num_tokens_per_expert': num_tokens_per_expert, 'config': config, 'async_finish': async_mode}
if with_topk:
dispatch_args.update({'topk_idx': topk_idx, 'topk_weights': topk_weights_pure_rand if current_x is x_pure_rand else topk_weights})
if previous_mode:
dispatch_args.update({'previous_event': buffer.capture()})
recv_x, recv_topk_idx, recv_topk_weights, recv_num_tokens_per_expert_list, handle, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
# Checks
rank_prefix_matrix = handle[0]
assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(0), f'{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}'
assert gbl_num_tokens_per_expert.view(num_ranks, -1)[rank].tolist() == recv_num_tokens_per_expert_list
if current_x is not x_pure_rand:
check_data(recv_x, rank_prefix_matrix)
recv_topk_weights_clone = None
if with_topk:
# Check `topk_idx`
assert (recv_topk_idx.eq(-1) | ((recv_topk_idx >= 0) & (recv_topk_idx < (num_experts // num_ranks)))).sum().item() == recv_topk_idx.numel()
for i, count in enumerate(recv_num_tokens_per_expert_list):
assert recv_topk_idx.eq(i).sum().item() == count
# Check `topk_weights`
recv_topk_weights_clone = recv_topk_weights.clone()
if current_x is not x_pure_rand:
recv_topk_weights[recv_topk_idx.eq(-1)] = recv_topk_weights.amax(dim=1, keepdim=True).expand_as(recv_topk_weights)[recv_topk_idx.eq(-1)]
check_data(recv_topk_weights, rank_prefix_matrix)
# Test `num_worst_tokens != 0`
if with_topk:
num_worst_tokens = num_tokens * num_ranks
dispatch_args.update({'num_worst_tokens': num_worst_tokens})
recv_worst_x, recv_worst_topk_idx, recv_worst_topk_weights, empty_list, _, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_worst_x = per_token_cast_back(*recv_worst_x) if isinstance(recv_worst_x, tuple) else recv_worst_x
assert len(empty_list) == 0
assert num_worst_tokens == recv_worst_x.size(0)
assert num_worst_tokens == recv_worst_topk_idx.size(0)
assert num_worst_tokens == recv_worst_topk_weights.size(0)
assert torch.equal(recv_x, recv_worst_x[:recv_x.size(0)])
assert torch.equal(recv_topk_idx, recv_worst_topk_idx[:recv_x.size(0)])
assert torch.equal(recv_topk_weights_clone, recv_worst_topk_weights[:recv_x.size(0)])
assert torch.all(recv_worst_topk_idx[recv_x.size(0):] == -1).item()
# Test cached dispatch (must be without the top-k stuff)
if not with_topk:
dispatch_args = {'x': current_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
if previous_mode:
dispatch_args.update({'previous_event': buffer.capture()})
recv_x, _, _, _, _, event = buffer.dispatch(**dispatch_args)
event.current_stream_wait() if async_mode else ()
recv_x = per_token_cast_back(*recv_x) if isinstance(recv_x, tuple) else recv_x
if current_x is not x_pure_rand:
check_data(recv_x, rank_prefix_matrix)
# Test combine
combine_args = {'x': recv_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
if with_topk:
combine_args.update({'topk_weights': recv_topk_weights})
if previous_mode:
combine_args.update({'previous_event': buffer.capture()})
combined_x, combined_topk_weights, event = buffer.combine(**combine_args)
event.current_stream_wait() if async_mode else ()
check_x = combined_x.float() / is_token_in_rank.sum(dim=1).unsqueeze(1)
ref_x = x_pure_rand if current_x is x_pure_rand else x
assert calc_diff(check_x, ref_x) < 5e-6
if with_topk:
check_topk_weights = combined_topk_weights if (current_x is x_pure_rand) else (combined_topk_weights / is_token_in_rank.sum(dim=1).unsqueeze(1))
ref_topk_weights = topk_weights_pure_rand if current_x is x_pure_rand else topk_weights
assert calc_diff(check_topk_weights, ref_topk_weights) < 1e-9
# For later tuning
dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2
combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes
if local_rank == 0:
print(' passed', flush=True)
if local_rank == 0:
print('', flush=True)
# Tune dispatch performance
best_dispatch_results = None
fp8_factor = (1 + 4 / 128) / 2
for current_x in filter(lambda elem: elem is not None, (x_e4m3, x)):
best_time, best_results = 1e10, None
nvl_recv_bytes = (dispatch_bf16_nvl_recv_bytes * fp8_factor) if isinstance(current_x, tuple) else dispatch_bf16_nvl_recv_bytes
for nvl_chunk_size in tuple(range(4, 33, 2)) + (0, ):
if nvl_chunk_size > 0:
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size)
else:
# Test default config as well
deep_ep.Buffer.set_num_sms(num_sms)
config = deep_ep.Buffer.get_dispatch_config(num_ranks)
tune_args = {'x': current_x, 'handle': handle, 'config': config}
t = bench(lambda: buffer.dispatch(**tune_args))[0]
if t < best_time and nvl_chunk_size > 0:
best_time, best_results = t, (num_sms, nvl_chunk_size)
if local_rank == 0:
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size if nvl_chunk_size else "default"}: '
f'{nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL), avg_t: {t * 1e6:.2f} us', flush=True)
if local_rank == 0:
print(f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL), t: {best_time * 1e6:.2f} us', flush=True)
print('', flush=True)
# Gather the best config from rank 0 and the first test setting
if best_dispatch_results is None:
best_dispatch_results = torch.tensor([best_results[0], best_results[1]], dtype=torch.int32, device='cuda')
all_best_fp8_results_list = [torch.zeros_like(best_dispatch_results) for _ in range(torch.distributed.get_world_size())]
dist.all_gather(all_best_fp8_results_list, best_dispatch_results, group=group)
best_dispatch_results = all_best_fp8_results_list[0].tolist()
dispatch_config = deep_ep.Config(best_dispatch_results[0], best_dispatch_results[1], nvl_buffer_size)
dispatch_args = {'x': x, 'num_tokens_per_rank': num_tokens_per_rank,
'is_token_in_rank': is_token_in_rank, 'num_tokens_per_expert': num_tokens_per_expert,
'config': dispatch_config if dispatch_config is not None else config}
recv_x, _, _, _, handle, _ = buffer.dispatch(**dispatch_args)
# Tune combine performance
best_time, best_results = 1e10, None
for nvl_chunk_size in tuple(range(1, 17, 1)) + (0, ):
if nvl_chunk_size > 0:
config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size)
else:
# Test default config as well
deep_ep.Buffer.set_num_sms(num_sms)
config = deep_ep.Buffer.get_combine_config(num_ranks)
tune_args = {'x': recv_x, 'handle': handle, 'config': config}
t = bench(lambda: buffer.combine(**tune_args))[0]
if local_rank == 0:
print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size if nvl_chunk_size else "default"}: '
f'{combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL), avg_t: {t * 1e6:.2f} us', flush=True)
if t < best_time and nvl_chunk_size > 0:
best_time, best_results = t, (num_sms, nvl_chunk_size)
if local_rank == 0:
print(f'[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}: {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL), t: {best_time * 1e6:.2f} us', flush=True)
print('', flush=True)
# noinspection PyUnboundLocalVariable
def test_loop(local_rank: int, num_local_ranks: int):
rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
test_ll_compatibility, num_rdma_bytes = False, 0
if test_ll_compatibility:
ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(ll_num_tokens, ll_hidden, num_ranks, ll_num_experts)
buffer = deep_ep.Buffer(group, int(2e9), num_rdma_bytes, low_latency_mode=test_ll_compatibility,
num_qps_per_rank=(ll_num_experts // num_ranks if test_ll_compatibility else 1))
torch.manual_seed(rank)
for i in (24, ):
test_main(i, local_rank, num_ranks, rank, buffer, group)
if local_rank == 0:
print('', flush=True)
# Test compatibility with low latency functions
if test_ll_compatibility:
buffer.clean_low_latency_buffer(ll_num_tokens, ll_hidden, ll_num_experts)
test_low_latency.test_main(ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk, rank, num_ranks, group, buffer, seed=1)
# Destroy the communication group
dist.barrier()
dist.destroy_process_group()
if __name__ == '__main__':
num_processes = 8
torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)

View File

@ -0,0 +1,187 @@
import random
import torch
import torch.distributed as dist
from functools import partial
import deep_ep
from utils import init_dist, bench, bench_kineto, calc_diff, hash_tensor, per_token_cast_back
def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
rank: int, num_ranks: int, group: dist.ProcessGroup, buffer: deep_ep.Buffer, seed: int = 0):
torch.manual_seed(seed + rank)
random.seed(seed + rank)
assert num_experts % num_ranks == 0
num_local_experts = num_experts // num_ranks
# NOTES: integers greater than 256 exceed the BF16 precision limit
rank_offset = 128
assert num_ranks - rank_offset < 257, 'Too many ranks (exceeding test precision limit)'
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * (rank - rank_offset)
x[:, -128:] = torch.arange(num_tokens, device='cuda').to(torch.bfloat16).view(-1, 1)
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=True)[1]
topk_weights = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda').abs()
# Randomly mask some positions
for i in range(10):
topk_idx[random.randint(0, num_tokens - 1), random.randint(0, num_topk - 1)] = -1
# Check dispatch correctness
do_check = True
hash_value, num_times = 0, 0
for return_recv_hook in (False, True):
for dispatch_use_fp8 in (False, True):
for round_scale in (False, True) if dispatch_use_fp8 else (False, ):
for use_ue8m0 in (False, True) if round_scale else (False, ):
num_times += 1
for i in range((num_times % 2) + 1):
cumulative_local_expert_recv_stats = torch.zeros((num_local_experts, ), dtype=torch.int, device='cuda')
packed_recv_x, packed_recv_count, handle, event, hook = \
buffer.low_latency_dispatch(x, topk_idx, num_tokens, num_experts,
use_fp8=dispatch_use_fp8, round_scale=round_scale, use_ue8m0=use_ue8m0,
cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
async_finish=not return_recv_hook, return_recv_hook=return_recv_hook)
hook() if return_recv_hook else event.current_stream_wait()
packed_recv_x = (packed_recv_x[0], packed_recv_x[1].contiguous()) if dispatch_use_fp8 else packed_recv_x
simulated_gemm_x = per_token_cast_back(packed_recv_x[0].view(-1, hidden), packed_recv_x[1].view(-1, hidden // 128)).view(packed_recv_x[0].shape) \
if dispatch_use_fp8 else packed_recv_x.clone()
all_topk_idx = torch.empty((num_ranks, num_tokens, num_topk), dtype=topk_idx.dtype, device='cuda')
dist.all_gather_into_tensor(all_topk_idx, topk_idx, group=group)
for i in range(num_local_experts if do_check else 0):
expert_id = rank * num_local_experts + i
recv_x = per_token_cast_back(packed_recv_x[0][i], packed_recv_x[1][i]) if dispatch_use_fp8 else packed_recv_x[i]
recv_count, recv_src_info, recv_layout_range = packed_recv_count[i], handle[0][i], handle[1][i]
# Check expert indices
int_mask = (2 ** 32) - 1
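# `recv_layout_range` packs, for each source rank, the begin index in the high 32 bits and the token count in the low 32 bits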
num_valid_tokens = recv_count.item()
assert cumulative_local_expert_recv_stats[i].item() == num_valid_tokens, f'{cumulative_local_expert_recv_stats[i].item()} != {num_valid_tokens}'
assert num_valid_tokens == (recv_layout_range & int_mask).sum().item(), f'{num_valid_tokens} != {(recv_layout_range & int_mask).sum().item()}'
assert num_valid_tokens == (all_topk_idx == expert_id).sum().item(), f'{num_valid_tokens} != {(all_topk_idx == expert_id).sum().item()}'
# Check received data
recv_x = recv_x[:num_valid_tokens]
recv_x_amin = recv_x[:, :-128].amin(dim=-1)
recv_src_info = recv_src_info[:num_valid_tokens]
assert torch.equal(recv_x_amin, recv_x[:, :-128].amax(dim=-1))
if round_scale:
assert calc_diff(recv_x[:, -1], recv_src_info.view(-1)) < 0.007
else:
assert (recv_x[:, -128:] - recv_src_info.view(-1, 1) % num_tokens).sum().item() == 0
for j in range(num_ranks):
begin_idx, count = (recv_layout_range[j] >> 32).item(), (recv_layout_range[j] & int_mask).item()
if not round_scale:
assert (recv_x_amin == j - rank_offset).sum().item() == (all_topk_idx[j] == expert_id).sum().item()
assert (recv_x[begin_idx:begin_idx + count][:-128] - j).sum().item() == 0
if dispatch_use_fp8:
hash_value ^= hash_tensor(packed_recv_x[0][i, :num_valid_tokens])
hash_value ^= hash_tensor(packed_recv_x[1][i, :num_valid_tokens])
else:
hash_value ^= hash_tensor(packed_recv_x[i, :num_valid_tokens])
# Check combine correctness
for zero_copy in (False, True):
if zero_copy:
buffer.get_next_low_latency_combine_buffer(handle)[:, :, :] = simulated_gemm_x
out = torch.empty((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x, topk_idx, topk_weights, handle,
async_finish=not return_recv_hook, zero_copy=zero_copy,
return_recv_hook=return_recv_hook, out=out)
hook() if return_recv_hook else event.current_stream_wait()
if do_check:
diff = calc_diff(x * topk_weights.masked_fill(topk_idx == -1, 0).sum(dim=1).view(-1, 1), combined_x)
assert torch.isnan(combined_x).sum().item() == 0
assert diff < (7e-4 if round_scale else 1e-5), f'Error: {diff=}, {zero_copy=}'
hash_value ^= hash_tensor(combined_x)
def create_test_cast_with_outliers(num_outliers):
tmp = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
tmp /= tmp.abs().amax(dim=1).view(-1, 1)
assert tmp.abs().amax().item() <= 1
# Create some amax outliers
for i in range(num_outliers):
tmp[random.randint(0, num_tokens - 1)] *= 1e3
return tmp
# noinspection PyShadowingNames
def large_gemm_with_hook(hook):
mat_0 = torch.randn((8192, 8192), dtype=torch.float)
mat_1 = torch.randn((8192, 8192), dtype=torch.float)
mat_0 @ mat_1
hook()
# noinspection PyShadowingNames
def test_func(zero_copy: bool, return_recv_hook: bool):
recv_x, recv_count, handle, event, hook = \
buffer.low_latency_dispatch(x, topk_idx, num_tokens, num_experts,
cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
use_fp8=True, async_finish=False, return_recv_hook=return_recv_hook)
large_gemm_with_hook(hook) if return_recv_hook else None
if zero_copy:
buffer.get_next_low_latency_combine_buffer(handle)[:, :, :] = simulated_gemm_x
combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x, topk_idx, topk_weights, handle,
zero_copy=zero_copy, return_recv_hook=return_recv_hook)
large_gemm_with_hook(hook) if return_recv_hook else None
# Calculate bandwidth
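# Per-token FP8 payload: 1 byte per element, a 4-byte scale per 128 elements, plus 16 extra bytes (presumably per-token metadata); BF16 tokens take 2 bytes per element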
num_fp8_bytes, num_bf16_bytes = (hidden + hidden / 128 * 4 + 16), hidden * 2
num_dispatch_comm_bytes, num_combine_comm_bytes = 0, 0
for i in range(num_tokens):
num_selections = (topk_idx[i] != -1).sum().item()
num_dispatch_comm_bytes += num_fp8_bytes * num_selections
num_combine_comm_bytes += num_bf16_bytes * num_selections
# Dispatch + combine testing
avg_t, min_t, max_t = bench(partial(test_func, zero_copy=False, return_recv_hook=False))
print(f'[rank {rank}] Dispatch + combine bandwidth: {(num_dispatch_comm_bytes + num_combine_comm_bytes) / 1e9 / avg_t:.2f} GB/s, '
f'avg_t={avg_t * 1e6:.2f} us, min_t={min_t * 1e6:.2f} us, max_t={max_t * 1e6:.2f} us', flush=True)
# Separate profiling
for return_recv_hook in (False, True):
group.barrier()
dispatch_t, combine_t = bench_kineto(partial(test_func, zero_copy=True, return_recv_hook=return_recv_hook),
kernel_names=('dispatch', 'combine'), barrier_comm_profiling=True,
suppress_kineto_output=True)
if not return_recv_hook:
print(f'[rank {rank}] Dispatch bandwidth: {num_dispatch_comm_bytes / 1e9 / dispatch_t:.2f} GB/s, avg_t={dispatch_t * 1e6:.2f} us | '
f'Combine bandwidth: {num_combine_comm_bytes / 1e9 / combine_t:.2f} GB/s, avg_t={combine_t * 1e6:.2f} us', flush=True)
else:
print(f'[rank {rank}] Dispatch send/recv time: {dispatch_t * 2 * 1e6:.2f} us | '
f'Combine send/recv time: {combine_t * 2 * 1e6:.2f} us', flush=True)
return hash_value
# noinspection PyUnboundLocalVariable
def test_loop(local_rank: int, num_local_ranks: int):
rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
num_tokens, hidden, num_topk, num_experts = 128, 7168, 8, 288
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(num_tokens, hidden, num_ranks, num_experts)
if local_rank == 0:
print(f'Allocating buffer size: {num_rdma_bytes / 1e6} MB ...', flush=True)
buffer = deep_ep.Buffer(group, num_rdma_bytes=num_rdma_bytes, low_latency_mode=True,
num_qps_per_rank=num_experts // num_ranks)
test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, seed=1)
do_pressure_test = False
for seed in range(int(1e9) if do_pressure_test else 0):
if local_rank == 0:
print(f'Testing with seed {seed} ...', flush=True)
ref_hash = test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, seed=seed)
for i in range(20):
assert test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, seed=seed) == ref_hash, f'Error: seed={seed}'
# Destroy the communication group
dist.barrier()
dist.destroy_process_group()
if __name__ == '__main__':
# TODO: you may modify NUMA binding for less CPU overhead
num_processes = 8
torch.multiprocessing.spawn(test_loop, args=(num_processes,), nprocs=num_processes)

201
DeepEP/tests/utils.py Normal file
View File

@ -0,0 +1,201 @@
import inspect
import numpy as np
import os
import sys
import torch
import torch.distributed as dist
from typing import Optional
def init_dist(local_rank: int, num_local_ranks: int):
# NOTES: you may rewrite this function with your own cluster settings
ip = os.getenv('MASTER_ADDR', '127.0.0.1')
port = int(os.getenv('MASTER_PORT', '8361'))
num_nodes = int(os.getenv('WORLD_SIZE', 1))
node_rank = int(os.getenv('RANK', 0))
assert (num_local_ranks < 8 and num_nodes == 1) or num_local_ranks == 8
sig = inspect.signature(dist.init_process_group)
params = {
'backend': 'nccl',
'init_method': f'tcp://{ip}:{port}',
'world_size': num_nodes * num_local_ranks,
'rank': node_rank * num_local_ranks + local_rank,
}
if 'device_id' in sig.parameters:
# noinspection PyTypeChecker
params['device_id'] = torch.device(f'cuda:{local_rank}')
dist.init_process_group(**params)
torch.set_default_dtype(torch.bfloat16)
torch.set_default_device('cuda')
torch.cuda.set_device(local_rank)
return dist.get_rank(), dist.get_world_size(), dist.new_group(list(range(num_local_ranks * num_nodes)))
def calc_diff(x: torch.Tensor, y: torch.Tensor):
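# Symmetric relative error: 1 - 2<x, y> / (||x||^2 + ||y||^2), which is 0 when the tensors match exactly; the +1 shift below avoids an all-zero denominator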
x, y = x.double() + 1, y.double() + 1
denominator = (x * x + y * y).sum()
sim = 2 * (x * y).sum() / denominator
return (1 - sim).item()
def per_token_cast_to_fp8(x: torch.Tensor):
assert x.dim() == 2 and x.size(1) % 128 == 0
m, n = x.shape
x_view = x.view(m, -1, 128)
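# Scale each 128-element group so that its max magnitude maps to 448, the largest value representable in float8_e4m3fn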
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
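# Integer scales are treated as UE8M0 exponents: shifting them into the float32 exponent field (bit 23) rebuilds the power-of-two scale factor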
if x_scales.dtype == torch.int:
x_scales = x_scales.view(dtype=torch.int8).to(torch.int) << 23
x_scales = x_scales.view(dtype=torch.float)
x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
x_scales = x_scales.view(x_fp8.size(0), -1, 1)
return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
def inplace_unique(x: torch.Tensor, num_slots: int):
assert x.dim() == 2
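# Deduplicate the indices of each row in place: bucket-count every slot, keep each distinct index once (sorted in descending order) and pad the remainder with -1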
mask = x < 0
x_padded = x.masked_fill(mask, num_slots)
bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
bin_count = bin_count[:, :num_slots]
sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
x[:, :].fill_(-1)
valid_len = min(num_slots, x.size(1))
x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int):
num_tokens, num_experts = scores.shape
scores = scores.view(num_tokens, num_groups, -1)
mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
return (scores * mask).view(num_tokens, num_experts)
def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None):
# Flush L2 cache with 256 MB data
torch.cuda.synchronize()
cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
# Warmup
for _ in range(num_warmups):
fn()
# Flush L2
cache.zero_()
# Testing
start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
for i in range(num_tests):
# Record
start_events[i].record()
fn()
end_events[i].record()
if post_fn is not None:
post_fn()
torch.cuda.synchronize()
times = np.array([s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)])[1:]
return np.average(times), np.min(times), np.max(times)
class empty_suppress:
def __enter__(self):
return self
def __exit__(self, *_):
pass
class suppress_stdout_stderr:
def __enter__(self):
self.outnull_file = open(os.devnull, 'w')
self.errnull_file = open(os.devnull, 'w')
self.old_stdout_fileno_undup = sys.stdout.fileno()
self.old_stderr_fileno_undup = sys.stderr.fileno()
self.old_stdout_fileno = os.dup(sys.stdout.fileno())
self.old_stderr_fileno = os.dup(sys.stderr.fileno())
self.old_stdout = sys.stdout
self.old_stderr = sys.stderr
os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
sys.stdout = self.outnull_file
sys.stderr = self.errnull_file
return self
def __exit__(self, *_):
sys.stdout = self.old_stdout
sys.stderr = self.old_stderr
os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
os.close(self.old_stdout_fileno)
os.close(self.old_stderr_fileno)
self.outnull_file.close()
self.errnull_file.close()
def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output: bool = False,
trace_path: Optional[str] = None, barrier_comm_profiling: bool = False):
# Profile
suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
with suppress():
schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) as prof:
for i in range(2):
# NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
if barrier_comm_profiling:
lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
lhs @ rhs
dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
for _ in range(num_tests):
fn()
prof.step()
# Parse the profiling table
assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
is_tupled = isinstance(kernel_names, tuple)
prof_lines = prof.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
assert all([isinstance(name, str) for name in kernel_names])
for name in kernel_names:
assert sum([name in line for line in prof_lines]) == 1, f'Errors of the kernel {name} in the profiling table'
# Save chrome traces
if trace_path is not None:
prof.export_chrome_trace(trace_path)
# Return average kernel times
units = {'ms': 1e3, 'us': 1e6}
kernel_times = []
for name in kernel_names:
for line in prof_lines:
if name in line:
time_str = line.split()[-2]
for unit, scale in units.items():
if unit in time_str:
kernel_times.append(float(time_str.replace(unit, '')) / scale)
break
break
return tuple(kernel_times) if is_tupled else kernel_times[0]
def hash_tensor(t: torch.Tensor):
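# Cheap order-insensitive checksum: reinterpret the raw bytes as int64 and sum them (used for the tests' reproducibility checks)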
return t.view(torch.int64).sum().item()

89
DeepEP/third-party/README.md vendored Normal file
View File

@ -0,0 +1,89 @@
# Install NVSHMEM
## Important notices
**This project is neither sponsored nor supported by NVIDIA.**
**Use of NVIDIA NVSHMEM is governed by the terms at [NVSHMEM Software License Agreement](https://docs.nvidia.com/nvshmem/api/sla.html).**
## Prerequisites
Hardware requirements:
- GPUs inside one node need to be connected by NVLink
- GPUs across different nodes need to be connected by RDMA devices, see [GPUDirect RDMA Documentation](https://docs.nvidia.com/cuda/gpudirect-rdma/)
- InfiniBand GPUDirect Async (IBGDA) support, see [IBGDA Overview](https://developer.nvidia.com/blog/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async/)
- For more detailed requirements, see [NVSHMEM Hardware Specifications](https://docs.nvidia.com/nvshmem/release-notes-install-guide/install-guide/abstract.html#hardware-requirements)
## Installation procedure
### 1. Acquiring NVSHMEM source code
Download NVSHMEM v3.2.5 from the [NVIDIA NVSHMEM OPEN SOURCE PACKAGES](https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz).
### 2. Apply our custom patch
Navigate to your NVSHMEM source directory and apply our provided patch:
```bash
git apply /path/to/deep_ep/dir/third-party/nvshmem.patch
```
### 3. Configure the NVIDIA driver (required for inter-node communication)
Enable IBGDA by modifying `/etc/modprobe.d/nvidia.conf`:
```bash
options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"
```
Update kernel configuration:
```bash
sudo update-initramfs -u
sudo reboot
```
For more detailed configurations, please refer to the [NVSHMEM Installation Guide](https://docs.nvidia.com/nvshmem/release-notes-install-guide/install-guide/abstract.html).
### 4. Build and installation
DeepEP uses NVLink for intra-node communication and IBGDA for inter-node communication. All other features are disabled to reduce dependencies.
```bash
export CUDA_HOME=/path/to/cuda
# disable all features except IBGDA
export NVSHMEM_IBGDA_SUPPORT=1
export NVSHMEM_SHMEM_SUPPORT=0
export NVSHMEM_UCX_SUPPORT=0
export NVSHMEM_USE_NCCL=0
export NVSHMEM_PMIX_SUPPORT=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
export NVSHMEM_USE_GDRCOPY=0
export NVSHMEM_IBRC_SUPPORT=0
export NVSHMEM_BUILD_TESTS=0
export NVSHMEM_BUILD_EXAMPLES=0
export NVSHMEM_MPI_SUPPORT=0
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
export NVSHMEM_BUILD_TXZ_PACKAGE=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
cmake -G Ninja -S . -B build -DCMAKE_INSTALL_PREFIX=/path/to/your/dir/to/install
cmake --build build/ --target install
```
## Post-installation configuration
Set environment variables in your shell configuration:
```bash
export NVSHMEM_DIR=/path/to/your/dir/to/install # Use for DeepEP installation
export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"
export PATH="${NVSHMEM_DIR}/bin:$PATH"
```
## Verification
```bash
nvshmem-info -a # Should display details of nvshmem
```

474
DeepEP/third-party/nvshmem.patch vendored Normal file
View File

@ -0,0 +1,474 @@
From 9e6cc27cceb3130784e4ea7b61ea3171156365fd Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Fri, 20 Dec 2024 10:57:12 +0800
Subject: [PATCH 1/4] Change QP creating order.
---
src/modules/transport/ibgda/ibgda.cpp | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index ef325cd..286132e 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -2936,17 +2936,20 @@ int nvshmemt_ibgda_connect_endpoints(nvshmem_transport_t t, int *selected_dev_id
INFO(ibgda_state->log_level, "Creating %d RC QPs", device->rc.num_eps_per_pe);
for (int i = 0; i < num_rc_eps; ++i) {
// Do not create loopback to self
- if (i / device->rc.num_eps_per_pe == mype) {
+ int dst_pe = (i + 1 + mype) % n_pes;
+ int offset = i / n_pes;
+ int mapped_i = dst_pe * device->rc.num_eps_per_pe + offset;
+ if (dst_pe == mype) {
continue;
}
- status = ibgda_create_qp(&device->rc.eps[i], device, portid, i,
+ status = ibgda_create_qp(&device->rc.eps[mapped_i], device, portid, mapped_i,
NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
- "ibgda_create_dci failed on RC #%d.", i);
+ "ibgda_create_dci failed on RC #%d.", mapped_i);
- status = ibgda_get_rc_handle(&local_rc_handles[i], device->rc.eps[i], device);
+ status = ibgda_get_rc_handle(&local_rc_handles[mapped_i], device->rc.eps[mapped_i], device);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
- "ibgda_get_rc_handle failed on RC #%d.", i);
+ "ibgda_get_rc_handle failed on RC #%d.", mapped_i);
}
if (num_rc_eps) {
--
2.25.1
From b11d41e4f3727f2f6ccc00a8c852e59e2ee33c8a Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Fri, 10 Jan 2025 11:53:38 +0800
Subject: [PATCH 2/4] Add recv queue and recv cq for rc qps.
Let the ibgda rc qps use regular recv queue.
Add recv queue to ibgda dev qp.
IBGDA create recv cq
Setup recv cq.
fix recv queue.
Remove some useless idx.
Longer recv queue.
---
.../nvshmem_common_ibgda.h | 19 +++++-
src/modules/transport/ibgda/ibgda.cpp | 65 ++++++++++++++++---
2 files changed, 71 insertions(+), 13 deletions(-)
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index 8b8a263..1be3dec 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -168,14 +168,17 @@ typedef struct {
uint64_t get_head; // last wqe idx + 1 with a "fetch" operation (g, get, amo_fetch)
uint64_t get_tail; // last wqe idx + 1 polled with cst; get_tail > get_head is possible
} tx_wq;
+ struct {
+ uint64_t resv_head; // last reserved wqe idx + 1
+ } rx_wq;
struct {
uint64_t head;
uint64_t tail;
} ibuf;
char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
} __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 96,
- "ibgda_device_qp_management_v1 must be 96 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
+ "ibgda_device_qp_management_v1 must be 104 bytes.");
typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
@@ -199,9 +202,19 @@ typedef struct nvshmemi_ibgda_device_qp {
// May point to mvars.prod_idx or internal prod_idx
uint64_t *prod_idx;
} tx_wq;
+ struct {
+ uint16_t nwqes;
+ uint64_t tail;
+ void *wqe;
+ __be32 *dbrec;
+ void *bf;
+ nvshmemi_ibgda_device_cq_t *cq;
+ // May point to mvars.prod_idx or internal prod_idx
+ uint64_t *prod_idx;
+ } rx_wq;
nvshmemi_ibgda_device_qp_management_v1 mvars; // management variables
} nvshmemi_ibgda_device_qp_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 184, "ibgda_device_qp_v1 must be 184 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");
typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index 286132e..e0b2d5c 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -198,6 +198,7 @@ struct ibgda_ep {
off_t dbr_offset;
struct ibgda_cq *send_cq;
+ struct ibgda_cq *recv_cq;
struct ibv_ah *ah;
uint32_t user_index;
@@ -1538,7 +1539,8 @@ static int ibgda_create_cq_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
struct ibv_context *context = device->context;
- unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes;
+ // Each RC qp has one send CQ and one recv CQ.
+ unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes * 2;
assert(ibgda_qp_depth > 0);
size_t num_cqe = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
@@ -1701,7 +1703,8 @@ static int ibgda_create_qp_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
}
// Allocate and map WQ buffer for all QPs.
- wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB; // num_wqebb is always a power of 2
+ // Todo: reduce the size of wq buffer.
+ wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB * 2; // num_wqebb is always a power of 2
wq_buf_size = wq_buf_size_per_qp * num_eps;
status = ibgda_nic_control_alloc(&wq_mobject, wq_buf_size, IBGDA_GPAGE_SIZE);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "cannot allocate wq buf.\n");
@@ -1882,8 +1885,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
int cqe_version = 0;
struct ibgda_cq *send_cq = NULL;
+ struct ibgda_cq *recv_cq = NULL;
size_t num_wqebb = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
+ size_t num_recv_wqe = ibgda_qp_depth;
+ size_t recv_wqe_size = 16;
int status = 0;
@@ -1911,6 +1917,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
status = ibgda_create_cq(&send_cq, device);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
+ if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+ status = ibgda_create_cq(&recv_cq, device);
+ NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
+ }
+
ep = (struct ibgda_ep *)calloc(1, sizeof(struct ibgda_ep));
NVSHMEMI_NULL_ERROR_JMP(ep, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out,
"Unable to allocate mem for ep.\n");
@@ -1939,12 +1950,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
DEVX_SET(qpc, qp_context, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
DEVX_SET(qpc, qp_context, pd, device->qp_shared_object.pdn);
DEVX_SET(qpc, qp_context, uar_page, uar_mobject->uar->page_id); // BF register
- DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE); // Shared Receive Queue
- DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
DEVX_SET(qpc, qp_context, cqn_snd, send_cq->cqn);
- DEVX_SET(qpc, qp_context, cqn_rcv, device->qp_shared_object.rcqn);
+ DEVX_SET(qpc, qp_context, cqn_rcv, qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC ? recv_cq->cqn : device->qp_shared_object.rcqn);
DEVX_SET(qpc, qp_context, log_sq_size, IBGDA_ILOG2_OR0(num_wqebb));
- DEVX_SET(qpc, qp_context, log_rq_size, 0);
DEVX_SET(qpc, qp_context, cs_req, 0); // Disable CS Request
DEVX_SET(qpc, qp_context, cs_res, 0); // Disable CS Response
DEVX_SET(qpc, qp_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE); // Enable dbr_umem_id
@@ -1953,6 +1961,15 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
DEVX_SET(qpc, qp_context, dbr_umem_id, dbr_umem->umem_id); // DBR buffer
DEVX_SET(qpc, qp_context, user_index, qp_idx);
DEVX_SET(qpc, qp_context, page_offset, 0);
+ if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC){
+ DEVX_SET(qpc, qp_context, rq_type, 0); // Regular recv queue
+ DEVX_SET(qpc, qp_context, log_rq_size, IBGDA_ILOG2(num_recv_wqe)); // 4 wqe
+ DEVX_SET(qpc, qp_context, log_rq_stride, IBGDA_ILOG2(recv_wqe_size) - 4); // max recv wqe size = 16B
+ } else {
+ DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE); // Shared Receive Queue, DC must use this.
+ DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
+ DEVX_SET(qpc, qp_context, log_rq_size, 0);
+ }
ep->devx_qp = mlx5dv_devx_obj_create(context, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out));
NVSHMEMI_NULL_ERROR_JMP(ep->devx_qp, status, NVSHMEMX_ERROR_INTERNAL, out,
@@ -1962,9 +1979,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
ep->portid = portid;
ep->sq_cnt = num_wqebb;
- ep->sq_buf_offset = 0;
+ ep->sq_buf_offset = num_recv_wqe * recv_wqe_size;
- ep->rq_cnt = 0;
+ ep->rq_cnt = num_recv_wqe;
ep->rq_buf_offset = 0;
ep->wq_mobject = device->qp_shared_object.wq_mobject;
@@ -1978,6 +1995,7 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
ep->uar_mobject = uar_mobject;
ep->send_cq = send_cq;
+ ep->recv_cq = recv_cq;
ep->qp_type = qp_type;
@@ -1989,6 +2007,7 @@ out:
if (status) {
if (uar_mobject) ibgda_unmap_and_free_qp_uar(uar_mobject);
if (send_cq) ibgda_destroy_cq(send_cq);
+ if (recv_cq) ibgda_destroy_cq(recv_cq);
if (ep) free(ep);
}
@@ -2287,6 +2306,10 @@ static int ibgda_destroy_ep(struct ibgda_ep *ep) {
ibgda_destroy_cq(ep->send_cq);
}
+ if (ep->recv_cq) {
+ ibgda_destroy_cq(ep->recv_cq);
+ }
+
if (ep->ah) {
ftable.destroy_ah(ep->ah);
}
@@ -2318,7 +2341,7 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
dev_qp->qpn = ep->qpn;
assert(ep->wq_mobject->has_gpu_mapping);
- dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset);
+ dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->sq_buf_offset);
if (ibgda_nic_handler == IBGDA_NIC_HANDLER_GPU) {
assert(ep->dbr_mobject->has_gpu_mapping);
@@ -2330,6 +2353,12 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
}
dev_qp->tx_wq.nwqes = ep->sq_cnt;
+ if (ep->qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+ dev_qp->rx_wq.nwqes = ep->rq_cnt;
+ dev_qp->rx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->rq_buf_offset);
+ dev_qp->rx_wq.dbrec = (__be32 *)((uintptr_t)ep->dbr_mobject->aligned.gpu_ptr + ep->dbr_offset);
+ dev_qp->rx_wq.bf = (void *)ep->uar_mobject->aligned.gpu_ptr;
+ }
ibuf_dci_start = (uintptr_t)device->qp_shared_object.internal_buf.mem_object->aligned.gpu_ptr;
ibuf_rc_start = ibuf_dci_start + (size_per_dci * device->dci.num_eps);
@@ -2379,6 +2408,9 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
nvshmemi_ibgda_device_cq_t *cq_d = NULL;
nvshmemi_ibgda_device_cq_t *cq_h = NULL;
+ nvshmemi_ibgda_device_cq_t *recv_cq_d = NULL;
+ nvshmemi_ibgda_device_cq_t *recv_cq_h = NULL;
+
uint8_t *qp_group_switches_d = NULL;
const size_t mvars_offset = offsetof(nvshmemi_ibgda_device_qp_t, mvars);
@@ -2386,6 +2418,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
const size_t cons_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.cons_idx);
const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
+ const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
@@ -2421,7 +2454,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
num_dct_handles += device->dct.num_eps * n_pes;
num_dci_handles += device->dci.num_eps;
num_rc_handles += device->rc.num_eps_per_pe * n_pes;
- num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1));
+ num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1) * 2);
num_shared_dci_handles += device->dci.num_shared_eps;
}
assert(num_dci_handles - num_shared_dci_handles >= 0);
@@ -2456,6 +2489,10 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
for (int i = 0; i < num_cq_handles; i++) {
nvshmemi_init_ibgda_device_cq(cq_h[i]);
}
+
+ recv_cq_h = (nvshmemi_ibgda_device_cq_t *)calloc(1, sizeof(*recv_cq_h));
+ NVSHMEMI_NULL_ERROR_JMP(recv_cq_h, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out, "recv_cq calloc err.");
+ nvshmemi_init_ibgda_device_cq(recv_cq_h[0]);
/* allocate host memory for dct, rc, cq, dci end */
/* allocate device memory for dct, rc, cq, dci start */
@@ -2559,6 +2596,14 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
}
++cq_idx;
+
+ rc_h[arr_idx].rx_wq.cq = &cq_d[cq_idx];
+
+ ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
+ cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
+ cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
+ cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
+ ++cq_idx;
}
}
}
--
2.25.1
From af479f9f23103d4a1579fae38676d6b3022df887 Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Sat, 8 Feb 2025 18:02:39 +0800
Subject: [PATCH 3/4] Maintain recv queue's cons_idx.
---
src/include/device_host_transport/nvshmem_common_ibgda.h | 5 +++--
src/modules/transport/ibgda/ibgda.cpp | 6 ++++--
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index 1be3dec..ea1e284 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -170,6 +170,7 @@ typedef struct {
} tx_wq;
struct {
uint64_t resv_head; // last reserved wqe idx + 1
+ uint64_t cons_idx; // polled wqe idx + 1 (consumer index + 1)
} rx_wq;
struct {
uint64_t head;
@@ -177,7 +178,7 @@ typedef struct {
} ibuf;
char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
} __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
- "ibgda_device_qp_management_v1 must be 104 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 112,
+ "ibgda_device_qp_management_v1 must be 112 bytes.");
typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
@@ -214,7 +215,7 @@ typedef struct nvshmemi_ibgda_device_qp {
} rx_wq;
nvshmemi_ibgda_device_qp_management_v1 mvars; // management variables
} nvshmemi_ibgda_device_qp_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 256, "ibgda_device_qp_v1 must be 256 bytes.");
typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index e0b2d5c..bc339c5 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -1067,7 +1067,7 @@ static inline void ibgda_nic_control_free(struct ibgda_mem_object *mobject) {
ibgda_host_mem_free(mobject);
}
-static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device) {
+static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device, int cc = 1) {
int status = 0;
struct ibgda_cq *gcq = NULL;
@@ -1118,7 +1118,7 @@ static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device)
cq_context = DEVX_ADDR_OF(create_cq_in, cmd_in, cq_context);
DEVX_SET(cqc, cq_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE);
DEVX_SET(cqc, cq_context, cqe_sz, MLX5_CQE_SIZE_64B);
- DEVX_SET(cqc, cq_context, cc, 0x1); // Use collapsed CQ
+ DEVX_SET(cqc, cq_context, cc, cc); // Use collapsed CQ
DEVX_SET(cqc, cq_context, oi, 0x1); // Allow overrun
DEVX_SET(cqc, cq_context, dbr_umem_id, dbr_umem->umem_id);
DEVX_SET(cqc, cq_context, log_cq_size, IBGDA_ILOG2_OR0(num_cqe));
@@ -2419,6 +2419,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
+ const size_t rx_cons_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.cons_idx);
nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
@@ -2601,6 +2602,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
+ cq_h[cq_idx].cons_idx = (uint64_t *)(base_mvars_d_addr + rx_cons_offset);
cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
++cq_idx;
--
2.25.1
From e0ba3fa21b4b633b481c6684c3ad04f2670c8df4 Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Tue, 11 Feb 2025 11:00:57 +0800
Subject: [PATCH 4/4] Init rx_wq counters.
---
src/include/device_host_transport/nvshmem_common_ibgda.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index ea1e284..e6640d6 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -46,6 +46,8 @@
qp_man.tx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.tx_wq.get_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.tx_wq.get_tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
+ qp_man.rx_wq.resv_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
+ qp_man.rx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.ibuf.head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.ibuf.tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
} while (0);
--
2.25.1
diff --git a/src/modules/transport/common/transport_ib_common.cpp b/src/modules/transport/common/transport_ib_common.cpp
index c89f408..f99018a 100644
--- a/src/modules/transport/common/transport_ib_common.cpp
+++ b/src/modules/transport/common/transport_ib_common.cpp
@@ -26,6 +26,9 @@ int nvshmemt_ib_common_nv_peer_mem_available() {
if (access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == 0) {
return NVSHMEMX_SUCCESS;
}
+ if (access("/sys/module/nvidia_peermem/version", F_OK) == 0) {
+ return NVSHMEMX_SUCCESS;
+ }
return NVSHMEMX_ERROR_INTERNAL;
}
From 099f608fcd9a1d34c866ad75d0af5d02d2020374 Mon Sep 17 00:00:00 2001
From: Kaichao You <youkaichao@gmail.com>
Date: Tue, 10 Jun 2025 00:35:03 -0700
Subject: [PATCH] remove gdrcopy dependency
---
src/modules/transport/ibgda/ibgda.cpp | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index ef325cd..16ee09c 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -406,6 +406,7 @@ static size_t ibgda_get_host_page_size() {
return host_page_size;
}
+#ifdef NVSHMEM_USE_GDRCOPY
int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
nvshmemt_ibgda_state_t *ibgda_state = (nvshmemt_ibgda_state_t *)t->state;
int n_devs_selected = ibgda_state->n_devs_selected;
@@ -459,6 +460,11 @@ int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
}
return 0;
}
+#else
+int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
+ return NVSHMEMX_ERROR_NOT_SUPPORTED;
+}
+#endif
int nvshmemt_ibgda_show_info(struct nvshmem_transport *transport, int style) {
NVSHMEMI_ERROR_PRINT("ibgda show info not implemented");
--
2.34.1

184
Dockerfile Normal file
View File

@ -0,0 +1,184 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install torchvision / flashinfer / sglang against the self-built Torch and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang sources and package them as wheels ──────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ───────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies required by the runtime stage ──────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies needed by the gradio UI ─────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime: minimal runtime image that only installs the prebuilt wheels offline
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended: refresh the linker cache after copying the libraries
RUN ldconfig
# ---- Copy the pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first so it is not overridden by the PyPI version
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (the path can be changed) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

25
gdrcopy/.gitignore vendored Normal file
View File

@ -0,0 +1,25 @@
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
# Editor files
*~
*.swp

108
gdrcopy/CHANGELOG.md Normal file
View File

@ -0,0 +1,108 @@
# Changelog
## [2.4.4] - 2024-12-16
- Fix the use-after-free bug of mr objects in gdrdv\_vma\_close.
- Fix the resource leakage bug in gdrdrv\_release.
## [2.4.3] - 2024-12-02
- Fix NVIDIA\_IS\_OPENSOURCE detection when compile with NVIDIA driver version 545 or newer.
- Fix compile error in gdrdrv when compile on RHEL9.5.
## [2.4.2] - 2024-10-31
- Fix the size alignment bug in gdrdrv.
- Fix memory leak in gdr\_pin\_buffer.
- Add support for another flavor of BF3.
## [2.4.1] - 2023-12-18
- Add support for persistent mapping.
- Fix bug in src/gdrdrv/Makefile.
- Fix compile-time bug when check.h is not found.
## [2.4] - 2023-09-19
- Various bug fixes in the test and benchmark applications.
- Prefix all applications with "gdrcopy\_".
- Introduce more unit tests in gdrcopy\_sanity.
- Introduce gdrcopy\_pplat benchmark application.
- Remove dependency on libcheck and libsubunit
- Introduce gdr\_get\_info\_v2.
- Introduce new copy algorithm for device mappings.
- Add support for NVIDIA BLUEFIELD-3.
- Add support for Linux kernel >= 6.3.
- Add support for SLES and OpenSUSE.
- Add support for systemd service on RHEL9.
- Relicense gdrdrv to Dual MIT/GPL.
- Fix bugs in gdrdrv when pinning two small buffers back-to-back.
- Add support for coherent platforms such as Grace-Hopper.
- Add support for Confidential Computing (CC).
## [2.3.1] - 2023-05-12
- Add a workaround for the GPL-compatibility issue when compile with CONFIG\_ARCH\_HAS\_CC\_PLATFORM on Linux kernel 5.18+.
- Fix error in init.d/gdrcopy due to missing /etc/rc.d/init.d/functions.
## [2.3] - 2021-07-27
- Remove automatically-generated build id links in rpm packages.
- Remove gdrcopy-kmod from the Requires field of the gdrcopy rpm package.
- Remove gdrdrv-dkms dependency enforcement from the gdrcopy deb package.
- Add libsubunit0 to the dependency list of the gdrcopy deb package.
- Add apiperf test.
- Revamp gdrdrv to fix race-condition bugs.
- Add an option to build kmod package.
- Split the gdrcopy deb package into meta, libgdrapi, and tests packages.
- Update the package maintainer.
- Various updates in README.
## [2.2] - 2021-02-01
- Add support for ARM64.
- Update various information on README.
- Improve Makefile.
- Add multi-arch support.
- Handle removal of HAVE\_UNLOCKED\_IOCTL in Linux kernel v5.9 and later.
- Prevent dpkg package creation to unnecessarily compile gdrdrv.
- Improve gdr\_open error message.
- Fix bug that prevents sanity from correctly summarizing failure.
- Add dkms support in kmod package.
- Handle the removal of kzfree in Linux kernel v5.10 and later.
- Improve small-size copy-to-mapping.
## [2.1] - 2020-08-07
- fix build problem on RHL8 kernels
- relax checks in gdrdrv to support multi-threading use cases
- fix fd leak in gdr\_open()
- introduce new copylat test
- remove CUDA RT dependency in tests
- assorted cleanups
## [2.0] - 2019-09-16
- Harden security in gdrdrv.
- Enable cached mappings in POWER9.
- Improve copy performance with unrolling in POWERPC.
- Creates _sanity_ unit test for testing the functionality and security.
- Consolidate _basic_ and _validate_ into _sanity_ unit test.
- Introduce compile time and runtime version checking in _libgdrapi_.
- Improve rpm packaging.
- Introduce deb packaging for the userspace library and the applications.
- Introduce dkms packaging for the _gdrdrv_ driver.
- Rename gdr\_copy\_from/to\_bar to gdr\_copy\_from/to\_mapping.
- Update README
## [1.3] - 2018-07-26
- Add _gdrdrv_ driver for converting cudaMalloc'd addresses to the GPU's BAR1
addresses and exposing them to CPU-accessible virtual addresses.
- Add _libgdrapi_, a user-space library for communicating with the gdrdrv driver.
- Add _basic_ application as an minimal example on how to use gdrcopy.
- Add _copybw_ application as a complete example on how CPU could read/write to
cudaMalloc'd memory via BAR1 mappings.
- Add _validate_ unit test to ensure that gdrcopy functions as expected.
- Add a script for packaging gdrcopy in the rpm format.
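[2.4.4]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.4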
[2.4.3]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.3
[2.4.2]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.2
[2.4.1]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4.1
[2.4]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.4
[2.3.1]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.3.1
[2.3]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.3
[2.2]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.2
[2.1]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.1
[2.0]: https://github.com/NVIDIA/gdrcopy/releases/tag/v2.0
[1.3]: https://github.com/NVIDIA/gdrcopy/releases/tag/v1.3

19
gdrcopy/LICENSE Normal file
View File

@ -0,0 +1,19 @@
Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

94
gdrcopy/Makefile Normal file
View File

@ -0,0 +1,94 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
prefix ?= /usr/local
exec_prefix ?= $(prefix)
libdir ?= $(exec_prefix)/lib
bindir ?= $(exec_prefix)/bin
includedir ?= $(prefix)/include
DESTDIR := $(abspath $(DESTDIR))
DESTLIB = $(DESTDIR)$(libdir)
DESTBIN = $(DESTDIR)$(bindir)
DESTINC = $(DESTDIR)$(includedir)
CUDA ?= /usr/local/cuda
LIB_MAJOR_VER ?= $(shell awk '/\#define GDR_API_MAJOR_VERSION/ { print $$3 }' include/gdrapi.h | tr -d '\n')
LIB_MINOR_VER ?= $(shell awk '/\#define GDR_API_MINOR_VERSION/ { print $$3 }' include/gdrapi.h | tr -d '\n')
GDRAPI_ARCH := $(shell ./config_arch)
GDRAPI_INC := ../include
LIB_VER:=$(LIB_MAJOR_VER).$(LIB_MINOR_VER)
LIB_BASENAME:=libgdrapi.so
LIB_DYNAMIC=$(LIB_BASENAME).$(LIB_VER)
LIB_SONAME=$(LIB_BASENAME).$(LIB_MAJOR_VER)
all: config driver lib exes
version:
@ echo "$(LIB_VER)"
config:
@ echo "GDRAPI_ARCH=$(GDRAPI_ARCH)"
driver:
cd src/gdrdrv && \
$(MAKE) $(MAKE_PARAMS)
lib:
cd src && \
$(MAKE) LIB_MAJOR_VER=$(LIB_MAJOR_VER) LIB_MINOR_VER=$(LIB_MINOR_VER)
exes: lib
cd tests && \
$(MAKE) CUDA=$(CUDA)
install: lib_install exes_install
lib_install: lib
@ echo "installing in $(DESTLIB) $(DESTINC)..." && \
mkdir -p $(DESTLIB) && \
install -D -v -m u=rwx,g=rx,o=rx src/$(LIB_DYNAMIC) -t $(DESTLIB) && \
mkdir -p $(DESTINC) && \
install -D -v -m u=rw,g=rw,o=r include/* -t $(DESTINC); \
cd $(DESTLIB); \
ln -sf $(LIB_DYNAMIC) $(LIB_SONAME); \
ln -sf $(LIB_SONAME) $(LIB_BASENAME);
exes_install: exes
cd tests && $(MAKE) install DESTBIN=$(DESTBIN)
drv_install: driver
cd src/gdrdrv && \
$(MAKE) install
clean:
cd tests && \
$(MAKE) clean
cd src && \
$(MAKE) clean
cd src/gdrdrv && \
$(MAKE) clean
.PHONY: driver clean all lib exes lib_install drv_install exes_install install

495
gdrcopy/README.md Normal file
View File

@ -0,0 +1,495 @@
# GDRCopy
A low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA
technology.
## Introduction
While GPUDirect RDMA is meant for direct access to GPU memory from
third-party devices, it is possible to use these same APIs to create
perfectly valid CPU mappings of the GPU memory.
The advantage of a CPU-driven copy is the very small overhead involved,
which can be useful when low latency is required.
## What is inside
GDRCopy offers the infrastructure to create user-space mappings of GPU memory,
which can then be manipulated as if it were plain host memory (caveats apply
here); a minimal usage sketch is shown at the end of this section.
A simple by-product of it is a copy library with the following characteristics:
- Very low overhead, as it is driven by the CPU. As a reference, a
  cudaMemcpy can currently incur a 6-7us overhead.
- An initial memory *pinning* phase is required, which is potentially expensive,
10us-1ms depending on the buffer size.
- Fast H-D, because of write-combining. H-D bandwidth is 6-8GB/s on Ivy
Bridge Xeon but it is subject to NUMA effects.
- Slow D-H, because the GPU BAR, which backs the mappings, can't be
  prefetched, so burst read transactions are not generated through
  PCIe.
The library comes with a few tests, such as:
- gdrcopy_sanity, which contains unit tests for the library and the driver.
- gdrcopy_copybw, a minimal application which calculates the R/W bandwidth for a specific buffer size.
- gdrcopy_copylat, a benchmark application which calculates the R/W copy latency for a range of buffer sizes.
- gdrcopy_apiperf, an application for benchmarking the latency of each GDRCopy API call.
- gdrcopy_pplat, a benchmark application which calculates the round-trip ping-pong latency between GPU and CPU.
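For illustration, the typical call sequence looks like the following minimal
sketch. It is based on the declarations in `include/gdrapi.h`; error checking
is omitted, and the allocation is assumed to be GPU-page aligned (see
[Restrictions and known issues](#restrictions-and-known-issues)).
```c
#include <cuda.h>       // CUDA driver API
#include <stdio.h>
#include "gdrapi.h"
// Minimal GDRCopy flow: open, pin, map, copy, tear down. Error checking omitted.
int main(void)
{
    const size_t size = GPU_PAGE_SIZE;          // one GPU page (64 KiB)
    CUdevice dev;
    CUcontext ctx;
    CUdeviceptr d_buf;
    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    cuMemAlloc(&d_buf, size);                   // assumed GPU-page aligned; see Restrictions
    gdr_t g = gdr_open();                       // connect to the gdrdrv driver
    gdr_mh_t mh;
    gdr_pin_buffer(g, (unsigned long)d_buf, size, 0, 0, &mh);   // no p2p tokens
    void *map_ptr;
    gdr_map(g, mh, &map_ptr, size);             // user-space (typically write-combined) mapping
    gdr_info_t info;
    gdr_get_info(g, mh, &info);                 // info.va may be aligned down by the driver
    char *buf = (char *)map_ptr + (d_buf - info.va);
    char msg[16] = "hello, GPU";
    gdr_copy_to_mapping(mh, buf, msg, sizeof(msg));     // fast CPU -> GPU write
    gdr_copy_from_mapping(mh, msg, buf, sizeof(msg));   // slower GPU -> CPU read
    printf("read back: %s\n", msg);
    gdr_unmap(g, mh, map_ptr, size);
    gdr_unpin_buffer(g, mh);
    gdr_close(g);
    cuMemFree(d_buf);
    cuCtxDestroy(ctx);
    return 0;
}
```
The file name and build line are illustrative; such a program would be compiled
against `libgdrapi` and the CUDA driver library, e.g. `gcc example.c -lgdrapi -lcuda`.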
## Requirements
GPUDirect RDMA requires NVIDIA Tesla or Quadro class GPUs based on Kepler,
Pascal, Volta, or Turing, see [GPUDirect
RDMA](http://developer.nvidia.com/gpudirect). For more technical information,
please refer to the official GPUDirect RDMA [design
document](http://docs.nvidia.com/cuda/gpudirect-rdma).
The device driver requires GPU display driver >= 418.40 on ppc64le and >= 331.14 on other platforms. The library and tests
require CUDA >= 6.0.
DKMS is a prerequisite for installing the GDRCopy kernel module package. On RHEL
and SLE, however, users have the option of building a kmod package and installing
it instead of the DKMS package. See the [Build and installation](#build-and-installation) section for more details.
```shell
# On RHEL
# dkms can be installed from epel-release. See https://fedoraproject.org/wiki/EPEL.
$ sudo yum install dkms
# On Debian - No additional dependency
# On SLE / Leap
# On SLE dkms can be installed from PackageHub.
$ sudo zypper install dkms rpmbuild
```
CUDA and the GPU display driver must be installed before building and/or installing GDRCopy.
Installation instructions can be found at https://developer.nvidia.com/cuda-downloads.
GPU display driver header files are also required. They are installed as part
of the driver (or CUDA) installation when using the *runfile* installer. If you install the driver
via package management, we suggest:
- On RHEL, `sudo dnf module install nvidia-driver:latest-dkms`.
- On Debian, `sudo apt install nvidia-dkms-<your-nvidia-driver-version>`.
- On SLE, `sudo zypper install nvidia-gfx<your-nvidia-driver-version>-kmp`.
The supported architectures are Linux x86\_64, ppc64le, and arm64. The supported
platforms are RHEL8, RHEL9, Ubuntu20\_04, Ubuntu22\_04,
SLE-15 (any SP) and Leap 15.x.
Root privileges are necessary to load/install the kernel-mode device
driver.
## Build and installation
We provide three ways to build and install GDRCopy.
### rpm package
```shell
# For RHEL:
$ sudo yum groupinstall 'Development Tools'
$ sudo yum install dkms rpm-build make
# For SLE:
$ sudo zypper in dkms rpmbuild
$ cd packages
$ CUDA=<cuda-install-top-dir> ./build-rpm-packages.sh
$ sudo rpm -Uvh gdrcopy-kmod-<version>dkms.noarch.<platform>.rpm
$ sudo rpm -Uvh gdrcopy-<version>.<arch>.<platform>.rpm
$ sudo rpm -Uvh gdrcopy-devel-<version>.noarch.<platform>.rpm
```
The DKMS package is the default kernel module package that `build-rpm-packages.sh`
generates. To create a kmod package instead, pass the `-m` option to the script.
Unlike the DKMS package, the kmod package contains a prebuilt GDRCopy kernel
module which is specific to the NVIDIA driver version and the Linux kernel
version used to build it.
### deb package
```shell
$ sudo apt install build-essential devscripts debhelper fakeroot pkg-config dkms
$ cd packages
$ CUDA=<cuda-install-top-dir> ./build-deb-packages.sh
$ sudo dpkg -i gdrdrv-dkms_<version>_<arch>.<platform>.deb
$ sudo dpkg -i libgdrapi_<version>_<arch>.<platform>.deb
$ sudo dpkg -i gdrcopy-tests_<version>_<arch>.<platform>.deb
$ sudo dpkg -i gdrcopy_<version>_<arch>.<platform>.deb
```
### from source
```shell
$ make prefix=<install-to-this-location> CUDA=<cuda-install-top-dir> all install
$ sudo ./insmod.sh
```
### Notes
Compiling the gdrdrv driver requires the NVIDIA driver source code, which is typically installed at
`/usr/src/nvidia-<version>`. Our make file automatically detects and picks that source code. In case there are multiple
versions installed, it is possible to pass the correct path by defining the NVIDIA_SRC_DIR variable, e.g. `export
NVIDIA_SRC_DIR=/usr/src/nvidia-520.61.05/nvidia` before building the gdrdrv module.
There are two major flavors of NVIDIA driver: 1) proprietary, and 2)
[opensource](https://developer.nvidia.com/blog/nvidia-releases-open-source-gpu-kernel-modules/). We detect the flavor
when compiling gdrdrv based on the source code of the NVIDIA driver. Different flavors come with different features and
restrictions:
- gdrdrv compiled with the opensource flavor will provide functionality and high performance on all platforms. However,
you will not be able to load this gdrdrv driver when the proprietary NVIDIA driver is loaded.
- gdrdrv compiled with the proprietary flavor can always be loaded regardless of the flavor of NVIDIA driver you have
loaded. However, it may have suboptimal performance on coherent platforms such as Grace-Hopper. Functionally, it will not
work correctly on Intel CPUs with Linux kernel built with confidential compute (CC) support, i.e.
`CONFIG_ARCH_HAS_CC_PLATFORM=y`, *WHEN* CC is enabled at runtime.
## Tests
Execute provided tests:
```shell
$ gdrcopy_sanity
Total: 28, Passed: 28, Failed: 0, Waived: 0
List of passed tests:
basic_child_thread_pins_buffer_cumemalloc
basic_child_thread_pins_buffer_vmmalloc
basic_cumemalloc
basic_small_buffers_mapping
basic_unaligned_mapping
basic_vmmalloc
basic_with_tokens
data_validation_cumemalloc
data_validation_vmmalloc
invalidation_access_after_free_cumemalloc
invalidation_access_after_free_vmmalloc
invalidation_access_after_gdr_close_cumemalloc
invalidation_access_after_gdr_close_vmmalloc
invalidation_fork_access_after_free_cumemalloc
invalidation_fork_access_after_free_vmmalloc
invalidation_fork_after_gdr_map_cumemalloc
invalidation_fork_after_gdr_map_vmmalloc
invalidation_fork_child_gdr_map_parent_cumemalloc
invalidation_fork_child_gdr_map_parent_vmmalloc
invalidation_fork_child_gdr_pin_parent_with_tokens
invalidation_fork_map_and_free_cumemalloc
invalidation_fork_map_and_free_vmmalloc
invalidation_two_mappings_cumemalloc
invalidation_two_mappings_vmmalloc
invalidation_unix_sock_shared_fd_gdr_map_cumemalloc
invalidation_unix_sock_shared_fd_gdr_map_vmmalloc
invalidation_unix_sock_shared_fd_gdr_pin_buffer_cumemalloc
invalidation_unix_sock_shared_fd_gdr_pin_buffer_vmmalloc
$ gdrcopy_copybw
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
testing size: 131072
rounded size: 131072
gpu alloc fn: cuMemAlloc
device ptr: 7f1153a00000
map_d_ptr: 0x7f1172257000
info.va: 7f1153a00000
info.mapped_size: 131072
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer:0x7f1172257000
writing test, size=131072 offset=0 num_iters=10000
write BW: 9638.54MB/s
reading test, size=131072 offset=0 num_iters=100
read BW: 530.135MB/s
unmapping buffer
unpinning buffer
closing gdrdrv
$ gdrcopy_copylat
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
device ptr: 0x7fa2c6000000
allocated size: 16777216
gpu alloc fn: cuMemAlloc
map_d_ptr: 0x7fa2f9af9000
info.va: 7fa2c6000000
info.mapped_size: 16777216
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer: 0x7fa2f9af9000
gdr_copy_to_mapping num iters for each size: 10000
WARNING: Measuring the API invocation overhead as observed by the CPU. Data
might not be ordered all the way to the GPU internal visibility.
Test Size(B) Avg.Time(us)
gdr_copy_to_mapping 1 0.0889
gdr_copy_to_mapping 2 0.0884
gdr_copy_to_mapping 4 0.0884
gdr_copy_to_mapping 8 0.0884
gdr_copy_to_mapping 16 0.0905
gdr_copy_to_mapping 32 0.0902
gdr_copy_to_mapping 64 0.0902
gdr_copy_to_mapping 128 0.0952
gdr_copy_to_mapping 256 0.0983
gdr_copy_to_mapping 512 0.1176
gdr_copy_to_mapping 1024 0.1825
gdr_copy_to_mapping 2048 0.2549
gdr_copy_to_mapping 4096 0.4366
gdr_copy_to_mapping 8192 0.8141
gdr_copy_to_mapping 16384 1.6155
gdr_copy_to_mapping 32768 3.2284
gdr_copy_to_mapping 65536 6.4906
gdr_copy_to_mapping 131072 12.9761
gdr_copy_to_mapping 262144 25.9459
gdr_copy_to_mapping 524288 51.9100
gdr_copy_to_mapping 1048576 103.8028
gdr_copy_to_mapping 2097152 207.5990
gdr_copy_to_mapping 4194304 415.2856
gdr_copy_to_mapping 8388608 830.6355
gdr_copy_to_mapping 16777216 1661.3285
gdr_copy_from_mapping num iters for each size: 100
Test Size(B) Avg.Time(us)
gdr_copy_from_mapping 1 0.9069
gdr_copy_from_mapping 2 1.7170
gdr_copy_from_mapping 4 1.7169
gdr_copy_from_mapping 8 1.7164
gdr_copy_from_mapping 16 0.8601
gdr_copy_from_mapping 32 1.7024
gdr_copy_from_mapping 64 3.1016
gdr_copy_from_mapping 128 3.4944
gdr_copy_from_mapping 256 3.6400
gdr_copy_from_mapping 512 2.4394
gdr_copy_from_mapping 1024 2.8022
gdr_copy_from_mapping 2048 4.6615
gdr_copy_from_mapping 4096 7.9783
gdr_copy_from_mapping 8192 14.9209
gdr_copy_from_mapping 16384 28.9571
gdr_copy_from_mapping 32768 56.9373
gdr_copy_from_mapping 65536 114.1008
gdr_copy_from_mapping 131072 234.9382
gdr_copy_from_mapping 262144 496.4011
gdr_copy_from_mapping 524288 985.5196
gdr_copy_from_mapping 1048576 1970.7057
gdr_copy_from_mapping 2097152 3942.5611
gdr_copy_from_mapping 4194304 7888.9468
gdr_copy_from_mapping 8388608 18361.5673
gdr_copy_from_mapping 16777216 36758.8342
unmapping buffer
unpinning buffer
closing gdrdrv
$ gdrcopy_apiperf -s 8
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
device ptr: 0x7f1563a00000
allocated size: 65536
Size(B) pin.Time(us) map.Time(us) get_info.Time(us) unmap.Time(us) unpin.Time(us)
65536 1346.034060 3.603800 0.340270 4.700930 676.612800
Histogram of gdr_pin_buffer latency for 65536 bytes
[1303.852000 - 2607.704000] 93
[2607.704000 - 3911.556000] 0
[3911.556000 - 5215.408000] 0
[5215.408000 - 6519.260000] 0
[6519.260000 - 7823.112000] 0
[7823.112000 - 9126.964000] 0
[9126.964000 - 10430.816000] 0
[10430.816000 - 11734.668000] 0
[11734.668000 - 13038.520000] 0
[13038.520000 - 14342.372000] 2
closing gdrdrv
$ numactl -N 1 -l gdrcopy_pplat
GPU id:0; name: NVIDIA A40; Bus id: 0000:09:00
selecting device 0
device ptr: 0x7f99d2600000
gpu alloc fn: cuMemAlloc
map_d_ptr: 0x7f9a054fb000
info.va: 7f99d2600000
info.mapped_size: 4
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer: 0x7f9a054fb000
CPU does gdr_copy_to_mapping and GPU writes back via cuMemHostAlloc'd buffer.
Running 1000 iterations with data size 4 bytes.
Round-trip latency per iteration is 1.08762 us
unmapping buffer
unpinning buffer
closing gdrdrv
```
## NUMA effects
Depending on the platform architecture, like where the GPU are placed in
the PCIe topology, performance may suffer if the processor which is driving
the copy is not the one which is hosting the GPU, for example in a
multi-socket server.
In the example below, GPU ID 0 is hosted by
CPU socket 0. By explicitly playing with the OS process and memory
affinity, it is possible to run the test onto the optimal processor:
```shell
$ numactl -N 0 -l gdrcopy_copybw -d 0 -s $((64 * 1024)) -o $((0 * 1024)) -c $((64 * 1024))
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
testing size: 65536
rounded size: 65536
gpu alloc fn: cuMemAlloc
device ptr: 7f5817a00000
map_d_ptr: 0x7f583b186000
info.va: 7f5817a00000
info.mapped_size: 65536
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer:0x7f583b186000
writing test, size=65536 offset=0 num_iters=1000
write BW: 9768.3MB/s
reading test, size=65536 offset=0 num_iters=1000
read BW: 548.423MB/s
unmapping buffer
unpinning buffer
closing gdrdrv
```
or on the other socket:
```shell
$ numactl -N 1 -l gdrcopy_copybw -d 0 -s $((64 * 1024)) -o $((0 * 1024)) -c $((64 * 1024))
GPU id:0; name: Tesla V100-SXM2-32GB; Bus id: 0000:06:00
GPU id:1; name: Tesla V100-SXM2-32GB; Bus id: 0000:07:00
GPU id:2; name: Tesla V100-SXM2-32GB; Bus id: 0000:0a:00
GPU id:3; name: Tesla V100-SXM2-32GB; Bus id: 0000:0b:00
GPU id:4; name: Tesla V100-SXM2-32GB; Bus id: 0000:85:00
GPU id:5; name: Tesla V100-SXM2-32GB; Bus id: 0000:86:00
GPU id:6; name: Tesla V100-SXM2-32GB; Bus id: 0000:89:00
GPU id:7; name: Tesla V100-SXM2-32GB; Bus id: 0000:8a:00
selecting device 0
testing size: 65536
rounded size: 65536
gpu alloc fn: cuMemAlloc
device ptr: 7fbb63a00000
map_d_ptr: 0x7fbb82ab0000
info.va: 7fbb63a00000
info.mapped_size: 65536
info.page_size: 65536
info.mapped: 1
info.wc_mapping: 1
page offset: 0
user-space pointer:0x7fbb82ab0000
writing test, size=65536 offset=0 num_iters=1000
write BW: 9224.36MB/s
reading test, size=65536 offset=0 num_iters=1000
read BW: 521.262MB/s
unmapping buffer
unpinning buffer
closing gdrdrv
```
## Restrictions and known issues
GDRCopy works with regular CUDA device memory only, as returned by cudaMalloc.
In particular, it does not work with CUDA managed memory.
`gdr_pin_buffer()` accepts any address returned by cudaMalloc and its family.
In contrast, `gdr_map()` requires that the pinned address be aligned to the GPU page size.
Neither the CUDA Runtime nor the Driver API guarantees that GPU memory allocation
functions return aligned addresses. Users are responsible for properly aligning the
addresses passed to the library; one possible approach is sketched below.
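As a hedged illustration (not part of the library), one common approach is to
over-allocate by one GPU page and round the returned address up to the next
page boundary using the `GPU_PAGE_SIZE`, `GPU_PAGE_OFFSET`, and `GPU_PAGE_MASK`
macros from `gdrapi.h`; the helper name below is hypothetical.
```c
#include <cuda.h>
#include "gdrapi.h"
// Illustrative helper (not part of GDRCopy): over-allocate by one GPU page and
// round the returned address up to the next GPU_PAGE_SIZE boundary.
// *raw keeps the original pointer so it can later be released with cuMemFree().
static CUdeviceptr alloc_gpu_page_aligned(size_t size, CUdeviceptr *raw)
{
    cuMemAlloc(raw, size + GPU_PAGE_SIZE);            // error checking omitted
    return (*raw + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;  // aligned address to pin and map
}
```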
Two cudaMalloc'd memory regions may be contiguous. Users may call
`gdr_pin_buffer` and `gdr_map` with an address and size that extend across these
two regions. This use case is not well supported in GDRCopy. On rare occasions,
users may experience 1.) an error in `gdr_map`, or 2.) low copy performance
because `gdr_map` cannot provide a write-combined mapping.
In some GPU driver versions, pinning the same GPU address multiple times
consumes additional BAR1 space because the space is not properly reused. If you
encounter this issue, we suggest trying the latest version of the NVIDIA GPU
driver.
On POWER9, where the CPU and GPU are connected via NVLink, CUDA 9.2 and GPU driver
v396.37 are the minimum requirements for achieving full performance.
GDRCopy works with earlier CUDA and GPU driver versions, but the achievable
bandwidth is substantially lower.
If gdrdrv is compiled with the proprietary flavor of the NVIDIA driver, GDRCopy does not fully support Linux with the
confidential computing (CC) configuration on Intel CPUs. In particular, it does not function if
`CONFIG_ARCH_HAS_CC_PLATFORM=y` and CC is enabled at runtime. However, it works if CC is disabled or
`CONFIG_ARCH_HAS_CC_PLATFORM=n`. This issue does not apply to AMD CPUs. To avoid it, please compile and load
gdrdrv with the opensource flavor of the NVIDIA driver.
To allow the loading of unsupported 3rd-party modules in SLE, set `allow_unsupported_modules 1` in
/etc/modprobe.d/unsupported-modules. After making this change, modules missing the "supported" flag will be allowed to
load.
## Bug filing
For reporting issues you may be having using any of NVIDIA software or
reporting suspected bugs we would recommend you use the bug filing system
which is available to NVIDIA registered developers on the developer site.
If you are not a member you can [sign
up](https://developer.nvidia.com/accelerated-computing-developer).
Once a member you can submit issues using [this
form](https://developer.nvidia.com/nvbugs/cuda/add). Be sure to select
GPUDirect in the "Relevant Area" field.
You can later track their progress using the __My Bugs__ link on the left of
this [view](https://developer.nvidia.com/user).
## Acknowledgment
If you find this software useful in your work, please cite:
R. Shi et al., "Designing efficient small message transfer mechanism for inter-node MPI communication on InfiniBand GPU clusters," 2014 21st International Conference on High Performance Computing (HiPC), Dona Paula, 2014, pp. 1-10, doi: 10.1109/HiPC.2014.7116873.

46
gdrcopy/config_arch Executable file
View File

@ -0,0 +1,46 @@
#!/bin/bash
# Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
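# Detect the host CPU architecture by compiling and running a small probe
# program against gdrconfig.h; prints X86, POWER, or ARM64.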
topdir="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
dir=$(mktemp -d)
src=$dir/arch.c
exe=$dir/arch
cat <<EOF >$src
#include <stdio.h>
#include "gdrconfig.h"
int main(int argc, char *argv[])
{
#ifdef GDRAPI_X86
printf("X86\n");
#elif defined(GDRAPI_POWER)
printf("POWER\n");
#elif defined(GDRAPI_ARM64)
printf("ARM64\n");
#else
printf("ERROR\n");
#endif
return 0;
}
EOF
gcc -I ${topdir}/include -I ${topdir}/src $src -o $exe
$exe
rm -rf $dir

154
gdrcopy/include/gdrapi.h Normal file
View File

@ -0,0 +1,154 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __GDRAPI_H__
#define __GDRAPI_H__
#include <stdint.h> // for standard [u]intX_t types
#include <stddef.h>
#define MAJOR_VERSION_SHIFT 16
#define MINOR_VERSION_MASK (((uint32_t)1 << MAJOR_VERSION_SHIFT) - 1)
#define GDR_API_MAJOR_VERSION 2
#define GDR_API_MINOR_VERSION 4
#define GDR_API_VERSION ((GDR_API_MAJOR_VERSION << MAJOR_VERSION_SHIFT) | GDR_API_MINOR_VERSION)
#define MINIMUM_GDRDRV_MAJOR_VERSION 2
#define MINIMUM_GDRDRV_MINOR_VERSION 0
#define MINIMUM_GDRDRV_VERSION ((MINIMUM_GDRDRV_MAJOR_VERSION << MAJOR_VERSION_SHIFT) | MINIMUM_GDRDRV_MINOR_VERSION)
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
/*
* GDRCopy, a low-latency GPU memory copy library (and a kernel-mode
* driver) based on NVIDIA GPUDirect RDMA technology.
*
* supported environment variables:
*
* - GDRCOPY_ENABLE_LOGGING, if defined logging is enabled, default is
* disabled.
*
* - GDRCOPY_LOG_LEVEL, overrides log threshold, default is to print errors
* only.
*/
#ifdef __cplusplus
extern "C" {
#endif
struct gdr;
typedef struct gdr *gdr_t;
// Initialize the library, e.g. by opening a connection to the kernel-mode
// driver. Returns a handle to the library state object.
gdr_t gdr_open(void);
// Destroy the library state object, e.g. close the connection to the kernel-mode
// driver.
int gdr_close(gdr_t g);
// The handle to a user-space GPU memory mapping
typedef struct gdr_mh_s {
unsigned long h;
} gdr_mh_t;
// Create a peer-to-peer mapping of the device memory buffer, returning an opaque handle.
// Note that at this point the mapping is still not accessible to user-space.
int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
// Destroys the peer-to-peer mapping and frees the handle.
//
// If there exists a corresponding user-space mapping, gdr_unmap should be
// called before this one.
int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
// flag is set when the kernel callback (registered via
// nvidia_p2p_get_pages) has been invoked, e.g. because cuMemFree() was called
// before gdr_unpin_buffer.
int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag);
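// Type of the CPU mapping backing a pinned buffer, as reported in
// gdr_info_v2.mapping_type (e.g. GDR_MAPPING_TYPE_WC for a write-combined mapping).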
typedef enum gdr_mapping_type {
GDR_MAPPING_TYPE_NONE = 0,
GDR_MAPPING_TYPE_WC = 1,
GDR_MAPPING_TYPE_CACHING = 2,
GDR_MAPPING_TYPE_DEVICE = 3
} gdr_mapping_type_t;
// After pinning, info struct contains details of the mapped area.
//
// Note that both info->va and info->mapped_size might be different from
// the original address and size passed to gdr_pin_buffer due to the alignment
// performed in the kernel-mode driver.
typedef struct gdr_info_v2 {
uint64_t va;
uint64_t mapped_size;
uint32_t page_size;
// tm_cycles and cycles_per_ms are deprecated and will be removed in the future.
uint64_t tm_cycles;
uint32_t cycles_per_ms;
unsigned mapped:1;
unsigned wc_mapping:1;
gdr_mapping_type_t mapping_type;
} gdr_info_v2_t;
typedef gdr_info_v2_t gdr_info_t;
int gdr_get_info_v2(gdr_t g, gdr_mh_t handle, gdr_info_v2_t *info);
#define gdr_get_info gdr_get_info_v2
// Create a user-space mapping of the memory handle.
//
// WARNING: the address could potentially be aligned to the page-size boundary
// before being mapped in user-space, so the pointer returned might be
// affected by an offset. gdr_get_info can be used to calculate that
// offset.
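// Typically: offset = original_pinned_address - info.va; usable pointer = (char *)(*va) + offset.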
int gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
// Destroy a user-space mapping.
// Invoke gdr_unmap() first, then gdr_unpin_buffer().
int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
// map_d_ptr is the user-space virtual address belonging to a mapping of a device memory buffer,
// i.e. one returned by gdr_map()
//
// WARNING: Both integrity and ordering of data as observed by pre-launched GPU
// work is not guaranteed by this API. For more information, see
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#sync-behavior
int gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
int gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
// Query the version of libgdrapi
void gdr_runtime_get_version(int *major, int *minor);
// Query the version of gdrdrv driver
int gdr_driver_get_version(gdr_t g, int *major, int *minor);
#ifdef __cplusplus
}
#endif
#endif // __GDRAPI_H__

View File

@ -0,0 +1,15 @@
#pragma once
#if defined __GNUC__
#if defined(__powerpc__)
#define GDRAPI_POWER
#elif defined(__aarch64__)
#define GDRAPI_ARM64
#elif defined(__i386__) || defined(__x86_64__) || defined(__X86__)
#define GDRAPI_X86
#else
#error "architecture is not supported"
#endif // arch
#else
#error "compiler not supported"
#endif // __GNUC__

41
gdrcopy/insmod.sh Executable file
View File

@ -0,0 +1,41 @@
#!/bin/bash
# Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
THIS_DIR=$(dirname $0)
# remove driver
grep gdrdrv /proc/devices >/dev/null && sudo /sbin/rmmod gdrdrv
# insert driver
sudo /sbin/insmod src/gdrdrv/gdrdrv.ko dbg_enabled=0 info_enabled=0 use_persistent_mapping=0
# create device inodes
major=`fgrep gdrdrv /proc/devices | cut -b 1-4`
echo "INFO: driver major is $major"
# remove old inodes just in case
if [ -e /dev/gdrdrv ]; then
sudo rm /dev/gdrdrv
fi
echo "INFO: creating /dev/gdrdrv inode"
sudo mknod /dev/gdrdrv c $major 0
sudo chmod a+w+r /dev/gdrdrv

View File

@ -0,0 +1,247 @@
#!/bin/bash
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# Restart this number at 1 if MAJOR_VERSION or MINOR_VERSION changes
# See https://www.debian.org/doc/debian-policy/ch-controlfields.html#version
DEBIAN_VERSION=1
SCRIPT_DIR_PATH="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
TOP_DIR_PATH="${SCRIPT_DIR_PATH}/.."
CWD=$(pwd)
skip_dep_check=0
build_test_package=1
build_driver_package=1
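# Echo the given command, run it, and exit with its status if it fails.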
ex()
{
local rc
echo "+ $@"
$@
rc=$?
if [[ $rc -ne 0 ]]; then
echo "Failed with error $rc to execute: $@" >&2
exit $rc
fi
}
function show_help
{
echo "Usage: [CUDA=<path>] $0 [-d] [-t] [-k] [-h]"
echo ""
echo " CUDA=<path> Set your installed CUDA path (ex. /usr/local/cuda)."
echo " -d Don't check build dependencies. Use my environment variables such as C_INCLUDE_PATH instead."
echo " -t Skip building gdrcopy-tests package."
echo " -k Skip building gdrdrv-dkms package."
echo " -h Show this help text."
echo ""
}
OPTIND=1 # Reset in case getopts has been used previously in the shell.
while getopts "hdtk" opt; do
case "${opt}" in
h)
show_help
exit 0
;;
d) skip_dep_check=1
;;
t) build_test_package=0
;;
k) build_driver_package=0
;;
esac
done
shift $((OPTIND-1))
if [[ ${build_test_package} == 1 ]] && [ "X$CUDA" == "X" ]; then
echo "CUDA environment variable is not defined"; exit 1
fi
NVCC=${CUDA}/bin/nvcc
CUDA_VERSION=`$NVCC --version | grep release | sed 's/^.*release \([0-9]\+\.[0-9]\+\).*/\1/'`
CUDA_MAJOR=`echo ${CUDA_VERSION} | cut -d "." -f 1`
CUDA_MINOR=`echo ${CUDA_VERSION} | cut -d "." -f 2`
echo "Building debian package for the gdrcopy library ..."
ex cd ${SCRIPT_DIR_PATH}
MODULE_SUBDIR=$(awk '/MODULE_SUBDIR \?=/ { print $3 }' ${TOP_DIR_PATH}/src/gdrdrv/Makefile | tr -d '\n')
MAJOR_VERSION=$(awk '/#define GDR_API_MAJOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
MINOR_VERSION=$(awk '/#define GDR_API_MINOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
VERSION="${MAJOR_VERSION}.${MINOR_VERSION}.4"
if [ "X$VERSION" == "X" ]; then
echo "Failed to get version numbers!" >&2
exit 1
fi
#FULL_VERSION="${VERSION}-${DEBIAN_VERSION}"
FULL_VERSION="${VERSION}"
tmpdir=`mktemp -d /tmp/gdr.XXXXXX`
if [ ! -d "${tmpdir}" ]; then
echo "Failed to create a temp directory!" >&2
exit 1
fi
echo "Building gdrcopy debian packages version ${FULL_VERSION} ..."
echo "Working in ${tmpdir} ..."
ex cd ${TOP_DIR_PATH}
ex mkdir -p ${tmpdir}/gdrcopy
ex rm -rf ${tmpdir}/gdrcopy/*
ex cp -r Makefile README.md include src tests LICENSE config_arch ${tmpdir}/gdrcopy/
ex cp -r packages/debian-lib ${tmpdir}/gdrcopy/
ex cp -r packages/debian-tests ${tmpdir}/gdrcopy/
ex cp README.md ${tmpdir}/gdrcopy/debian-lib/README.Debian
ex cp README.md ${tmpdir}/gdrcopy/debian-lib/README.source
ex cp README.md ${tmpdir}/gdrcopy/debian-tests/README.Debian
ex cp README.md ${tmpdir}/gdrcopy/debian-tests/README.source
ex cd ${tmpdir}/gdrcopy
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
ex rm -f ${tmpdir}/libgdrapi_${VERSION}.orig.tar.gz
ex rm -f ${tmpdir}/gdrcopy-tests_${VERSION}.orig.tar.gz
ex cd ${tmpdir}
ex cp -r gdrcopy libgdrapi-${VERSION}
ex cd ${tmpdir}/libgdrapi-${VERSION}
ex mv debian-lib debian
ex rm -rf debian-*
ex cd ${tmpdir}
ex cp -r gdrcopy gdrcopy-tests-${VERSION}
ex cd ${tmpdir}/gdrcopy-tests-${VERSION}
ex mv debian-tests debian
ex rm -rf debian-*
ex cd ${tmpdir}
ex tar czvf libgdrapi_${VERSION}.orig.tar.gz libgdrapi-${VERSION}
ex tar czvf gdrcopy-tests_${VERSION}.orig.tar.gz gdrcopy-tests-${VERSION}
echo "Building libgdrapi package ..."
ex cd ${tmpdir}/libgdrapi-${VERSION}
debuild_params="--set-envvar=PKG_CONFIG_PATH=${PKG_CONFIG_PATH}"
if [ "${skip_dep_check}" -eq 1 ]; then
debuild_params+=" --preserve-env -d"
echo "Skip build dependency check. Use the environment variables instead ..."
fi
# --set-envvar needs to be placed before -us -uc
debuild_params+=" -us -uc"
ex debuild ${debuild_params}
if [[ ${build_test_package} == 1 ]]; then
echo
echo "Building gdrcopy-tests package ..."
ex cd ${tmpdir}/gdrcopy-tests-${VERSION}
debuild_params="--set-envvar=CUDA=${CUDA} --set-envvar=PKG_CONFIG_PATH=${PKG_CONFIG_PATH}"
if [ "${skip_dep_check}" -eq 1 ]; then
debuild_params+=" --preserve-env -d"
echo "Skip build dependency check. Use the environment variables instead ..."
fi
# --set-envvar needs to be placed before -us -uc
debuild_params+=" -us -uc"
ex debuild ${debuild_params}
fi
if [[ ${build_driver_package} == 1 ]]; then
echo
echo "Building gdrdrv-dkms package ..."
ex cd ${tmpdir}/gdrcopy/src/gdrdrv
ex make clean
dkmsdir="${tmpdir}/gdrdrv-dkms-${VERSION}"
ex mkdir -p ${dkmsdir}
ex cp -r ${tmpdir}/gdrcopy/src/gdrdrv ${dkmsdir}/gdrdrv-${VERSION}
ex rm -rf ${dkmsdir}/gdrdrv-${VERSION}/debian-*
ex cp ${SCRIPT_DIR_PATH}/dkms.conf ${dkmsdir}/gdrdrv-${VERSION}/
ex cp -r ${TOP_DIR_PATH}/scripts ${dkmsdir}/gdrdrv-${VERSION}
ex cd ${dkmsdir}
ex cp -r ${SCRIPT_DIR_PATH}/dkms/* .
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@MODULE_LOCATION@/${MODULE_SUBDIR//\//\\/}/g" {} +
ex cd ${tmpdir}
ex tar czvf gdrdrv-dkms_${VERSION}.orig.tar.gz gdrdrv-dkms-${VERSION}
ex cd ${dkmsdir}
ex dpkg-buildpackage -rfakeroot -d -F -us -uc
fi
echo
echo "Building gdrcopy package ..."
metadir=${tmpdir}/gdrcopy-${VERSION}
ex mkdir -p ${metadir}
ex cd ${TOP_DIR_PATH}
ex cp -r packages/debian-meta ${metadir}/debian
ex cp README.md ${metadir}/debian/README.Debian
ex cp README.md ${metadir}/debian/README.source
ex cd ${metadir}
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@MODULE_LOCATION@/${MODULE_SUBDIR//\//\\/}/g" {} +
ex cd ${tmpdir}
ex tar czvf gdrcopy_${VERSION}.orig.tar.gz gdrcopy-${VERSION}
cd ${metadir}
ex debuild -us -uc
echo
echo "Copying *.deb and supplementary files to the current working directory ..."
if $(hash lsb_release 2>/dev/null); then
release=`lsb_release -rs | sed -e "s/\./_/g"`
id=`lsb_release -is | sed -e "s/ /_/g"`
release=".${id}${release}"
else
release=""
fi
ex cd ${CWD}
for item in `ls ${tmpdir}/*.deb`; do
item_name=`basename $item`
item_name=`echo $item_name | sed -e "s/\.deb//g"`
if echo "$item_name" | grep -q "tests"; then
item_name="${item_name}${release}+cuda${CUDA_MAJOR}.${CUDA_MINOR}.deb"
else
item_name="${item_name}${release}.deb"
fi
ex cp $item ./${item_name}
done
ex cp ${tmpdir}/*.tar.* .
ex cp ${tmpdir}/*.dsc .
echo
echo "Cleaning up ..."
ex rm -rf ${tmpdir}

View File

@ -0,0 +1,185 @@
#!/bin/bash
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# Restart this number at 1 if MAJOR_VERSION or MINOR_VERSION changes
# See https://rpm-packaging-guide.github.io/#preamble-items
RPM_VERSION=1
SCRIPT_DIR_PATH="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
TOP_DIR_PATH="${SCRIPT_DIR_PATH}/.."
CWD=$(pwd)
ex()
{
local rc
echo "+ $@"
$@
rc=$?
if [[ $rc -ne 0 ]]; then
echo "Failed with error $rc to execute: $@" >&2
exit $rc
fi
}
function show_help
{
echo "This script is for generating GDRCopy RPM packages."
echo
echo "Usage: CUDA=<path> $0 [-m]"
echo
echo "Optional arguments:"
echo " -m Generate kmod package (default: no)."
echo
echo "Environment variables:"
echo " CUDA=<path> [Required] CUDA installation path (usually /usr/local/cuda)."
echo " NVIDIA_SRC_DIR=<path> [Optional] NVIDIA driver source directory (usually /usr/src/nvidia-<version>/nvidia)."
}
OPTIND=1 # Reset in case getopts has been used previously in the shell.
generate_kmod=0
while getopts "h?m" opt; do
case "$opt" in
h|\?)
show_help
exit 0
;;
m) generate_kmod=1
;;
esac
done
shift $((OPTIND-1))
NVCC=${CUDA}/bin/nvcc
CUDA_VERSION=`$NVCC --version | grep release | sed 's/^.*release \([0-9]\+\.[0-9]\+\).*/\1/'`
CUDA_MAJOR=`echo ${CUDA_VERSION} | cut -d "." -f 1`
CUDA_MINOR=`echo ${CUDA_VERSION} | cut -d "." -f 2`
if [ "X$CUDA" == "X" ]; then
echo "CUDA environment variable is not defined"
exit 1
fi
echo "Building rpm package ..."
ex cd ${SCRIPT_DIR_PATH}
MODULE_SUBDIR=$(awk '/MODULE_SUBDIR \?=/ { print $3 }' ${TOP_DIR_PATH}/src/gdrdrv/Makefile | tr -d '\n')
MAJOR_VERSION=$(awk '/#define GDR_API_MAJOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
MINOR_VERSION=$(awk '/#define GDR_API_MINOR_VERSION/ { print $3 }' ${TOP_DIR_PATH}/include/gdrapi.h | tr -d '\n')
VERSION="${MAJOR_VERSION}.${MINOR_VERSION}.4"
if [ "X$VERSION" == "X" ]; then
echo "Failed to get version numbers!" >&2
exit 1
fi
FULL_VERSION="${VERSION}"
if [[ ${generate_kmod} == 1 ]]; then
if [ -z "${NVIDIA_SRC_DIR}" ]; then
NVIDIA_SRC_DIR=$(find /usr/src/kernel-modules/nvidia-* /usr/src/nvidia-* -name "nv-p2p.c" -print -quit 2>/dev/null)
if [ ${#NVIDIA_SRC_DIR} -gt 0 ]; then
NVIDIA_SRC_DIR=$(dirname ${NVIDIA_SRC_DIR})
fi
fi
if [ -d ${NVIDIA_SRC_DIR} ]; then
NVIDIA_DRIVER_VERSION=$(basename $(dirname ${NVIDIA_SRC_DIR}))
else
echo "NVIDIA_SRC_DIR=${NVIDIA_SRC_DIR}" >&2
echo "Failed to find NVIDIA driver!" >&2
exit 1
fi
fi
tmpdir=`mktemp -d /tmp/gdr.XXXXXX`
if [ ! -d "$tmpdir" ]; then
echo "Failed to create a temp directory!" >&2
exit 1
fi
echo "Building gdrcopy rpm packages version ${VERSION} ..."
echo "Working in $tmpdir ..."
ex cd ${TOP_DIR_PATH}
ex mkdir -p $tmpdir/gdrcopy
ex rm -rf $tmpdir/gdrcopy/*
ex cp -r packages/dkms.conf packages/rhel/init.d packages/rhel/gdrcopy.service scripts/ insmod.sh Makefile README.md include src tests config_arch LICENSE packages/gdrcopy.spec $tmpdir/gdrcopy/
ex rm -f $tmpdir/gdrcopy-$VERSION.tar.gz
ex cd $tmpdir/gdrcopy
ex find . -type f -exec sed -i "s/@FULL_VERSION@/${FULL_VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@VERSION@/${VERSION}/g" {} +
ex find . -type f -exec sed -i "s/@MODULE_LOCATION@/${MODULE_SUBDIR//\//\\/}/g" {} +
ex cd $tmpdir
ex mv gdrcopy gdrcopy-$VERSION
ex tar czvf gdrcopy-$VERSION.tar.gz gdrcopy-$VERSION
ex mkdir -p $tmpdir/topdir/{SRPMS,RPMS,SPECS,BUILD,SOURCES}
ex cp gdrcopy-$VERSION/gdrcopy.spec $tmpdir/topdir/SPECS/
ex cp gdrcopy-$VERSION.tar.gz $tmpdir/topdir/SOURCES/
rpmbuild_params="-ba --nodeps --define '_build_id_links none' --define \"_topdir $tmpdir/topdir\" --define \"_release ${RPM_VERSION}\" --define 'dist %{nil}' --define \"CUDA $CUDA\" --define \"GDR_VERSION ${VERSION}\" --define \"KVERSION $(uname -r)\" --define \"MODULE_LOCATION ${MODULE_SUBDIR}\""
if [[ ${generate_kmod} == 1 ]]; then
rpmbuild_params="${rpmbuild_params} --define \"NVIDIA_DRIVER_VERSION ${NVIDIA_DRIVER_VERSION}\" --define \"NVIDIA_SRC_DIR ${NVIDIA_SRC_DIR}\" --define \"BUILD_KMOD 1\""
fi
rpmbuild_params="${rpmbuild_params} $tmpdir/topdir/SPECS/gdrcopy.spec"
eval "rpmbuild ${rpmbuild_params}"
rpms=`ls -1 $tmpdir/topdir/RPMS/*/*.rpm`
srpm=`ls -1 $tmpdir/topdir/SRPMS/`
if [ -f "/etc/redhat-release" ]; then
release_version=".el$(cat /etc/redhat-release | grep -o -E '[0-9]+' | head -1)"
elif [ -f "/etc/centos-release" ]; then
release_version=".el$(cat /etc/centos-release | grep -o -E '[0-9]+' | head -1)"
elif [ -f "/etc/os-release" ]; then
release_version=$(source /etc/os-release && echo ".$ID-$VERSION_ID")
else
release_version="unknown_distro"
fi
echo $srpm $rpms
ex cd ${CWD}
for item in `ls $tmpdir/topdir/SRPMS/*.rpm $tmpdir/topdir/RPMS/*/*.rpm`; do
item_name=`basename $item .rpm`
arch=$(sed -ne 's/.*\(\.[^\.]\+\)$/\1/p' <<< $item_name)
item_name=`basename $item_name $arch`
if [ "$item_name" == "gdrcopy-${FULL_VERSION}-${RPM_VERSION}.`uname -m`" ]; then
item_name="${item_name}${release_version}+cuda${CUDA_MAJOR}.${CUDA_MINOR}.${arch}.rpm"
else
item_name="${item_name}${release_version}${arch}.rpm"
fi
ex cp $item ./${item_name}
done
echo
echo "Cleaning up ..."
ex rm -rf ${tmpdir}

View File

@ -0,0 +1,44 @@
libgdrapi (2.4.4) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
libgdrapi (2.4.3) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
libgdrapi (2.4.2) stable; urgency=low
* Fix memory leak in gdr_pin_buffer.
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
libgdrapi (2.4.1) stable; urgency=low
* No change
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
libgdrapi (2.4) stable; urgency=low
* Introduce gdr_get_info_v2.
* Introduce new copy algorithm for device mappings.
* Add support for NVIDIA BLUEFIELD-3.
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
libgdrapi (2.3.1) stable; urgency=low
* No change
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
libgdrapi (2.3) stable; urgency=low
* Initial version of libgdrapi package -- was a part of gdrcopy package.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700

View File

@ -0,0 +1 @@
9

View File

@ -0,0 +1,19 @@
Source: libgdrapi
Priority: optional
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Build-Depends: debhelper (>= 9)
Standards-Version: @FULL_VERSION@
Section: libs
Homepage: https://github.com/NVIDIA/gdrcopy
#Vcs-Git: https://anonscm.debian.org/git/collab-maint/gdrcopy.git
#Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/gdrcopy.git
Package: libgdrapi
Architecture: any
Multi-Arch: same
Depends: ${shlibs:Depends}, ${misc:Depends}
Replaces: gdrcopy (<= 2.2-1)
Conflicts: gdrcopy (<= 2.2-1)
Description: A low-latency GPU memory copy library
A low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA technology.

View File

@ -0,0 +1,25 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: gdrcopy
Source: https://github.com/NVIDIA/gdrcopy
Files: *
Copyright: 2019-2021 NVIDIA CORPORATION. All rights reserved.
License: MIT
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,2 @@
README.Debian
README.source

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,27 @@
#!/usr/bin/make -f
# See debhelper(7) (uncomment to enable)
# output every command that modifies files on the build system.
#export DH_VERBOSE = 1
# see FEATURE AREAS in dpkg-buildflags(1)
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
# see ENVIRONMENT in dpkg-buildflags(1)
# package maintainers to append CFLAGS
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
# package maintainers to append LDFLAGS
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
%:
dh $@
# dh_make generated override targets
# This is example for Cmake (See https://bugs.debian.org/641051 )
override_dh_auto_build:
dh_auto_build -- lib
override_dh_auto_install:
$(MAKE) DESTDIR=$(CURDIR)/debian/libgdrapi prefix=/usr libdir=/usr/lib/$(DEB_HOST_MULTIARCH) lib_install

View File

@ -0,0 +1 @@
3.0 (quilt)

View File

@ -0,0 +1,105 @@
gdrcopy (2.4.4) stable; urgency=low
* Fix the use-after-free bug of mr objects in gdrdv_vma_close.
* Fix the resource leakage bug in gdrdrv_release.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
gdrcopy (2.4.3) stable; urgency=low
* Fix NVIDIA_IS_OPENSOURCE detection when compile with NVIDIA driver version 545 or newer.
* Fix compile error in gdrdrv when compile on RHEL9.5.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
gdrcopy (2.4.2) stable; urgency=low
* Fix the size alignment bug in gdrdrv.
* Fix memory leak in gdr_pin_buffer.
* Add support for another flavor of BF3.
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
gdrcopy (2.4.1) stable; urgency=low
* Add support for persistent mapping.
* Fix bug in src/gdrdrv/Makefile.
* Fix compile-time bug when check.h is not found.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
gdrcopy (2.4) stable; urgency=low
* Various bug fixes in the test and benchmark applications.
* Prefix all applications with "gdrcopy_".
* Introduce more unit tests in gdrcopy_sanity.
* Introduce gdrcopy_pplat benchmark application.
* Remove dependency on libcheck and libsubunit
* Introduce gdr_get_info_v2.
* Introduce new copy algorithm for device mappings.
* Add support for NVIDIA BLUEFIELD-3.
* Add support for Linux kernel >= 6.3.
* Relicense gdrdrv to Dual MIT/GPL.
* Fix bugs in gdrdrv when pinning two small buffers back-to-back.
* Add support for coherent platforms such as Grace-Hopper.
* Add support for Confidential Computing (CC).
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
gdrcopy (2.3.1) stable; urgency=low
* Add a workaround for the GPL-compatibility issue when compile with CONFIG_ARCH_HAS_CC_PLATFORM on Linux kernel 5.18+.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
gdrcopy (2.3) stable; urgency=low
* Convert to meta package.
* Declare dependency with gdrdrv-dkms, libgdrapi, and gdrcopy-tests.
* Update the package maintainer.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700
gdrcopy (2.2) stable; urgency=low
* Add support for ARM64.
* Update various information on README.
* Improve Makefile.
* Add multi-arch support.
* Handle removal of HAVE_UNLOCKED_IOCTL in Linux kernel v5.9 and later.
* Prevent dpkg package creation to unnecessarily compile gdrdrv.
* Improve gdr_open error message.
* Fix bug that prevents sanity from correctly summarizing failure.
* Add dkms support in kmod package.
* Handle the removal of kzfree in Linux kernel v5.10 and later.
* Improve small-size copy-to-mapping.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 01 Feb 2021 11:59:59 -0700
gdrcopy (2.1) stable; urgency=low
* fix build problem on RHL8 kernels
* relax checks in gdrdrv to support multi-threading use cases
* fix fd leak in gdr_open()
* Introduce copylat test application.
* Introduce basic_with_tokens and invalidation_fork_child_gdr_pin_parent_with_tokens sub-tests in sanity.
* Remove the dependency with libcudart.so.
* Clean up the code in the tests folder.
* Change the package maintainer to Davide Rossetti.
-- Davide Rossetti <drossetti@nvidia.com> Mon, 02 Mar 2020 11:59:59 -0700
gdrcopy (2.0) stable; urgency=low
* Improve copy performance with unrolling in POWERPC.
* Create sanity unit test for testing the functionality and security.
* Consolidate basic and validate into sanity unit test.
* Introduce compile time and runtime version checking in libgdrapi.
* Improve rpm packaging.
* Introduce deb packaging for the userspace library and the applications.
* Introduce dkms packaging for the gdrdrv driver.
* Rename gdr_copy_from/to_bar to gdr_copy_from/to_mapping.
* Update README
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Sep 2019 11:59:59 -0700

View File

@ -0,0 +1 @@
9

View File

@ -0,0 +1,17 @@
Source: gdrcopy
Priority: optional
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Build-Depends: debhelper (>= 9)
Standards-Version: @FULL_VERSION@
Section: misc
Homepage: https://github.com/NVIDIA/gdrcopy
Package: gdrcopy
Architecture: any
Multi-Arch: same
Depends: gdrdrv-dkms (= @FULL_VERSION@), libgdrapi (= @FULL_VERSION@), gdrcopy-tests (= @FULL_VERSION@)
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Description: GDRCopy meta-package
Meta-package for GDRCopy, a low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA technology.

View File

@ -0,0 +1,25 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: gdrcopy
Source: https://github.com/NVIDIA/gdrcopy
Files: *
Copyright: 2019-2021 NVIDIA CORPORATION. All rights reserved.
License: MIT
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,44 @@
#!/usr/bin/make -f
# See debhelper(7) (uncomment to enable)
# output every command that modifies files on the build system.
#export DH_VERBOSE = 1
# see FEATURE AREAS in dpkg-buildflags(1)
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
# see ENVIRONMENT in dpkg-buildflags(1)
# package maintainers to append CFLAGS
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
# package maintainers to append LDFLAGS
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
build build-arch build-indep:
clean:
dh_testdir
dh_clean
install: build
dh_testdir
dh_testroot
dh_prep
binary-arch: install
binary-indep: install
dh_testdir
dh_testroot
dh_install
dh_installdocs
dh_installchangelogs
dh_compress
dh_fixperms
dh_installdeb
dh_gencontrol
dh_md5sums
dh_builddeb
binary: binary-indep binary-arch
.PHONY: build clean binary-indep binary-arch binary install

View File

@ -0,0 +1 @@
3.0 (quilt)

View File

@ -0,0 +1,47 @@
gdrcopy-tests (2.4.4) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
gdrcopy-tests (2.4.3) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
gdrcopy-tests (2.4.2) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
gdrcopy-tests (2.4.1) stable; urgency=low
* Fix compile-time bug when check.h is not found.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
gdrcopy-tests (2.4) stable; urgency=low
* Various bug fixes in the test and benchmark applications.
* Prefix all applications with "gdrcopy_".
* Introduce more unit tests in gdrcopy_sanity.
* Introduce gdrcopy_pplat benchmark application.
* Remove dependency on libcheck and libsubunit
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
gdrcopy-tests (2.3.1) stable; urgency=low
* No change
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
gdrcopy-tests (2.3) stable; urgency=low
* Initial version of gdrcopy-tests package -- was a part of gdrcopy package.
* Add apiperf test.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700

View File

@ -0,0 +1 @@
9

View File

@ -0,0 +1,18 @@
Source: gdrcopy-tests
Priority: optional
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Build-Depends: debhelper (>= 9)
Standards-Version: @FULL_VERSION@
Section: utils
Homepage: https://github.com/NVIDIA/gdrcopy
#Vcs-Git: https://anonscm.debian.org/git/collab-maint/gdrcopy.git
#Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/gdrcopy.git
Package: gdrcopy-tests
Architecture: any
Multi-Arch: same
Depends: libgdrapi (>= @FULL_VERSION@), ${shlibs:Depends}, ${misc:Depends}
Replaces: gdrcopy (<= 2.2-1)
Conflicts: gdrcopy (<= 2.2-1)
Description: Test utilities for GDRCopy

View File

@ -0,0 +1,25 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: gdrcopy
Source: https://github.com/NVIDIA/gdrcopy
Files: *
Copyright: 2019-2021 NVIDIA CORPORATION. All rights reserved.
License: MIT
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,2 @@
README.Debian
README.source

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,30 @@
#!/usr/bin/make -f
# See debhelper(7) (uncomment to enable)
# output every command that modifies files on the build system.
#export DH_VERBOSE = 1
# see FEATURE AREAS in dpkg-buildflags(1)
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
# see ENVIRONMENT in dpkg-buildflags(1)
# package maintainers to append CFLAGS
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
# package maintainers to append LDFLAGS
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
%:
dh $@
# dh_make generated override targets
# This is an example for CMake (see https://bugs.debian.org/641051)
override_dh_auto_build:
dh_auto_build -- CUDA=$(CUDA) lib exes
override_dh_shlibdeps:
dh_shlibdeps -Xgdrcopy_apiperf -Xgdrcopy_copybw -Xgdrcopy_copylat -Xgdrcopy_sanity -Xgdrcopy_pplat
override_dh_auto_install:
$(MAKE) DESTDIR=$(CURDIR)/debian/gdrcopy-tests prefix=/usr exes_install
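As a rough illustration of how this rules file is usually driven (the CUDA path below is an assumption; dpkg-buildpackage passes the variable through to the dh_auto_build override above):
# Build only the binary packages, unsigned; CUDA points at a local CUDA toolkit install
CUDA=/usr/local/cuda dpkg-buildpackage -b -us -uc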

View File

@ -0,0 +1 @@
3.0 (quilt)

View File

@ -0,0 +1,6 @@
PACKAGE_NAME="gdrdrv"
PACKAGE_VERSION="@FULL_VERSION@"
BUILT_MODULE_NAME[0]="gdrdrv"
DEST_MODULE_LOCATION[0]="@MODULE_LOCATION@"
AUTOINSTALL="yes"
MAKE[0]="cd $dkms_tree/gdrdrv/@FULL_VERSION@/build && make CONF_SCRIPT_DIR=scripts KVER=$kernelver"
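For illustration, the DKMS lifecycle this configuration feeds into looks roughly like the following; the 2.4.4 version string is taken from the changelogs in this commit, and root privileges are assumed:
# Register, build, and install the module for the running kernel, then check its state
dkms add -m gdrdrv -v 2.4.4
dkms build -m gdrdrv -v 2.4.4 -k "$(uname -r)"
dkms install -m gdrdrv -v 2.4.4 -k "$(uname -r)"
dkms status -m gdrdrv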

View File

@ -0,0 +1,33 @@
#!/usr/bin/make -f
SRC = $(DESTDIR)/usr/src
SHARE = $(DESTDIR)/usr/share/$(NAME)-dkms
all:
clean:
install:
#source tree
ifeq ("$(wildcard $(NAME)-$(VERSION))", "$(NAME)-$(VERSION)")
install -d "$(SRC)"
cp -a $(NAME)-$(VERSION) $(SRC)
# sets 0755 for dirs, 0644 for files
chmod a-wx+rX,u+w -R "$(SRC)/$(NAME)-$(VERSION)"
# set u+x for all files under the scripts folder
chmod u+x -R "$(SRC)/$(NAME)-$(VERSION)/scripts"
endif
#tarball, possibly with binaries
ifeq ("$(wildcard $(NAME)-$(VERSION).dkms.tar.gz)", "$(NAME)-$(VERSION).dkms.tar.gz")
install -d "$(SHARE)"
install -m 644 $(NAME)-$(VERSION).dkms.tar.gz "$(SHARE)"
endif
#postinst, only if we are supporting legacy mode
ifeq ("$(wildcard common.postinst)", "common.postinst")
install -d "$(SHARE)"
install -m 755 $(PREFIX)/usr/lib/dkms/common.postinst $(SHARE)/postinst
endif

View File

@ -0,0 +1,293 @@
#!/bin/sh
# Copyright (C) 2002-2005 Flavio Stanchina
# Copyright (C) 2005-2006 Aric Cyr
# Copyright (C) 2007 Mario Limonciello
# Copyright (C) 2009 Alberto Milone
set -e
. /usr/share/debconf/confmodule
uname_s=$(uname -s)
_get_kernel_dir() {
KVER=$1
case ${uname_s} in
Linux) DIR="/lib/modules/$KVER/build" ;;
GNU/kFreeBSD) DIR="/usr/src/kfreebsd-headers-$KVER/sys" ;;
esac
echo $DIR
}
_check_kernel_dir() {
DIR=$(_get_kernel_dir $1)
case ${uname_s} in
Linux) test -e $DIR/include ;;
GNU/kFreeBSD) test -e $DIR/kern && test -e $DIR/conf/kmod.mk ;;
*) return 1 ;;
esac
return $?
}
# Check the existence of a kernel named as $1
_is_kernel_name_correct() {
CORRECT="no"
KERNEL_NAME=$1
for kernel in /boot/config-*; do
KERNEL=${kernel#*-}
if [ "${KERNEL}" = "${KERNEL_NAME}" ]; then
CORRECT="yes"
break
fi
done
echo $CORRECT
}
# Get the most recent kernel on Debian based systems. This keeps
# into account both the version and the ABI. If the current kernel
# is the most recent kernel then the function will print a null string.
_get_newest_kernel_debian() {
NEWEST_KERNEL=
NEWEST_VERSION=
NEWEST_ABI=
for kernel in /boot/config-*; do
[ -f "$kernel" ] || continue
KERNEL=${kernel#*-}
KERNEL_VERSION=${KERNEL%%-*}
ABI=${KERNEL#*-}
ABI=${ABI%%-*}
if [ -z "$NEWEST_KERNEL" ]; then
# The 1st time get a version which is bigger than $1
COMPARE_TO=$1
else
# Get the biggest version
COMPARE_TO="$NEWEST_VERSION-$NEWEST_ABI"
fi
# if $kernel is greater than $COMPARE_TO
if [ `dpkg --compare-versions "$KERNEL_VERSION-$ABI" gt "$COMPARE_TO" && echo "yes" || \
echo "no"` = "yes" ]; then
NEWEST_KERNEL=$KERNEL
NEWEST_VERSION=$KERNEL_VERSION
NEWEST_ABI=$ABI
fi
done
echo "$NEWEST_KERNEL"
}
# Get the most recent kernel in Rhel based systems. If the current kernel
# is the most recent kernel then the function will print a null string.
_get_newest_kernel_rhel() {
NEWEST_KERNEL=
LAST_INSTALLED_KERNEL=$(rpm -q --whatprovides kernel --last | grep kernel -m1 | cut -f1 -d' ')
LIK_FORMATTED_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{VERSION}-%{RELEASE}.%{ARCH}\n")
if echo "$LIK_FORMATTED_NAME" | grep -q 2.6; then
# Fedora and Suse
NEWEST_KERNEL=$LIK_FORMATTED_NAME
else
# Hack for Mandriva where $LIK_FORMATTED_NAME is broken
LIK_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{NAME}\n")
LIK_TYPE=${LIK_NAME#kernel-}
LIK_TYPE=${LIK_TYPE%%-*}
LIK_STRIPPED=${LIK_NAME#kernel-}
LIK_STRIPPED=${LIK_STRIPPED#$LIK_TYPE-}
LIK_STRIPPED_BASE=${LIK_STRIPPED%%-*}
LIK_STRIPPED_END=${LIK_STRIPPED#$LIK_STRIPPED_BASE-}
LIK_FINAL=$LIK_STRIPPED_BASE-$LIK_TYPE-$LIK_STRIPPED_END
NEWEST_KERNEL=$LIK_FINAL
fi
echo $NEWEST_KERNEL
}
# Get the newest kernel on Debian and Rhel based systems.
get_newest_kernel() {
NEWEST_KERNEL=
# Try Debian first as rpm can be installed in Debian based distros
if [ -e /usr/bin/dpkg ]; then
# If DEB based
CURRENT_VERSION=${CURRENT_KERNEL%%-*}
CURRENT_ABI=${CURRENT_KERNEL#*-}
CURRENT_FLAVOUR=${CURRENT_ABI#*-}
CURRENT_ABI=${CURRENT_ABI%%-*}
NEWEST_KERNEL=$(_get_newest_kernel_debian "$CURRENT_VERSION-$CURRENT_ABI")
elif which rpm >/dev/null 2>&1; then
# If RPM based
NEWEST_KERNEL=$(_get_newest_kernel_rhel)
fi
# Make sure that kernel name that we extracted corresponds to an installed
# kernel
if [ -n "$NEWEST_KERNEL" ] && [ `_is_kernel_name_correct $NEWEST_KERNEL` = "no" ]; then
NEWEST_KERNEL=
fi
echo $NEWEST_KERNEL
}
NAME=$1
VERSION=$2
TARBALL_ROOT=$3
ARCH=$4
UPGRADE=$5
if [ -z "$NAME" ] || [ -z "$VERSION" ]; then
echo "Need NAME, and VERSION defined"
echo "ARCH is optional"
exit 1
fi
# read framework configuration options
if [ -r /etc/dkms/framework.conf ]; then
. /etc/dkms/framework.conf
fi
KERNELS=$(ls /lib/modules/ 2>/dev/null || true)
CURRENT_KERNEL=$(uname -r)
#We never want to keep an older version side by side to prevent conflicts
if [ -e "/var/lib/dkms/$NAME/$VERSION" ]; then
echo "Removing old $NAME-$VERSION DKMS files..."
dkms remove -m $NAME -v $VERSION --all
fi
#Load new files, by source package and by tarball
if [ -f "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz" ]; then
if ! dkms ldtarball --archive "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz"; then
echo ""
echo ""
echo "Unable to load DKMS tarball $TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz."
echo "Common causes include: "
echo " - You must be using DKMS 2.1.0.0 or later to support binaries only"
echo " distribution specific archives."
echo " - Corrupt distribution specific archive"
echo ""
echo ""
exit 2
fi
elif [ -d "/usr/src/$NAME-$VERSION" ]; then
echo "Loading new $NAME-$VERSION DKMS files..."
dkms add -m $NAME -v $VERSION > /dev/null
fi
# On 1st installation, let us look for a directory
# in /lib/modules which matches `uname -r`. If none
# is found it is possible that buildd is being used
# and that uname -r is giving us the name of the
# kernel used by the buildd machine.
#
# If this is the case we try to build the kernel
# module for each kernel which has a directory in
# /lib/modules. Furthermore we will have to tell
# DKMS which architecture it should build the module
# for (e.g. if the buildd machine is using a
# 2.6.24-23-xen 64bit kernel).
#
# NOTE: if the headers are not installed then the
# module won't be built, as usual
# Here we look for the most recent kernel so that we can
# build the module for it (in addition to doing it for the
# current kernel).
NEWEST_KERNEL=$(get_newest_kernel)
if [ -z "$autoinstall_all_kernels" ]; then
# If the current kernel is installed on the system or chroot
if [ `_is_kernel_name_correct $CURRENT_KERNEL` = "yes" ]; then
if [ -n "$NEWEST_KERNEL" ] && [ ${CURRENT_KERNEL} != ${NEWEST_KERNEL} ]; then
KERNELS="$CURRENT_KERNEL $NEWEST_KERNEL"
else
KERNELS=$CURRENT_KERNEL
fi
# The current kernel is not useful as it's not installed
else
echo "It is likely that $CURRENT_KERNEL belongs to a chroot's host"
# Let's use only the newest kernel if this is not a first installation
# otherwise build for all kernels
if [ -n "$NEWEST_KERNEL" -a -n "$UPGRADE" ]; then
KERNELS="$NEWEST_KERNEL"
fi
fi
fi
# Take care of displaying newline separated list
echo "Building for $KERNELS" | tr '\n' ',' \
| sed -e 's/,/, /g; s/, $/\n/; s/, \([^,]\+\)$/ and \1/'
if [ -n "$ARCH" ]; then
if which lsb_release >/dev/null && [ $(lsb_release -s -i) = "Ubuntu" ]; then
case $ARCH in
amd64)
ARCH="x86_64"
;;
lpia|i?86)
ARCH="i686"
;;
esac
fi
echo "Building for architecture $ARCH"
ARCH="-a $ARCH"
fi
for KERNEL in $KERNELS; do
dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH`
if [ `echo $KERNEL | grep -c "BOOT"` -gt 0 ]; then
echo ""
echo "Module build and install for $KERNEL was skipped as "
echo "it is a BOOT variant"
continue
fi
#if the module isn't yet built, try to build it
if [ `echo $dkms_status | grep -c ": built"` -eq 0 ]; then
if [ ! -L /var/lib/dkms/$NAME/$VERSION/source ]; then
echo "This package appears to be a binaries-only package"
echo " you will not be able to build against kernel $KERNEL"
echo " since the package source was not provided"
continue
fi
if _check_kernel_dir $KERNEL; then
echo "Building initial module for $KERNEL"
set +e
dkms build -m $NAME -v $VERSION -k $KERNEL $ARCH > /dev/null
case $? in
9)
set -e
echo "Skipped."
continue
;;
0)
set -e
echo "Done."
;;
*)
exit $?
;;
esac
dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH`
else
echo "Module build for kernel $KERNEL was skipped since the"
echo "kernel headers for this kernel does not seem to be installed."
fi
fi
#if the module is built (either pre-built or just now), install it
if [ `echo $dkms_status | grep -c ": built"` -eq 1 ] &&
[ `echo $dkms_status | grep -c ": installed"` -eq 0 ]; then
dkms install -m $NAME -v $VERSION -k $KERNEL $ARCH
fi
done
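For reference, the gdrdrv-dkms postinst later in this commit calls this helper with positional arguments along these lines (the version and architecture values are illustrative):
# NAME VERSION TARBALL_ROOT [ARCH [UPGRADE]]
/usr/lib/dkms/common.postinst gdrdrv 2.4.4 /usr/share/gdrdrv-dkms amd64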

View File

@ -0,0 +1,5 @@
gdrdrv DKMS module for Debian
This package was automatically generated by the DKMS system,
for distribution on Debian based operating systems.

View File

@ -0,0 +1,72 @@
gdrdrv-dkms (2.4.4) stable; urgency=low
* Fix the use-after-free bug of mr objects in gdrdrv_vma_close.
* Fix the resource leakage bug in gdrdrv_release.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Dec 2024 11:59:59 -0700
gdrdrv-dkms (2.4.3) stable; urgency=low
* Fix NVIDIA_IS_OPENSOURCE detection when compiling with NVIDIA driver version 545 or newer.
* Fix a compile error in gdrdrv when compiling on RHEL 9.5.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 02 Dec 2024 11:59:59 -0700
gdrdrv-dkms (2.4.2) stable; urgency=low
* Fix the size alignment bug in gdrdrv.
* Add support for another flavor of BF3.
-- Pak Markthub <pmarkthub@nvidia.com> Thu, 31 Oct 2024 11:59:59 -0700
gdrdrv-dkms (2.4.1) stable; urgency=low
* Add support for persistent mapping.
* Fix bug in src/gdrdrv/Makefile.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 18 Dec 2023 11:59:59 -0700
gdrdrv-dkms (2.4) stable; urgency=low
* Add support for NVIDIA BLUEFIELD-3.
* Add support for Linux kernel >= 6.3.
* Relicense gdrdrv to Dual MIT/GPL.
* Fix bugs in gdrdrv when pinning two small buffers back-to-back.
* Add support for coherent platforms such as Grace-Hopper.
* Add support for Confidential Computing (CC).
-- Pak Markthub <pmarkthub@nvidia.com> Fri, 01 Sep 2023 11:59:59 -0700
gdrdrv-dkms (2.3.1) stable; urgency=low
* Add a workaround for the GPL-compatibility issue when compiling with CONFIG_ARCH_HAS_CC_PLATFORM on Linux kernel 5.18+.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 12 May 2023 11:59:59 -0700
gdrdrv-dkms (2.3) stable; urgency=low
* Change the package maintainer to GPUDirect Team.
* Add Davide Rossetti and Pak Markthub as Uploaders.
* Revamp gdrdrv to fix race-condition bugs.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 23 Jul 2021 11:59:59 -0700
gdrdrv-dkms (2.2) stable; urgency=low
* No change.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 01 Feb 2021 11:59:59 -0700
gdrdrv-dkms (2.1) stable; urgency=low
* Change the package maintainer to Davide Rossetti.
-- Davide Rossetti <drossetti@nvidia.com> Mon, 02 Mar 2020 11:59:59 -0700
gdrdrv-dkms (2.0) stable; urgency=low
* Harden security in gdrdrv.
* Enable cached mappings in POWER9.
-- Pak Markthub <pmarkthub@nvidia.com> Mon, 16 Sep 2019 11:59:59 -0700

View File

@ -0,0 +1 @@
9

View File

@ -0,0 +1,13 @@
Source: gdrdrv-dkms
Section: misc
Priority: optional
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Build-Depends: debhelper (>= 9), dkms
Standards-Version: @FULL_VERSION@
Package: gdrdrv-dkms
Architecture: any
Multi-Arch: same
Depends: dkms (>= 1.95), ${misc:Depends}
Description: gdrdrv driver in DKMS format.

View File

@ -0,0 +1,2 @@
This copyright has not been completed by the author of this package.

View File

@ -0,0 +1 @@
usr/src

View File

@ -0,0 +1,147 @@
#!/bin/bash
#
# Startup/shutdown script for GDRcopy driver
# chkconfig: 2345 20 80
# description: Startup/shutdown script for GDRcopy kernel-mode driver
### BEGIN INIT INFO
# Provides: gdrcopy
# Required-Start:
# Required-Stop:
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Description: GDRcopy kernel-mode driver init script
### END INIT INFO
# Source function library.
. /lib/lsb/init-functions
DRIVER=gdrdrv
RETVAL=0
is_module()
{
local RC
/sbin/lsmod | grep -w "$1" > /dev/null 2>&1
RC=$?
return $RC
}
log_msg()
{
logger -i "$modname: $@"
}
function req_modules_loaded() {
local RC=0
local reqmods="nvidia"
for mod in $reqmods; do
if ! is_module $mod; then
echo "module $mod is not loaded"
RC=1
break
fi
done
return $RC
}
# Create /dev nodes for device
function createnodes() {
local module=$1
local RC
local inode=/dev/$module
major=`fgrep $module /proc/devices | cut -b 1-4`
log_msg "$module: driver major is $major"
[ -e $inode ] && rm -f $inode
mknod -m 666 $inode c $major 0
RC=$?
return $RC
}
# Remove /dev nodes for device
function removenodes() {
rm -f /dev/gdrdrv*
}
load_module()
{
local RC
local module=$1
filename=`modinfo $module | grep filename | awk '{print $NF}'`
if [ ! -n "$filename" ]; then
echo "Module $module does not exist"
log_msg "Error: Module $module does not exist"
return 1
fi
echo -n $"Loading $DRIVER kernel module: "
/sbin/modprobe $module && log_success_msg || log_failure_msg
RC=$?
return $RC
}
# Start daemon
function start() {
echo -n $"Checking required modules: "
req_modules_loaded && log_success_msg || log_failure_msg
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
if is_module $DRIVER ; then
echo "module already loaded"
else
load_module $DRIVER
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
fi
echo -n $"Initializing GDRcopy /dev entries: "
createnodes $DRIVER && log_success_msg || log_failure_msg
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
}
# Stop daemon
function stop() {
echo -n $"Unloading $DRIVER kernel module: "
/sbin/rmmod $DRIVER && log_success_msg || log_failure_msg
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
echo -n $"Removing GDRcopy /dev entries: "
removenodes $DRIVER && log_success_msg || log_failure_msg
RETVAL=$?
echo
[ "$RETVAL" = 0 ] || exit $RETVAL
}
# See how we were called
case "$1" in
start)
start
;;
stop)
stop
;;
restart)
stop
start
;;
*)
echo $"Usage: $0 {start|stop|restart}"
RETVAL=1
esac
exit $RETVAL
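When debugging outside an init system, the script can be run directly; a minimal sketch, assuming it is installed at /usr/libexec/gdrcopy/gdrcopy as referenced by the systemd unit later in this commit:
/usr/libexec/gdrcopy/gdrcopy start     # loads gdrdrv and creates /dev/gdrdrv
/usr/libexec/gdrcopy/gdrcopy restart
/usr/libexec/gdrcopy/gdrcopy stop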

View File

@ -0,0 +1,49 @@
#!/bin/sh
# Copyright (C) 2002-2005 Flavio Stanchina
# Copyright (C) 2005-2006 Aric Cyr
# Copyright (C) 2007 Mario Limonciello
# Copyright (C) 2009 Alberto Milone
set -e
NAME=gdrdrv
PACKAGE_NAME=$NAME-dkms
DEB_NAME=$(echo $PACKAGE_NAME | sed 's,_,-,')
CVERSION=`dpkg-query -W -f='${Version}' $DEB_NAME | awk -F "-" '{print $1}' | cut -d\: -f2`
ARCH=`dpkg --print-architecture`
dkms_configure () {
for POSTINST in /usr/lib/dkms/common.postinst "/usr/share/$PACKAGE_NAME/postinst"; do
if [ -f "$POSTINST" ]; then
"$POSTINST" "$NAME" "$CVERSION" "/usr/share/$PACKAGE_NAME" "$ARCH" "$2"
return $?
fi
echo "WARNING: $POSTINST does not exist." >&2
done
echo "ERROR: DKMS version is too old and $PACKAGE_NAME was not" >&2
echo "built with legacy DKMS support." >&2
echo "You must either rebuild $PACKAGE_NAME with legacy postinst" >&2
echo "support or upgrade DKMS to a more current version." >&2
return 1
}
case "$1" in
configure)
dkms_configure
;;
abort-upgrade|abort-remove|abort-deconfigure)
;;
*)
echo "postinst called with unknown argument \`$1'" >&2
exit 1
;;
esac
# dh_installdeb will replace this with shell code automatically
# generated by other debhelper scripts.
#DEBHELPER#
exit 0

View File

@ -0,0 +1,28 @@
#!/bin/sh
NAME=gdrdrv
VERSION=@VERSION@
set -e
case "$1" in
remove|upgrade|deconfigure)
if [ "`dkms status -m $NAME`" ]; then
dkms remove -m $NAME -v $VERSION --all
fi
;;
failed-upgrade)
;;
*)
echo "prerm called with unknown argument \`$1'" >&2
exit 1
;;
esac
#DEBHELPER#
exit 0

View File

@ -0,0 +1,55 @@
#!/usr/bin/make -f
# -*- makefile -*-
# Uncomment this to turn on verbose mode.
#export DH_VERBOSE=1
DEB_NAME=gdrdrv
NAME=gdrdrv
VERSION=@VERSION@
configure: configure-stamp
configure-stamp:
dh_testdir
touch configure-stamp
build: build-stamp
build-stamp: configure-stamp
dh_testdir
$(MAKE)
touch $@
clean:
dh_testdir
dh_testroot
rm -f build-stamp configure-stamp
-$(MAKE) clean
dh_clean
install: build
dh_testdir
dh_testroot
dh_prep
dh_installdirs
$(MAKE) DESTDIR=$(CURDIR)/debian/$(DEB_NAME)-dkms NAME=$(NAME) VERSION=$(VERSION) install
dh_installinit --name $(DEB_NAME)
binary-arch: build install
binary-indep: build install
dh_testdir
dh_testroot
dh_link
dh_strip
dh_compress
dh_fixperms
dh_installdeb
dh_shlibdeps
dh_gencontrol
dh_md5sums
dh_builddeb
binary: binary-indep binary-arch
.PHONY: build clean binary-indep binary-arch binary install configure

View File

@ -0,0 +1 @@
3.0 (quilt)

View File

@ -0,0 +1,20 @@
### Commented entries have reasonable defaults.
### Uncomment to edit them.
# Source: <source package name; defaults to package name>
Section: misc
Priority: optional
Homepage: https://github.com/NVIDIA/gdrcopy
Standards-Version: @FULL_VERSION@
Package: gdrcopy
Version: @FULL_VERSION@
Maintainer: GPUDirect Team <gpudirect@nvidia.com>
Uploaders: Davide Rossetti <drossetti@nvidia.com>, Pak Markthub <pmarkthub@nvidia.com>
Depends: gdrdrv-dkms (= @FULL_VERSION@), libgdrapi (= @FULL_VERSION@), gdrcopy-tests (= @FULL_VERSION@)
Architecture: any
Multi-Arch: same
Copyright: MIT
Changelog: changelog
Readme: README.md
Description: GDRCopy meta-package
Meta-package for GDRCopy, a low-latency GPU memory copy library based on NVIDIA GPUDirect RDMA technology.

View File

@ -0,0 +1,14 @@
[Unit]
Description=GDRCopy service
After=multi-user.target
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/bash /usr/libexec/gdrcopy/gdrcopy start
ExecReload=/bin/bash /usr/libexec/gdrcopy/gdrcopy restart
ExecStop=/bin/bash /usr/libexec/gdrcopy/gdrcopy stop
[Install]
WantedBy=multi-user.target
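Assuming the unit file is installed as gdrcopy.service (the name is an assumption), it is managed in the usual systemd way:
systemctl enable --now gdrcopy.service    # start now and at every boot
systemctl status gdrcopy.service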

View File

@ -0,0 +1,81 @@
#!/bin/sh
show_help()
{
echo "Usage: ${0} [-hk]"
echo
echo " -h Show this help text."
echo " -k <kver> Specify the kernel version."
echo
}
set_kver=0
kver=""
OPTIND=1 # Reset in case getopts has been used previously in the shell.
while getopts "hk:" opt ; do
case "${opt}" in
h)
show_help
exit 0
;;
k)
set_kver=1
kver="${OPTARG}"
;;
?)
show_help
exit 0
;;
esac
done
if [ ${set_kver} -eq 0 ]; then
kver="$(uname -r)"
fi
kdir="/lib/modules/${kver}/build"
tmpfolder=$(mktemp --tmpdir -d gdrcopy.XXXXXXXXX)
testfile="${tmpfolder}/test-dummy.c"
makefile="${tmpfolder}/Makefile"
cat >${testfile} <<EOF
#include <linux/module.h>
#include <linux/mm.h>
static int __init test_dummy_init(void)
{
struct vm_area_struct vma;
vm_flags_set(&vma, 0);
return 0;
}
static void __exit test_dummy_fini(void)
{
}
MODULE_AUTHOR("gpudirect@nvidia.com");
MODULE_LICENSE("MIT");
MODULE_VERSION("1.0");
module_init(test_dummy_init);
module_exit(test_dummy_fini);
EOF
cat >${makefile} <<EOF
obj-m := test-dummy.o
EOF
cd ${tmpfolder}
make -C ${kdir} M=${tmpfolder} modules > /dev/null 2>&1
ret=$?
rm -rf ${tmpfolder}
if [ "${ret}" -eq 0 ]; then
echo "y"
else
echo "n"
fi
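The gdrdrv Makefile later in this commit invokes this probe as test_gdrdrv_HAVE_VM_FLAGS_SET.sh; run by hand it simply prints y or n, for example:
# Probe whether the given kernel's headers provide vm_flags_set()
./test_gdrdrv_HAVE_VM_FLAGS_SET.sh -k "$(uname -r)"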

79
gdrcopy/src/Makefile Normal file
View File

@ -0,0 +1,79 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
LIB_MAJOR_VER ?= $(shell awk '/\#define GDR_API_MAJOR_VERSION/ { print $$3 }' ../include/gdrapi.h | tr -d '\n')
LIB_MINOR_VER ?= $(shell awk '/\#define GDR_API_MINOR_VERSION/ { print $$3 }' ../include/gdrapi.h | tr -d '\n')
GDRAPI_ARCH ?= $(shell ../config_arch)
GDRAPI_INC := ../include
CPPFLAGS := -I $(GDRAPI_INC) -I gdrdrv/ -D GDRAPI_ARCH=$(GDRAPI_ARCH)
LDFLAGS :=
COMMONCFLAGS := -O2
CFLAGS += $(COMMONCFLAGS)
CXXFLAGS += $(COMMONCFLAGS)
LIBS := -lpthread -ldl
LIB_VER:=$(LIB_MAJOR_VER).$(LIB_MINOR_VER)
LIB_BASENAME:=libgdrapi.so
LIB_DYNAMIC=$(LIB_BASENAME).$(LIB_VER)
LIB_SONAME=$(LIB_BASENAME).$(LIB_MAJOR_VER)
LIB:=$(LIB_DYNAMIC)
LIBSRCS := gdrapi.c
ifeq ($(GDRAPI_ARCH),X86)
LIBSRCS += memcpy_avx.c memcpy_sse.c memcpy_sse41.c
endif
LIBOBJS := $(LIBSRCS:.c=.o)
all: config lib
config:
@ echo "GDRAPI_ARCH=$(GDRAPI_ARCH)"
lib: $(LIB)
#static
#$(LIB): $(LIB)($(LIBOBJS))
#dynamic
$(LIBOBJS): CFLAGS+=-fPIC
$(LIB): $(LIBOBJS)
$(CC) -shared -Wl,-soname,$(LIB_SONAME) -o $@ $^
PATH=/sbin:/usr/sbin:$$PATH; ldconfig -n $(PWD)
ln -sf $(LIB_DYNAMIC) $(LIB_SONAME)
ln -sf $(LIB_SONAME) $(LIB_BASENAME)
# special-cased to finely tune the arch option
memcpy_avx.o: memcpy_avx.c
$(COMPILE.c) -mavx -o $@ $^
memcpy_sse.o: memcpy_sse.c
$(COMPILE.c) -msse -o $@ $^
memcpy_sse41.o: memcpy_sse41.c
$(COMPILE.c) -msse4.1 -o $@ $^
gdrapi.o: gdrapi.c $(GDRAPI_INC)/gdrapi.h gdrapi_internal.h gdrdrv/gdrdrv.h
clean:
rm -f *.o $(EXES) lib*.so* *~ core.*
.PHONY: clean all lib
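A minimal sketch of building just the user-space library from gdrcopy/src, leaving GDRAPI_ARCH to the config_arch autodetection:
make          # prints GDRAPI_ARCH, then builds libgdrapi.so.<major>.<minor> and its symlinks
make clean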

877
gdrcopy/src/gdrapi.c Normal file
View File

@ -0,0 +1,877 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdarg.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <netdb.h>
#include <malloc.h>
#include <getopt.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <time.h>
#include <asm/types.h>
#include <assert.h>
#include <sys/queue.h>
#include "gdrconfig.h"
#include "gdrapi.h"
#include "gdrdrv.h"
#include "gdrapi_internal.h"
// logging/tracing
enum gdrcopy_msg_level {
GDRCOPY_MSG_DEBUG = 1,
GDRCOPY_MSG_INFO,
GDRCOPY_MSG_WARN,
GDRCOPY_MSG_ERROR
};
static int gdr_msg_level = GDRCOPY_MSG_ERROR;
static int gdr_enable_logging = -1;
static void gdr_msg(enum gdrcopy_msg_level lvl, const char* fmt, ...)
{
if (-1 == gdr_enable_logging) {
const char *env = getenv("GDRCOPY_ENABLE_LOGGING");
if (env)
gdr_enable_logging = 1;
else
gdr_enable_logging = 0;
env = getenv("GDRCOPY_LOG_LEVEL");
if (env)
gdr_msg_level = atoi(env);
}
if (gdr_enable_logging) {
if (lvl >= gdr_msg_level) {
va_list ap;
va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
va_end(ap);
}
}
}
#define gdr_dbg(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_DEBUG, "DBG: " FMT, ## ARGS)
#define gdr_dbgc(C, FMT, ARGS...) do { static int gdr_dbg_cnt=(C); if (gdr_dbg_cnt) { gdr_dbg(FMT, ## ARGS); --gdr_dbg_cnt; }} while (0)
#define gdr_info(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_INFO, "INFO: " FMT, ## ARGS)
#define gdr_warn(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_WARN, "WARN: " FMT, ## ARGS)
#define gdr_err(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_ERROR, "ERR: " FMT, ## ARGS)
static gdr_memh_t *to_memh(gdr_mh_t mh) {
return (gdr_memh_t *)mh.h;
}
static gdr_mh_t from_memh(gdr_memh_t *memh) {
gdr_mh_t mh;
mh.h = (unsigned long)memh;
return mh;
}
static void gdr_init_cpu_flags(void);
static inline int gdr_is_mapped(const gdr_mapping_type_t mapping_type)
{
return mapping_type != GDR_MAPPING_TYPE_NONE;
}
gdr_t gdr_open(void)
{
gdr_t g = NULL;
const char *gdrinode = "/dev/gdrdrv";
int ret;
g = calloc(1, sizeof(*g));
if (!g) {
gdr_err("error while allocating memory\n");
return NULL;
}
int fd = open(gdrinode, O_RDWR | O_CLOEXEC);
if (-1 == fd ) {
ret = errno;
gdr_err("error opening driver (errno=%d/%s)\n", ret, strerror(ret));
goto err_mem;
}
struct GDRDRV_IOC_GET_VERSION_PARAMS params;
int retcode = ioctl(fd, GDRDRV_IOC_GET_VERSION, &params);
if (0 != retcode) {
ret = errno;
gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret);
goto err_fd;
}
if (params.gdrdrv_version < MINIMUM_GDRDRV_VERSION) {
gdr_err(
"The minimum required gdrdrv driver version is %d.%d but the current gdrdrv version is %d.%d\n",
MINIMUM_GDRDRV_MAJOR_VERSION,
MINIMUM_GDRDRV_MINOR_VERSION,
params.gdrdrv_version >> MAJOR_VERSION_SHIFT,
params.gdrdrv_version & MINOR_VERSION_MASK
);
goto err_fd;
}
if (params.minimum_gdr_api_version > GDR_API_VERSION) {
gdr_err(
"gdrdrv driver requires libgdrapi version %d.%d or above but the current libgdrapi version is %d.%d\n",
params.minimum_gdr_api_version >> MAJOR_VERSION_SHIFT,
params.minimum_gdr_api_version & MINOR_VERSION_MASK,
GDR_API_MAJOR_VERSION,
GDR_API_MINOR_VERSION
);
goto err_fd;
}
g->fd = fd;
LIST_INIT(&g->memhs);
gdr_init_cpu_flags();
// Initialize page_shift, page_size, and page_mask.
g->page_size = sysconf(_SC_PAGESIZE);
g->page_mask = ~(g->page_size - 1);
size_t ps_tmp = g->page_size;
g->page_shift = -1;
while (ps_tmp > 0) {
++g->page_shift;
if ((ps_tmp & 0x1) == 1)
break;
ps_tmp >>= 1;
}
g->gdrdrv_version = params.gdrdrv_version;
return g;
err_fd:
close(fd);
err_mem:
free(g);
return NULL;
}
int gdr_close(gdr_t g)
{
int ret = 0;
int retcode;
gdr_memh_t *mh, *next_mh;
mh = g->memhs.lh_first;
while (mh != NULL) {
// gdr_unpin_buffer frees mh, so we need to get the next one
// beforehand.
next_mh = mh->entries.le_next;
ret = gdr_unpin_buffer(g, from_memh(mh));
if (ret) {
gdr_err("error unpinning buffer inside gdr_close (errno=%d/%s)\n", ret, strerror(ret));
return ret;
}
mh = next_mh;
}
retcode = close(g->fd);
if (-1 == retcode) {
ret = errno;
gdr_err("error closing driver (errno=%d/%s)\n", ret, strerror(ret));
}
g->fd = 0;
free(g);
return ret;
}
int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle)
{
int ret = 0;
int retcode;
if (!handle) {
return EINVAL;
}
gdr_memh_t *mh = calloc(1, sizeof(gdr_memh_t));
if (!mh) {
return ENOMEM;
}
struct GDRDRV_IOC_PIN_BUFFER_PARAMS params;
params.addr = addr;
params.size = size;
params.p2p_token = p2p_token;
params.va_space = va_space;
params.handle = 0;
retcode = ioctl(g->fd, GDRDRV_IOC_PIN_BUFFER, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
free(mh);
goto err;
}
mh->handle = params.handle;
LIST_INSERT_HEAD(&g->memhs, mh, entries);
*handle = from_memh(mh);
err:
return ret;
}
int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_UNPIN_BUFFER, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
}
LIST_REMOVE(mh, entries);
free(mh);
return ret;
}
int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
struct GDRDRV_IOC_GET_CB_FLAG_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_CB_FLAG, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
} else {
*flag = params.flag;
}
return ret;
}
int gdr_get_info_v2(gdr_t g, gdr_mh_t handle, gdr_info_v2_t *info)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
if (g->gdrdrv_version >= GDRDRV_MINIMUM_VERSION_WITH_GET_INFO_V2) {
struct GDRDRV_IOC_GET_INFO_V2_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO_V2, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
goto out;
} else {
info->va = params.va;
info->mapped_size = params.mapped_size;
info->page_size = params.page_size;
info->tm_cycles = params.tm_cycles;
info->cycles_per_ms = params.tsc_khz;
info->mapped = gdr_is_mapped(params.mapping_type);
info->wc_mapping = (params.mapping_type == GDR_MAPPING_TYPE_WC);
info->mapping_type = params.mapping_type;
}
}
else
{
struct GDRDRV_IOC_GET_INFO_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
goto out;
} else {
info->va = params.va;
info->mapped_size = params.mapped_size;
info->page_size = params.page_size;
info->tm_cycles = params.tm_cycles;
info->cycles_per_ms = params.tsc_khz;
info->mapped = params.mapped;
info->wc_mapping = params.wc_mapping;
info->mapping_type = params.mapped ? (params.wc_mapping ? GDR_MAPPING_TYPE_WC : GDR_MAPPING_TYPE_CACHING) : GDR_MAPPING_TYPE_NONE;
}
}
out:
return ret;
}
int gdr_map(gdr_t g, gdr_mh_t handle, void **ptr_va, size_t size)
{
int ret = 0;
gdr_info_v2_t info = {0,};
gdr_memh_t *mh = to_memh(handle);
if (gdr_is_mapped(mh->mapping_type)) {
gdr_err("mh is mapped already\n");
return EAGAIN;
}
size_t rounded_size = (size + g->page_size - 1) & g->page_mask;
off_t magic_off = (off_t)mh->handle << g->page_shift;
void *mmio = mmap(NULL, rounded_size, PROT_READ|PROT_WRITE, MAP_SHARED, g->fd, magic_off);
if (mmio == MAP_FAILED) {
int __errno = errno;
mmio = NULL;
gdr_err("error %s(%d) while mapping handle %x, rounded_size=%zu offset=%llx\n",
strerror(__errno), __errno, mh->handle, rounded_size, (long long unsigned)magic_off);
ret = __errno;
goto err;
}
*ptr_va = mmio;
ret = gdr_get_info_v2(g, handle, &info);
if (ret) {
gdr_err("error %d from get_info, munmapping before exiting\n", ret);
munmap(mmio, rounded_size);
goto err;
}
if (!gdr_is_mapped(info.mapping_type)) {
// Race could cause this issue.
// E.g., gdr_map and cuMemFree are triggered concurrently.
// The above mmap is successful but cuMemFree causes unmapping immediately.
gdr_err("mh is not mapped\n");
ret = EAGAIN;
}
mh->mapping_type = info.mapping_type;
gdr_dbg("mapping_type=%d\n", mh->mapping_type);
err:
return ret;
}
int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size)
{
int ret = 0;
int retcode = 0;
size_t rounded_size;
gdr_memh_t *mh = to_memh(handle);
rounded_size = (size + g->page_size - 1) & g->page_mask;
if (!gdr_is_mapped(mh->mapping_type)) {
gdr_err("mh is not mapped yet\n");
return EINVAL;
}
retcode = munmap(va, rounded_size);
if (-1 == retcode) {
int __errno = errno;
gdr_err("error %s(%d) while unmapping handle %x, rounded_size=%zu\n",
strerror(__errno), __errno, mh->handle, rounded_size);
ret = __errno;
goto err;
}
mh->mapping_type = GDR_MAPPING_TYPE_NONE;
err:
return ret;
}
#ifdef GDRAPI_X86
#include <cpuid.h>
// prepare for AVX2 implementation
#ifndef bit_AVX2
/* Extended Features (%eax == 7) */
/* %ebx */
#define bit_AVX2 (1 << 5)
#endif
#include <immintrin.h>
extern int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes);
static inline void wc_store_fence(void) { _mm_sfence(); }
#define PREFERS_STORE_UNROLL4 0
#define PREFERS_STORE_UNROLL8 0
#define PREFERS_LOAD_UNROLL4 0
#define PREFERS_LOAD_UNROLL8 0
// GDRAPI_X86
#elif defined(GDRAPI_POWER)
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
static inline void wc_store_fence(void) { asm volatile("sync") ; }
#define PREFERS_STORE_UNROLL4 1
#define PREFERS_STORE_UNROLL8 0
#define PREFERS_LOAD_UNROLL4 0
#define PREFERS_LOAD_UNROLL8 1
// GDRAPI_POWER
#elif defined(GDRAPI_ARM64)
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
static inline void wc_store_fence(void) { asm volatile("DMB ishld") ; }
#define PREFERS_STORE_UNROLL4 0
#define PREFERS_STORE_UNROLL8 0
#define PREFERS_LOAD_UNROLL4 0
#define PREFERS_LOAD_UNROLL8 0
// GDRAPI_ARM64
#endif
static int has_sse = 0;
static int has_sse2 = 0;
static int has_sse4_1 = 0;
static int has_avx = 0;
static int has_avx2 = 0;
static void gdr_init_cpu_flags(void)
{
#ifdef GDRAPI_X86
unsigned int info_type = 0x00000001;
unsigned int ax, bx, cx, dx;
if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
has_sse4_1 = ((cx & bit_SSE4_1) != 0);
has_avx = ((cx & bit_AVX) != 0);
has_sse = ((dx & bit_SSE) != 0);
has_sse2 = ((dx & bit_SSE2) != 0);
gdr_dbg("sse4_1=%d avx=%d sse=%d sse2=%d\n", has_sse4_1, has_avx, has_sse, has_sse2);
}
#ifdef bit_AVX2
info_type = 0x7;
if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
has_avx2 = bx & bit_AVX2;
}
#endif // bit_AVX2
#endif // GDRAPI_X86
#ifdef GDRAPI_POWER
// detect and enable Altivec/SMX support
#endif
}
// note: more than one implementation may be compiled in
static void unroll8_memcpy(void *dst, const void *src, size_t size)
{
const uint64_t *r = (const uint64_t *)src;
uint64_t *w = (uint64_t *)dst;
size_t nw = size / sizeof(*r);
assert(size % sizeof(*r) == 0);
while (nw) {
if (0 == (nw & 3)) {
uint64_t r0 = r[0];
uint64_t r1 = r[1];
uint64_t r2 = r[2];
uint64_t r3 = r[3];
w[0] = r0;
w[1] = r1;
w[2] = r2;
w[3] = r3;
r += 4;
w += 4;
nw -= 4;
} else if (0 == (nw & 1)) {
uint64_t r0 = r[0];
uint64_t r1 = r[1];
w[0] = r0;
w[1] = r1;
r += 2;
w += 2;
nw -= 2;
} else {
w[0] = r[0];
++w;
++r;
--nw;
}
}
}
static void unroll4_memcpy(void *dst, const void *src, size_t size)
{
const uint32_t *r = (const uint32_t *)src;
uint32_t *w = (uint32_t *)dst;
size_t nw = size / sizeof(*r);
assert(size % sizeof(*r) == 0);
while (nw) {
if (0 == (nw & 3)) {
uint32_t r0 = r[0];
uint32_t r1 = r[1];
uint32_t r2 = r[2];
uint32_t r3 = r[3];
w[0] = r0;
w[1] = r1;
w[2] = r2;
w[3] = r3;
r += 4;
w += 4;
nw -= 4;
} else if (0 == (nw & 1)) {
uint32_t r0 = r[0];
uint32_t r1 = r[1];
w[0] = r0;
w[1] = r1;
r += 2;
w += 2;
nw -= 2;
} else {
w[0] = r[0];
++w;
++r;
--nw;
}
}
}
static inline int is_aligned(unsigned long value, unsigned powof2)
{
return ((value & (powof2-1)) == 0);
}
static inline int ptr_is_aligned(const void *ptr, unsigned powof2)
{
unsigned long addr = (unsigned long)ptr;
return is_aligned(addr, powof2);
}
static inline void memcpy_to_device_mapping(void *dst, const void *src, size_t size)
{
size_t remaining_size = size;
void *curr_map_d_ptr = dst;
const void *curr_h_ptr = src;
size_t copy_size = 0;
while (remaining_size > 0) {
if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) {
// We have proper alignment. memcpy can be used here. Although
// unlikely, this might break in the future if the implementation
// of memcpy changes to generate unaligned access. Still, we choose
// memcpy because it provides better performance than our simple
// aligned-access workaround.
memcpy(curr_map_d_ptr, curr_h_ptr, remaining_size);
copy_size = remaining_size;
}
else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) {
// memcpy cannot be used here because its internal
// implementation may end up in an unaligned access.
WRITE_ONCE(*(uint64_t *)curr_map_d_ptr, *(uint64_t *)curr_h_ptr);
copy_size = sizeof(uint64_t);
}
else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) {
WRITE_ONCE(*(uint32_t *)curr_map_d_ptr, *(uint32_t *)curr_h_ptr);
copy_size = sizeof(uint32_t);
}
else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) {
WRITE_ONCE(*(uint16_t *)curr_map_d_ptr, *(uint16_t *)curr_h_ptr);
copy_size = sizeof(uint16_t);
}
else {
WRITE_ONCE(*(uint8_t *)curr_map_d_ptr, *(uint8_t *)curr_h_ptr);
copy_size = sizeof(uint8_t);
}
remaining_size -= copy_size;
curr_map_d_ptr = (void *)((uintptr_t)curr_map_d_ptr + copy_size);
curr_h_ptr = (const void *)((uintptr_t)curr_h_ptr + copy_size);
}
}
static inline void memcpy_from_device_mapping(void *dst, const void *src, size_t size)
{
size_t remaining_size = size;
const void *curr_map_d_ptr = src;
void *curr_h_ptr = dst;
size_t copy_size = 0;
while (remaining_size > 0) {
if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) {
// We have proper alignment. memcpy can be used here. Although
// unlikely, this might break in the future if the implementation
// of memcpy changes to generate unaligned access. Still, we choose
// memcpy because it provides better performance than our simple
// aligned-access workaround.
memcpy(curr_h_ptr, curr_map_d_ptr, remaining_size);
copy_size = remaining_size;
}
else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) {
// memcpy cannot be used here because its internal
// implementation may end up in an unaligned access.
*(uint64_t *)curr_h_ptr = READ_ONCE(*(uint64_t *)curr_map_d_ptr);
copy_size = sizeof(uint64_t);
}
else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) {
*(uint32_t *)curr_h_ptr = READ_ONCE(*(uint32_t *)curr_map_d_ptr);
copy_size = sizeof(uint32_t);
}
else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) {
*(uint16_t *)curr_h_ptr = READ_ONCE(*(uint16_t *)curr_map_d_ptr);
copy_size = sizeof(uint16_t);
}
else {
*(uint8_t *)curr_h_ptr = READ_ONCE(*(uint8_t *)curr_map_d_ptr);
copy_size = sizeof(uint8_t);
}
remaining_size -= copy_size;
curr_map_d_ptr = (const void *)((uintptr_t)curr_map_d_ptr + copy_size);
curr_h_ptr = (void *)((uintptr_t)curr_h_ptr + copy_size);
}
}
static int gdr_copy_to_mapping_internal(void *map_d_ptr, const void *h_ptr, size_t size, gdr_mapping_type_t mapping_type)
{
const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC);
const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE);
do {
// For very small sizes and aligned pointers, we use simple store.
if (size == sizeof(uint8_t)) {
WRITE_ONCE(*(uint8_t *)map_d_ptr, *(uint8_t *)h_ptr);
goto do_fence;
} else if (size == sizeof(uint16_t) && ptr_is_aligned(map_d_ptr, sizeof(uint16_t))) {
WRITE_ONCE(*(uint16_t *)map_d_ptr, *(uint16_t *)h_ptr);
goto do_fence;
} else if (size == sizeof(uint32_t) && ptr_is_aligned(map_d_ptr, sizeof(uint32_t))) {
WRITE_ONCE(*(uint32_t *)map_d_ptr, *(uint32_t *)h_ptr);
goto do_fence;
} else if (size == sizeof(uint64_t) && ptr_is_aligned(map_d_ptr, sizeof(uint64_t))) {
WRITE_ONCE(*(uint64_t *)map_d_ptr, *(uint64_t *)h_ptr);
goto do_fence;
}
// pick the most performing implementation compatible with the platform we are running on
// NOTE: write fences are included in functions below
if (has_avx) {
assert(wc_mapping);
gdr_dbgc(1, "using AVX implementation of gdr_copy_to_mapping\n");
memcpy_uncached_store_avx(map_d_ptr, h_ptr, size);
goto out;
}
if (has_sse) {
assert(wc_mapping);
gdr_dbgc(1, "using SSE implementation of gdr_copy_to_mapping\n");
memcpy_uncached_store_sse(map_d_ptr, h_ptr, size);
goto out;
}
// on POWER, compiler/libc memcpy is not optimal for MMIO
// 64bit stores are not better than 32bit ones, so we prefer the latter.
// NOTE: if preferred but not aligned, a better implementation would still try to
// use byte sized stores to align map_d_ptr and h_ptr to next word.
// NOTE2: unroll*_memcpy and memcpy do not include fencing.
if (wc_mapping && PREFERS_STORE_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) {
gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_to_mapping\n");
unroll8_memcpy(map_d_ptr, h_ptr, size);
} else if (wc_mapping && PREFERS_STORE_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) {
gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_to_mapping\n");
unroll4_memcpy(map_d_ptr, h_ptr, size);
} else if (device_mapping) {
gdr_dbgc(1, "using device-mapping copy for gdr_copy_to_mapping with device mapping\n");
memcpy_to_device_mapping(map_d_ptr, h_ptr, size);
} else {
gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_to_mapping\n");
memcpy(map_d_ptr, h_ptr, size);
}
} while (0);
do_fence:
if (wc_mapping) {
// fencing is needed even for plain memcpy(), because performance
// would otherwise suffer from delayed flushing of the WC buffers
wc_store_fence();
}
out:
return 0;
}
static int gdr_copy_from_mapping_internal(void *h_ptr, const void *map_d_ptr, size_t size, gdr_mapping_type_t mapping_type)
{
const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC);
const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE);
do {
// pick the most performing implementation compatible with the platform we are running on
if (has_sse4_1) {
assert(wc_mapping);
gdr_dbgc(1, "using SSE4_1 implementation of gdr_copy_from_mapping\n");
memcpy_uncached_load_sse41(h_ptr, map_d_ptr, size);
break;
}
if (has_avx) {
assert(wc_mapping);
gdr_dbgc(1, "using AVX implementation of gdr_copy_from_mapping\n");
memcpy_cached_store_avx(h_ptr, map_d_ptr, size);
break;
}
if (has_sse) {
assert(wc_mapping);
gdr_dbgc(1, "using SSE implementation of gdr_copy_from_mapping\n");
memcpy_cached_store_sse(h_ptr, map_d_ptr, size);
break;
}
// on POWER, compiler memcpy is not optimal for MMIO
// 64bit loads have 2x the BW of 32bit ones
if (wc_mapping && PREFERS_LOAD_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) {
gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_from_mapping\n");
unroll8_memcpy(h_ptr, map_d_ptr, size);
} else if (wc_mapping && PREFERS_LOAD_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) {
gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_from_mapping\n");
unroll4_memcpy(h_ptr, map_d_ptr, size);
} else if (device_mapping) {
gdr_dbgc(1, "using device-mapping copy for gdr_copy_from_mapping\n");
memcpy_from_device_mapping(h_ptr, map_d_ptr, size);
} else {
gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_from_mapping\n");
memcpy(h_ptr, map_d_ptr, size);
}
// note: fencing is not needed because plain stores are used
// if non-temporal/uncached stores were used on x86, a proper fence would be needed instead
// if (wc_mapping)
// wc_store_fence();
} while (0);
return 0;
}
int gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size)
{
gdr_memh_t *mh = to_memh(handle);
if (unlikely(!gdr_is_mapped(mh->mapping_type))) {
gdr_err("mh is not mapped yet\n");
return EINVAL;
}
if (unlikely(size == 0))
return 0;
return gdr_copy_to_mapping_internal(map_d_ptr, h_ptr, size, mh->mapping_type);
}
int gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size)
{
gdr_memh_t *mh = to_memh(handle);
if (unlikely(!gdr_is_mapped(mh->mapping_type))) {
gdr_err("mh is not mapped yet\n");
return EINVAL;
}
if (unlikely(size == 0))
return 0;
return gdr_copy_from_mapping_internal(h_ptr, map_d_ptr, size, mh->mapping_type);
}
void gdr_runtime_get_version(int *major, int *minor)
{
*major = GDR_API_MAJOR_VERSION;
*minor = GDR_API_MINOR_VERSION;
}
int gdr_driver_get_version(gdr_t g, int *major, int *minor)
{
assert(g != NULL);
assert(g->fd > 0);
struct GDRDRV_IOC_GET_VERSION_PARAMS params;
int retcode = ioctl(g->fd, GDRDRV_IOC_GET_VERSION, &params);
if (0 != retcode) {
int ret = errno;
gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret);
return ret;
}
*major = params.gdrdrv_version >> MAJOR_VERSION_SHIFT;
*minor = params.gdrdrv_version & MINOR_VERSION_MASK;
return 0;
}
// ==============================================================================
// Obsoleted API. Provided for compatibility only.
// ==============================================================================
#ifdef gdr_get_info
#undef gdr_get_info
#endif
typedef struct gdr_info_v1 {
uint64_t va;
uint64_t mapped_size;
uint32_t page_size;
// tm_cycles and cycles_per_ms are deprecated and will be removed in future.
uint64_t tm_cycles;
uint32_t cycles_per_ms;
unsigned mapped:1;
unsigned wc_mapping:1;
} gdr_info_v1_t;
int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_v1_t *info)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
struct GDRDRV_IOC_GET_INFO_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
goto out;
} else {
info->va = params.va;
info->mapped_size = params.mapped_size;
info->page_size = params.page_size;
info->tm_cycles = params.tm_cycles;
info->cycles_per_ms = params.tsc_khz;
info->mapped = params.mapped;
info->wc_mapping = params.wc_mapping;
}
out:
return ret;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/
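The logging at the top of this file is driven purely by environment variables; a hedged example of enabling debug output for one of the test binaries shipped by the packaging above (gdrcopy_copybw):
# GDRCOPY_MSG_DEBUG is level 1, so this prints every message gdr_msg() emits
GDRCOPY_ENABLE_LOGGING=1 GDRCOPY_LOG_LEVEL=1 gdrcopy_copybw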

View File

@ -0,0 +1,74 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __GDRAPI_INTERNAL_H__
#define __GDRAPI_INTERNAL_H__
#include <stdint.h> // for standard [u]intX_t types
#include <stddef.h>
#include <sys/queue.h>
#include "gdrapi.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifndef unlikely
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (x)
#endif
#endif
#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof((x)) *)&(x))
#endif
#ifndef READ_ONCE
#define READ_ONCE(x) ACCESS_ONCE(x)
#endif
#ifndef WRITE_ONCE
#define WRITE_ONCE(x, v) (ACCESS_ONCE(x) = (v))
#endif
typedef struct gdr_memh_t {
uint32_t handle;
LIST_ENTRY(gdr_memh_t) entries;
gdr_mapping_type_t mapping_type;
} gdr_memh_t;
struct gdr {
int fd;
LIST_HEAD(memh_list, gdr_memh_t) memhs;
size_t page_size;
size_t page_mask;
uint8_t page_shift;
uint32_t gdrdrv_version;
};
#ifdef __cplusplus
}
#endif
#endif // __GDRAPI_INTERNAL_H__

View File

@ -0,0 +1,77 @@
# Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
ifneq ($(KERNELRELEASE),)
kver_major:=$(shell echo $(KERNELRELEASE) | awk -F '.' '// { print $$2;}' )
obj-m := nv-p2p-dummy.o
obj-m += gdrdrv.o
ccflags-y += -I$(NVIDIA_SRC_DIR)
ifeq ($(NVIDIA_IS_OPENSOURCE),y)
ccflags-y += -DGDRDRV_OPENSOURCE_NVIDIA
endif
ifeq ($(HAVE_VM_FLAGS_SET),y)
ccflags-y += -DGDRDRV_HAVE_VM_FLAGS_SET
endif
else
KVER ?= $(shell uname -r)
MODULES_DIR := /lib/modules/$(KVER)
KDIR := $(MODULES_DIR)/build
MODULE_SUBDIR ?= /kernel/drivers/misc/
MODULE_DESTDIR := $(MODULES_DIR)/$(MODULE_SUBDIR)
DEPMOD := /sbin/depmod
export NVIDIA_SRC_DIR ?= $(shell { find /usr/src/kernel-modules/nvidia-* /usr/src/nvidia-* -name "nv-p2p.c" -print -quit | xargs dirname || echo "NVIDIA_DRIVER_MISSING"; } 2>/dev/null)
export NVIDIA_IS_OPENSOURCE ?= $(shell grep -r "MODULE_LICENSE" $(NVIDIA_SRC_DIR)/ | grep -s -q "GPL" && echo "y")
CONF_SCRIPT_DIR ?= $(PWD)/../../scripts
export HAVE_VM_FLAGS_SET ?= $(shell $(CONF_SCRIPT_DIR)/test_gdrdrv_HAVE_VM_FLAGS_SET.sh -k $(KVER))
all: build
build:
@ echo "Picking NVIDIA driver sources from NVIDIA_SRC_DIR=$(NVIDIA_SRC_DIR). If that does not meet your expectation, you might have a stale driver still around and that might cause problems."
@ echo "Setting NVIDIA_IS_OPENSOURCE=$(NVIDIA_IS_OPENSOURCE)"
@ echo "Setting HAVE_VM_FLAGS_SET=$(HAVE_VM_FLAGS_SET)"
@ $(MAKE) -C $(KDIR) $(MAKE_PARAMS) M=$(PWD) modules
install: build
[ -d $(DESTDIR)/$(MODULE_DESTDIR) ] || mkdir -p $(DESTDIR)/$(MODULE_DESTDIR)
cp gdrdrv.ko $(DESTDIR)/$(MODULE_DESTDIR)
if [ ! -n "$(DESTDIR)" ]; then $(DEPMOD) -r -ae $(KVER); fi
help:
$(MAKE) -C $(KDIR) M=$$PWD help
clean:
rm -rf *.o .*.o.d *.ko* *.mod.* .*.cmd Module.symvers modules.order .tmp_versions/ *~ core .depend TAGS .cache.mk *.mod
TAGS:
find $(KERNELDIR) -follow -name \*.h -o -name \*.c |xargs etags
.PHONY: clean all help install default linksyms nvidia_src_dir build
endif
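An illustrative manual build and install of the kernel module against the running kernel; NVIDIA_SRC_DIR and HAVE_VM_FLAGS_SET fall back to the autodetection above, and root privileges are assumed for the install step:
make KVER="$(uname -r)"
make install    # copies gdrdrv.ko into /lib/modules/<KVER>/kernel/drivers/misc/ and runs depmod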

1509
gdrcopy/src/gdrdrv/gdrdrv.c Normal file

File diff suppressed because it is too large.

138
gdrcopy/src/gdrdrv/gdrdrv.h Normal file
View File

@ -0,0 +1,138 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __GDR_DRV_H__
#define __GDR_DRV_H__
#define GDRDRV_STRINGIFY(s) #s
#define GDRDRV_TOSTRING(s) GDRDRV_STRINGIFY(s)
#define GDRDRV_MAJOR_VERSION_SHIFT 16
#define GDRDRV_MAJOR_VERSION 2
#define GDRDRV_MINOR_VERSION 4
#define GDRDRV_VERSION ((GDRDRV_MAJOR_VERSION << GDRDRV_MAJOR_VERSION_SHIFT) | GDRDRV_MINOR_VERSION)
#define GDRDRV_VERSION_STRING GDRDRV_TOSTRING(GDRDRV_MAJOR_VERSION) "." GDRDRV_TOSTRING(GDRDRV_MINOR_VERSION)
#define MINIMUM_GDR_API_MAJOR_VERSION 2
#define MINIMUM_GDR_API_MINOR_VERSION 0
#define MINIMUM_GDR_API_VERSION ((MINIMUM_GDR_API_MAJOR_VERSION << 16) | MINIMUM_GDR_API_MINOR_VERSION)
#define GDRDRV_MINIMUM_VERSION_WITH_GET_INFO_V2 ((2 << GDRDRV_MAJOR_VERSION_SHIFT) | 4)
#define GDRDRV_IOCTL 0xDA
typedef enum {
GDR_MR_NONE = 0,
GDR_MR_WC = 1,
GDR_MR_CACHING = 2,
GDR_MR_DEVICE = 3
} gdr_mr_type_t;
typedef __u64 gdr_hnd_t;
//-----------
struct GDRDRV_IOC_PIN_BUFFER_PARAMS
{
// in
__u64 addr;
__u64 size;
__u64 p2p_token;
__u32 va_space;
// out
gdr_hnd_t handle;
};
#define GDRDRV_IOC_PIN_BUFFER _IOWR(GDRDRV_IOCTL, 1, struct GDRDRV_IOC_PIN_BUFFER_PARAMS)
//-----------
struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS
{
// in
gdr_hnd_t handle;
};
#define GDRDRV_IOC_UNPIN_BUFFER _IOWR(GDRDRV_IOCTL, 2, struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS *)
//-----------
struct GDRDRV_IOC_GET_CB_FLAG_PARAMS
{
// in
gdr_hnd_t handle;
// out
__u32 flag;
};
#define GDRDRV_IOC_GET_CB_FLAG _IOWR(GDRDRV_IOCTL, 3, struct GDRDRV_IOC_GET_CB_FLAG_PARAMS *)
//-----------
struct GDRDRV_IOC_GET_INFO_PARAMS
{
// in
gdr_hnd_t handle;
// out
__u64 va;
__u64 mapped_size;
__u32 page_size;
__u32 tsc_khz;
__u64 tm_cycles;
__u32 mapped;
__u32 wc_mapping;
};
#define GDRDRV_IOC_GET_INFO _IOWR(GDRDRV_IOCTL, 4, struct GDRDRV_IOC_GET_INFO_PARAMS *)
//-----------
struct GDRDRV_IOC_GET_INFO_V2_PARAMS
{
// in
gdr_hnd_t handle;
// out
__u64 va;
__u64 mapped_size;
__u32 page_size;
__u32 tsc_khz;
__u64 tm_cycles;
__u32 mapping_type;
};
#define GDRDRV_IOC_GET_INFO_V2 _IOWR(GDRDRV_IOCTL, 5, struct GDRDRV_IOC_GET_INFO_V2_PARAMS *)
//-----------
struct GDRDRV_IOC_GET_VERSION_PARAMS
{
// out
__u32 gdrdrv_version;
__u32 minimum_gdr_api_version;
};
#define GDRDRV_IOC_GET_VERSION _IOWR(GDRDRV_IOCTL, 255, struct GDRDRV_IOC_GET_VERSION_PARAMS *)
//-----------
#endif // __GDR_DRV_H__

View File

@ -0,0 +1,138 @@
/*
* Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
* Warning: this kernel module is only needed at compile time.
*
* The long story is that this module exists only to produce the correct
* module symbol versions for the very kernel against which the other module
* (the interesting one) is going to be compiled. In other words, this module
* produces the same symbol versions as the real NVIDIA kernel-mode driver.
*
* Downside: the function signatures must be kept up to date.
*/
#include <linux/version.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/delay.h>
#include <linux/compiler.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/io.h>
#include "nv-p2p.h"
MODULE_AUTHOR("drossetti@nvidia.com");
MODULE_LICENSE("MIT");
MODULE_DESCRIPTION("P2P dummy kernel-mode driver");
MODULE_VERSION("1.0");
int nvidia_p2p_init_mapping(uint64_t p2p_token,
struct nvidia_p2p_params *params,
void (*destroy_callback)(void *data),
void *data)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_init_mapping);
int nvidia_p2p_destroy_mapping(uint64_t p2p_token)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_destroy_mapping);
int nvidia_p2p_get_pages(uint64_t p2p_token, uint32_t va_space,
uint64_t virtual_address,
uint64_t length,
struct nvidia_p2p_page_table **page_table,
void (*free_callback)(void *data),
void *data)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_get_pages);
int nvidia_p2p_put_pages(uint64_t p2p_token, uint32_t va_space,
uint64_t virtual_address,
struct nvidia_p2p_page_table *page_table)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_put_pages);
int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_free_page_table);
#ifdef NVIDIA_P2P_CAP_PERSISTENT_PAGES
int nvidia_p2p_cap_persistent_pages;
EXPORT_SYMBOL(nvidia_p2p_cap_persistent_pages);
#endif
#ifdef NVIDIA_P2P_CAP_GET_PAGES_PERSISTENT_API
int nvidia_p2p_get_pages_persistent(uint64_t virtual_address,
uint64_t length,
struct nvidia_p2p_page_table **page_table,
uint32_t flags)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);
int nvidia_p2p_put_pages_persistent(uint64_t virtual_address,
struct nvidia_p2p_page_table *page_table,
uint32_t flags)
{
return -EINVAL;
}
EXPORT_SYMBOL(nvidia_p2p_put_pages_persistent);
#endif
static int __init nv_p2p_dummy_init(void)
{
return 0;
}
static void __exit nv_p2p_dummy_cleanup(void)
{
}
module_init(nv_p2p_dummy_init);
module_exit(nv_p2p_dummy_cleanup);
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

207
gdrcopy/src/memcpy_avx.c Normal file
View File

@ -0,0 +1,207 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef min
#define min(A,B) ((A)<(B)?(A):(B))
#endif
int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __AVX__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align dest to 256-bits
if (d_int & 0x1f) {
size_t nh = min(0x20 - (d_int & 0x1f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (s_int & 0x1f) { // src is not aligned to 256-bits
__m256d r0,r1,r2,r3;
// unroll 4
while (n >= 4*sizeof(__m256d)) {
r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
_mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
_mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
_mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
_mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
s += 4*sizeof(__m256d);
d += 4*sizeof(__m256d);
n -= 4*sizeof(__m256d);
}
while (n >= sizeof(__m256d)) {
r0 = _mm256_loadu_pd((double *)(s));
_mm256_stream_pd((double *)(d), r0);
s += sizeof(__m256d);
d += sizeof(__m256d);
n -= sizeof(__m256d);
}
} else { // or it IS aligned
__m256d r0,r1,r2,r3,r4,r5,r6,r7;
// unroll 8
while (n >= 8*sizeof(__m256d)) {
r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
r4 = _mm256_load_pd((double *)(s+4*sizeof(__m256d)));
r5 = _mm256_load_pd((double *)(s+5*sizeof(__m256d)));
r6 = _mm256_load_pd((double *)(s+6*sizeof(__m256d)));
r7 = _mm256_load_pd((double *)(s+7*sizeof(__m256d)));
_mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
_mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
_mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
_mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
_mm256_stream_pd((double *)(d+4*sizeof(__m256d)), r4);
_mm256_stream_pd((double *)(d+5*sizeof(__m256d)), r5);
_mm256_stream_pd((double *)(d+6*sizeof(__m256d)), r6);
_mm256_stream_pd((double *)(d+7*sizeof(__m256d)), r7);
s += 8*sizeof(__m256d);
d += 8*sizeof(__m256d);
n -= 8*sizeof(__m256d);
}
while (n >= sizeof(__m256d)) {
r0 = _mm256_load_pd((double *)(s));
_mm256_stream_pd((double *)(d), r0);
s += sizeof(__m256d);
d += sizeof(__m256d);
n -= sizeof(__m256d);
}
}
if (n)
memcpy(d, s, n);
// fencing is needed even for plain memcpy(): otherwise, delayed flushing
// of the WC buffers hurts performance
_mm_sfence();
#else
#error "this file should be compiled with -mavx"
#endif
return ret;
}
int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __AVX__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align dest to 256-bits
if (d_int & 0x1f) {
size_t nh = min(0x20 - (d_int & 0x1f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (s_int & 0x1f) { // src is not aligned to 256-bits
__m256d r0,r1,r2,r3;
// unroll 4
while (n >= 4*sizeof(__m256d)) {
r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
_mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
_mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
_mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
_mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
s += 4*sizeof(__m256d);
d += 4*sizeof(__m256d);
n -= 4*sizeof(__m256d);
}
while (n >= sizeof(__m256d)) {
r0 = _mm256_loadu_pd((double *)(s));
_mm256_store_pd((double *)(d), r0);
s += sizeof(__m256d);
d += sizeof(__m256d);
n -= sizeof(__m256d);
}
} else { // or it IS aligned
__m256d r0,r1,r2,r3;
// unroll 4
while (n >= 4*sizeof(__m256d)) {
r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
_mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
_mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
_mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
_mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
s += 4*sizeof(__m256d);
d += 4*sizeof(__m256d);
n -= 4*sizeof(__m256d);
}
while (n >= sizeof(__m256d)) {
r0 = _mm256_load_pd((double *)(s));
_mm256_store_pd((double *)(d), r0);
s += sizeof(__m256d);
d += sizeof(__m256d);
n -= sizeof(__m256d);
}
}
if (n)
memcpy(d, s, n);
// fencing is needed because of the use of non-temporal stores
_mm_sfence();
#else
#error "this file should be compiled with -mavx"
#endif
return ret;
}
// add variant for _mm256_stream_load_si256() / VMOVNTDQA
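/*
 * A minimal sketch of such a variant (an illustration, not part of the original
 * sources; the function name is made up): VMOVNTDQA on 256-bit registers needs
 * AVX2 and a 32-byte aligned source, so the sketch falls back to memcpy()
 * otherwise and handles any tail bytes the same way.
 */
#ifdef __AVX2__
int memcpy_uncached_load_avx2(void *dest, const void *src, size_t n_bytes)
{
    char *d = (char *)dest;
    const char *s = (const char *)src;
    size_t n = n_bytes;
    // stream loads require a 32-byte aligned source
    if (((uintptr_t)s) & 0x1f) {
        memcpy(d, s, n);
        return 0;
    }
    __m256i r0;
    while (n >= sizeof(__m256i)) {
        r0 = _mm256_stream_load_si256((__m256i *)s);
        _mm256_storeu_si256((__m256i *)d, r0);
        s += sizeof(__m256i);
        d += sizeof(__m256i);
        n -= sizeof(__m256i);
    }
    if (n)
        memcpy(d, s, n);
    // only regular stores are used on the destination, so no sfence is needed here
    return 0;
}
#endif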
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

198
gdrcopy/src/memcpy_sse.c Normal file
View File

@ -0,0 +1,198 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef min
#define min(A,B) ((A)<(B)?(A):(B))
#endif
int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __SSE__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align dest to 128-bits
if (d_int & 0xf) {
size_t nh = min(0x10 - (d_int & 0x0f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (s_int & 0xf) { // src is not aligned to 128-bits
__m128 r0,r1,r2,r3;
// unroll 4
while (n >= 4*4*sizeof(float)) {
r0 = _mm_loadu_ps((float *)(s+0*4*sizeof(float)));
r1 = _mm_loadu_ps((float *)(s+1*4*sizeof(float)));
r2 = _mm_loadu_ps((float *)(s+2*4*sizeof(float)));
r3 = _mm_loadu_ps((float *)(s+3*4*sizeof(float)));
_mm_stream_ps((float *)(d+0*4*sizeof(float)), r0);
_mm_stream_ps((float *)(d+1*4*sizeof(float)), r1);
_mm_stream_ps((float *)(d+2*4*sizeof(float)), r2);
_mm_stream_ps((float *)(d+3*4*sizeof(float)), r3);
s += 4*4*sizeof(float);
d += 4*4*sizeof(float);
n -= 4*4*sizeof(float);
}
while (n >= 4*sizeof(float)) {
r0 = _mm_loadu_ps((float *)(s));
_mm_stream_ps((float *)(d), r0);
s += 4*sizeof(float);
d += 4*sizeof(float);
n -= 4*sizeof(float);
}
} else { // or it IS aligned
__m128 r0,r1,r2,r3;
// unroll 4
while (n >= 4*4*sizeof(float)) {
r0 = _mm_load_ps((float *)(s+0*4*sizeof(float)));
r1 = _mm_load_ps((float *)(s+1*4*sizeof(float)));
r2 = _mm_load_ps((float *)(s+2*4*sizeof(float)));
r3 = _mm_load_ps((float *)(s+3*4*sizeof(float)));
_mm_stream_ps((float *)(d+0*4*sizeof(float)), r0);
_mm_stream_ps((float *)(d+1*4*sizeof(float)), r1);
_mm_stream_ps((float *)(d+2*4*sizeof(float)), r2);
_mm_stream_ps((float *)(d+3*4*sizeof(float)), r3);
s += 4*4*sizeof(float);
d += 4*4*sizeof(float);
n -= 4*4*sizeof(float);
}
while (n >= 4*sizeof(float)) {
r0 = _mm_load_ps((float *)(s));
_mm_stream_ps((float *)(d), r0);
s += 4*sizeof(float);
d += 4*sizeof(float);
n -= 4*sizeof(float);
}
}
if (n)
memcpy(d, s, n);
// fencing is needed even for plain memcpy(): otherwise, delayed flushing
// of the WC buffers hurts performance
_mm_sfence();
#else
#error "this file should be compiled with -msse"
#endif
return ret;
}
int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __SSE__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align dest to 128-bits
if (d_int & 0xf) {
size_t nh = min(0x10 - (d_int & 0x0f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (s_int & 0xf) { // src is not aligned to 128-bits
__m128 r0,r1,r2,r3;
// unroll 4
while (n >= 4*4*sizeof(float)) {
r0 = _mm_loadu_ps((float *)(s+0*4*sizeof(float)));
r1 = _mm_loadu_ps((float *)(s+1*4*sizeof(float)));
r2 = _mm_loadu_ps((float *)(s+2*4*sizeof(float)));
r3 = _mm_loadu_ps((float *)(s+3*4*sizeof(float)));
_mm_store_ps((float *)(d+0*4*sizeof(float)), r0);
_mm_store_ps((float *)(d+1*4*sizeof(float)), r1);
_mm_store_ps((float *)(d+2*4*sizeof(float)), r2);
_mm_store_ps((float *)(d+3*4*sizeof(float)), r3);
s += 4*4*sizeof(float);
d += 4*4*sizeof(float);
n -= 4*4*sizeof(float);
}
while (n >= 4*sizeof(float)) {
r0 = _mm_loadu_ps((float *)(s));
_mm_store_ps((float *)(d), r0);
s += 4*sizeof(float);
d += 4*sizeof(float);
n -= 4*sizeof(float);
}
} else { // or it IS aligned
__m128 r0,r1,r2,r3;
// unroll 4
while (n >= 4*4*sizeof(float)) {
r0 = _mm_load_ps((float *)(s+0*4*sizeof(float)));
r1 = _mm_load_ps((float *)(s+1*4*sizeof(float)));
r2 = _mm_load_ps((float *)(s+2*4*sizeof(float)));
r3 = _mm_load_ps((float *)(s+3*4*sizeof(float)));
_mm_store_ps((float *)(d+0*4*sizeof(float)), r0);
_mm_store_ps((float *)(d+1*4*sizeof(float)), r1);
_mm_store_ps((float *)(d+2*4*sizeof(float)), r2);
_mm_store_ps((float *)(d+3*4*sizeof(float)), r3);
s += 4*4*sizeof(float);
d += 4*4*sizeof(float);
n -= 4*4*sizeof(float);
}
while (n >= 4*sizeof(float)) {
r0 = _mm_load_ps((float *)(s));
_mm_store_ps((float *)(d), r0);
s += 4*sizeof(float);
d += 4*sizeof(float);
n -= 4*sizeof(float);
}
}
if (n)
memcpy(d, s, n);
// fencing because of NT stores
// potential optimization: issue only when NT stores are actually emitted
_mm_sfence();
#else
#error "this file should be compiled with -msse"
#endif
return ret;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

141
gdrcopy/src/memcpy_sse41.c Normal file
View File

@ -0,0 +1,141 @@
/*
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef min
#define min(A,B) ((A)<(B)?(A):(B))
#endif
// implementation of copy from BAR using MOVNTDQA
// suggested by Nicholas Wilt <nwilt@amazon.com>
// src is WC MMIO of GPU BAR
// dest is host memory
int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes)
{
int ret = 0;
#ifdef __SSE4_1__
char *d = (char*)dest;
uintptr_t d_int = (uintptr_t)d;
const char *s = (const char *)src;
uintptr_t s_int = (uintptr_t)s;
size_t n = n_bytes;
// align src to 128-bits
if (s_int & 0xf) {
size_t nh = min(0x10 - (s_int & 0x0f), n);
memcpy(d, s, nh);
d += nh; d_int += nh;
s += nh; s_int += nh;
n -= nh;
}
if (d_int & 0xf) { // dest is not aligned to 128-bits
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
// unroll 8
while (n >= 8*sizeof(__m128i)) {
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
_mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
_mm_storeu_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
_mm_storeu_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
_mm_storeu_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
_mm_storeu_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
_mm_storeu_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
_mm_storeu_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
_mm_storeu_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
s += 8*sizeof(__m128i);
d += 8*sizeof(__m128i);
n -= 8*sizeof(__m128i);
}
while (n >= sizeof(__m128i)) {
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
_mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
s += sizeof(__m128i);
d += sizeof(__m128i);
n -= sizeof(__m128i);
}
} else { // or it IS aligned
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
// unroll 8
while (n >= 8*sizeof(__m128i)) {
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
_mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
_mm_stream_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
_mm_stream_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
_mm_stream_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
_mm_stream_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
_mm_stream_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
_mm_stream_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
_mm_stream_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
s += 8*sizeof(__m128i);
d += 8*sizeof(__m128i);
n -= 8*sizeof(__m128i);
}
while (n >= sizeof(__m128i)) {
r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
_mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
s += sizeof(__m128i);
d += sizeof(__m128i);
n -= sizeof(__m128i);
}
}
if (n)
memcpy(d, s, n);
// fencing because of NT stores
// potential optimization: issue only when NT stores are actually emitted
_mm_sfence();
#else
#error "this file should be compiled with -msse4.1"
#endif
return ret;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

69
gdrcopy/tests/Makefile Normal file
View File

@ -0,0 +1,69 @@
DESTBIN ?=
CUDA ?= /usr/local/cuda
NVCC ?= $(CUDA)/bin/nvcc
GDRAPI_INC := ../include
GDRAPI_SRC := ../src
CUDA_LIB := -L $(CUDA)/lib64 -L $(CUDA)/lib -L /usr/lib64/nvidia -L /usr/lib/nvidia -L $(CUDA)/lib64/stubs
CUDA_INC += -I $(CUDA)/include
CPPFLAGS := $(CUDA_INC) -I $(GDRAPI_INC) -I $(GDRAPI_SRC) -I $(CUDA)/include
LDFLAGS := $(CUDA_LIB) -L $(CUDA)/lib64 -L $(GDRAPI_SRC)
COMMONCFLAGS := -O2
CFLAGS += $(COMMONCFLAGS)
CXXFLAGS += $(COMMONCFLAGS)
NVCCFLAGS ?=
LIBS := -lcuda -lpthread -ldl -lgdrapi
CPP_SRCS := copybw.cpp sanity.cpp copylat.cpp apiperf.cpp
CU_SRCS := pplat.cu
EXES := $(patsubst %.cpp,gdrcopy_%,$(CPP_SRCS)) $(patsubst %.cu,gdrcopy_%,$(CU_SRCS))
all: exes
exes: $(EXES)
testsuites/testsuite.o: testsuites/testsuite.cpp testsuites/testsuite.hpp common.hpp
common.o: common.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
copybw.o: copybw.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
sanity.o: sanity.cpp $(GDRAPI_INC)/gdrapi.h $(GDRAPI_SRC)/gdrapi_internal.h common.hpp testsuites/testsuite.hpp
copylat.o: copylat.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
apiperf.o: apiperf.cpp $(GDRAPI_INC)/gdrapi.h common.hpp
gdrcopy_copybw: copybw.o common.o
$(LINK.cc) -o $@ $^ $(LIBS) -lrt
gdrcopy_sanity: sanity.o common.o testsuites/testsuite.o
$(LINK.cc) -o $@ $^ $(LIBS)
gdrcopy_copylat: copylat.o common.o
$(LINK.cc) -o $@ $^ $(LIBS) -lrt
gdrcopy_apiperf: apiperf.o common.o
$(LINK.cc) -o $@ $^ $(LIBS) -lrt
gdrcopy_pplat: pplat.o common.o
$(NVCC) -o $@ $^ $(LDFLAGS) -lgdrapi -lcuda
%.o: %.cu
$(NVCC) -o $@ -c $^ $(LIBS) $(CPPFLAGS) $(NVCCFLAGS)
clean:
rm -f *.o $(EXES) *~ core.* testsuites/*.o
install: exes
@ echo "installing exes in $(DESTBIN)..." && \
mkdir -p $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_copybw -t $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_copylat -t $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_apiperf -t $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_sanity -t $(DESTBIN) && \
install -D -v -m u=rwx,g=rx,o=rx gdrcopy_pplat -t $(DESTBIN)
cd $(DESTBIN) && \
ln -sf gdrcopy_copybw copybw && \
ln -sf gdrcopy_copylat copylat && \
ln -sf gdrcopy_apiperf apiperf && \
ln -sf gdrcopy_sanity sanity
.PHONY: clean all exes install

287
gdrcopy/tests/apiperf.cpp Normal file
View File

@ -0,0 +1,287 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <getopt.h>
#include <memory.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <iomanip>
#include <cuda.h>
using namespace std;
#include "gdrapi.h"
#include "common.hpp"
using namespace gdrcopy::test;
// manually tuned...
int num_iters = 100;
int num_bins = 10;
int num_warmup_iters = 10;
size_t _size = (size_t)1 << 24;
int dev_id = 0;
void print_usage(const char *path)
{
cout << "Usage: " << path << " [-h][-s <max-size>][-d <gpu>][-n <iters>][-w <iters>][-a <fn>]" << endl;
cout << endl;
cout << "Options:" << endl;
cout << " -h Print this help text" << endl;
cout << " -s <max-size> Max buffer size to benchmark (default: " << _size << ")" << endl;
cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
cout << " -n <iters> Number of benchmark iterations (default: " << num_iters << ")" << endl;
cout << " -w <iters> Number of warm-up iterations (default: " << num_warmup_iters << ")" << endl;
cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
}
void run_test(CUdeviceptr d_A, size_t size)
{
// minimum pinning size is a GPU page size
size_t pin_request_size = GPU_PAGE_SIZE;
struct timespec beg, end;
double pin_lat_us;
double map_lat_us;
double unpin_lat_us;
double unmap_lat_us;
double inf_lat_us;
double delta_lat_us;
double *lat_arr;
int *bin_arr;
gdr_t g = gdr_open();
ASSERT_NEQ(g, (void*)0);
gdr_mh_t mh;
BEGIN_CHECK {
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
lat_arr = (double *)malloc(sizeof(double) * num_iters);
bin_arr = (int *)malloc(sizeof(int) * num_bins);
while (pin_request_size <= size) {
int iter = 0;
size_t actual_pin_size;
double min_lat, max_lat;
min_lat = -1;
max_lat = -1;
pin_lat_us = 0;
map_lat_us = 0;
unpin_lat_us = 0;
unmap_lat_us = 0;
inf_lat_us = 0;
actual_pin_size = PAGE_ROUND_UP(pin_request_size, GPU_PAGE_SIZE);
for (iter = 0; iter < num_warmup_iters; ++iter) {
BREAK_IF_NEQ(gdr_pin_buffer(g, d_A, actual_pin_size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
void *map_d_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, actual_pin_size), 0);
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, actual_pin_size), 0);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
}
for (iter = 0; iter < num_iters; ++iter) {
clock_gettime(MYCLOCK, &beg);
BREAK_IF_NEQ(gdr_pin_buffer(g, d_A, actual_pin_size, 0, 0, &mh), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
pin_lat_us += delta_lat_us;
ASSERT_NEQ(mh, null_mh);
lat_arr[iter] = delta_lat_us;
min_lat = (min_lat == -1) ? delta_lat_us : ((delta_lat_us < min_lat) ? delta_lat_us : min_lat);
max_lat = delta_lat_us > max_lat ? delta_lat_us : max_lat;
void *map_d_ptr = NULL;
clock_gettime(MYCLOCK, &beg);
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, actual_pin_size), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
map_lat_us += delta_lat_us;
gdr_info_t info;
clock_gettime(MYCLOCK, &beg);
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
inf_lat_us += delta_lat_us;
clock_gettime(MYCLOCK, &beg);
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, actual_pin_size), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
unmap_lat_us += delta_lat_us;
clock_gettime(MYCLOCK, &beg);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
clock_gettime(MYCLOCK, &end);
delta_lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0);
unpin_lat_us += delta_lat_us;
}
pin_lat_us /= iter;
map_lat_us /= iter;
inf_lat_us /= iter;
unpin_lat_us /= iter;
unmap_lat_us /= iter;
printf("Size(B)\tpin.Time(us)\tmap.Time(us)\tget_info.Time(us)\tunmap.Time(us)\tunpin.Time(us)\n");
printf("%zu\t%f\t%f\t%f\t%f\t%f\n",
actual_pin_size, pin_lat_us, map_lat_us, inf_lat_us, unmap_lat_us, unpin_lat_us);
pin_request_size <<= 1;
printf("Histogram of gdr_pin_buffer latency for %ld bytes\n", actual_pin_size);
print_histogram(lat_arr, num_iters, bin_arr, num_bins, min_lat, max_lat);
printf("\n");
}
free(lat_arr);
free(bin_arr);
} END_CHECK;
cout << "closing gdrdrv" << endl;
ASSERT_EQ(gdr_close(g), 0);
}
int main(int argc, char *argv[])
{
gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
gpu_memfree_fn_t gfree_fn = gpu_mem_free;
while(1) {
int c;
c = getopt(argc, argv, "s:d:n:w:a:h");
if (c == -1)
break;
switch (c) {
case 's':
_size = strtol(optarg, NULL, 0);
break;
case 'd':
dev_id = strtol(optarg, NULL, 0);
break;
case 'n':
num_iters = strtol(optarg, NULL, 0);
break;
case 'w':
num_warmup_iters = strtol(optarg, NULL, 0);
break;
case 'a':
if (strcmp(optarg, "cuMemAlloc") == 0) {
galloc_fn = gpu_mem_alloc;
gfree_fn = gpu_mem_free;
}
else if (strcmp(optarg, "cuMemCreate") == 0) {
galloc_fn = gpu_vmm_alloc;
gfree_fn = gpu_vmm_free;
}
else {
cerr << "Unrecognized fn argument" << endl;
exit(EXIT_FAILURE);
}
break;
case 'h':
print_usage(argv[0]);
exit(EXIT_SUCCESS);
break;
default:
printf("ERROR: invalid option\n");
exit(EXIT_FAILURE);
}
}
size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
ASSERTDRV(cuInit(0));
int n_devices = 0;
ASSERTDRV(cuDeviceGetCount(&n_devices));
CUdevice dev;
for (int n=0; n<n_devices; ++n) {
char dev_name[256];
int dev_pci_domain_id;
int dev_pci_bus_id;
int dev_pci_device_id;
ASSERTDRV(cuDeviceGet(&dev, n));
ASSERTDRV(cuDeviceGetName(dev_name, sizeof(dev_name) / sizeof(dev_name[0]), dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev));
cout << "GPU id:" << n << "; name: " << dev_name
<< "; Bus id: "
<< std::hex
<< std::setfill('0') << std::setw(4) << dev_pci_domain_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_bus_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_device_id
<< std::dec
<< endl;
}
cout << "selecting device " << dev_id << endl;
ASSERTDRV(cuDeviceGet(&dev, dev_id));
CUcontext dev_ctx;
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
ASSERT_EQ(check_gdr_support(dev), true);
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
cout << "device ptr: 0x" << hex << d_A << dec << endl;
cout << "allocated size: " << size << endl;
run_test(d_A, size);
ASSERTDRV(gfree_fn(&mhandle));
ASSERTDRV(cuCtxSetCurrent(NULL));
ASSERTDRV(cuDevicePrimaryCtxRelease(dev));
return 0;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

358
gdrcopy/tests/common.cpp Normal file
View File

@ -0,0 +1,358 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdarg.h>
#include <sys/types.h>
#include <unistd.h>
#include <map>
#include <cuda.h>
#include "common.hpp"
namespace gdrcopy {
namespace test {
bool print_dbg_msg = false;
void print_dbg(const char* fmt, ...)
{
if (print_dbg_msg) {
va_list ap;
va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
va_end(ap);
}
}
CUresult gpu_mem_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
{
CUresult ret = CUDA_SUCCESS;
CUdeviceptr ptr, out_ptr;
size_t allocated_size;
if (aligned_mapping)
allocated_size = size + GPU_PAGE_SIZE - 1;
else
allocated_size = size;
ret = cuMemAlloc(&ptr, allocated_size);
if (ret != CUDA_SUCCESS)
return ret;
if (set_sync_memops) {
unsigned int flag = 1;
ret = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr);
if (ret != CUDA_SUCCESS) {
cuMemFree(ptr);
return ret;
}
}
if (aligned_mapping)
out_ptr = PAGE_ROUND_UP(ptr, GPU_PAGE_SIZE);
else
out_ptr = ptr;
handle->ptr = out_ptr;
handle->unaligned_ptr = ptr;
handle->size = size;
handle->allocated_size = allocated_size;
return CUDA_SUCCESS;
}
CUresult gpu_mem_free(gpu_mem_handle_t *handle)
{
CUresult ret = CUDA_SUCCESS;
ret = cuMemFree(handle->unaligned_ptr);
if (ret == CUDA_SUCCESS)
memset(handle, 0, sizeof(gpu_mem_handle_t));
return ret;
}
#if CUDA_VERSION >= 11000
/**
* Allocate GPU memory using the VMM API.
* The VMM API has been available since CUDA 10.2, but RDMA support was only added in CUDA 11.0.
* Our tests are not useful without RDMA support, so we enable VMM allocation starting from CUDA 11.0.
*/
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
{
CUresult ret = CUDA_SUCCESS;
size_t granularity, gran;
CUmemAllocationProp mprop;
CUdevice gpu_dev;
size_t rounded_size;
CUdeviceptr ptr = 0;
CUmemGenericAllocationHandle mem_handle = 0;
bool is_mapped = false;
int RDMASupported = 0;
int version;
ret = cuDriverGetVersion(&version);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuDriverGetVersion\n");
goto out;
}
if (version < 11000) {
print_dbg("VMM with RDMA is not supported in this CUDA version.\n");
ret = CUDA_ERROR_NOT_SUPPORTED;
goto out;
}
ret = cuCtxGetDevice(&gpu_dev);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuCtxGetDevice\n");
goto out;
}
ret = cuDeviceGetAttribute(&RDMASupported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, gpu_dev);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuDeviceGetAttribute\n");
goto out;
}
if (!RDMASupported) {
print_dbg("GPUDirect RDMA is not supported on this GPU.\n");
ret = CUDA_ERROR_NOT_SUPPORTED;
goto out;
}
memset(&mprop, 0, sizeof(CUmemAllocationProp));
mprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
mprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
mprop.location.id = gpu_dev;
mprop.allocFlags.gpuDirectRDMACapable = 1;
ret = cuMemGetAllocationGranularity(&gran, &mprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemGetAllocationGranularity\n");
goto out;
}
// In case gran is smaller than GPU_PAGE_SIZE
granularity = PAGE_ROUND_UP(gran, GPU_PAGE_SIZE);
rounded_size = PAGE_ROUND_UP(size, granularity);
ret = cuMemAddressReserve(&ptr, rounded_size, granularity, 0, 0);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemAddressReserve\n");
goto out;
}
ret = cuMemCreate(&mem_handle, rounded_size, &mprop, 0);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemCreate\n");
goto out;
}
ret = cuMemMap(ptr, rounded_size, 0, mem_handle, 0);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemMap\n");
goto out;
}
is_mapped = true;
CUmemAccessDesc access;
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
access.location.id = gpu_dev;
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
ret = cuMemSetAccess(ptr, rounded_size, &access, 1);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemSetAccess\n");
goto out;
}
// cuMemAddressReserve always returns aligned ptr
handle->ptr = ptr;
handle->handle = mem_handle;
handle->size = size;
handle->allocated_size = rounded_size;
out:
if (ret != CUDA_SUCCESS) {
if (is_mapped)
cuMemUnmap(ptr, rounded_size);
if (mem_handle)
cuMemRelease(mem_handle);
if (ptr)
cuMemAddressFree(ptr, rounded_size);
}
return ret;
}
CUresult gpu_vmm_free(gpu_mem_handle_t *handle)
{
CUresult ret;
if (!handle || !handle->ptr)
return CUDA_ERROR_INVALID_VALUE;
ret = cuMemUnmap(handle->ptr, handle->allocated_size);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemUnmap\n");
return ret;
}
ret = cuMemRelease(handle->handle);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemRelease\n");
return ret;
}
ret = cuMemAddressFree(handle->ptr, handle->allocated_size);
if (ret != CUDA_SUCCESS) {
print_dbg("error in cuMemAddressFree\n");
return ret;
}
memset(handle, 0, sizeof(gpu_mem_handle_t));
return CUDA_SUCCESS;
}
#else
/* VMM with RDMA is not available before CUDA 11.0 */
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
{
return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult gpu_vmm_free(gpu_mem_handle_t *handle)
{
return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
int compare_buf(uint32_t *ref_buf, uint32_t *buf, size_t size)
{
int diff = 0;
if (size % 4 != 0U) {
print_dbg("warning: buffer size %zu is not dword aligned, ignoring trailing bytes\n", size);
size -= (size % 4);
}
unsigned ndwords = size/sizeof(uint32_t);
for(unsigned w = 0; w < ndwords; ++w) {
if (ref_buf[w] != buf[w]) {
if (!diff) {
printf("%10.10s %8.8s %8.8s\n", "word", "content", "expected");
}
if (diff < 10) {
printf("%10d %08x %08x\n", w, buf[w], ref_buf[w]);
}
++diff;
}
}
if (diff) {
print_dbg("check error: %d different dwords out of %d\n", diff, ndwords);
}
return diff;
}
void init_hbuf_walking_bit(uint32_t *h_buf, size_t size)
{
uint32_t base_value = 0x3F4C5E6A; // 0xa55ad33d;
unsigned w;
ASSERT_NEQ(h_buf, (void*)0);
ASSERT_EQ(size % 4, 0U);
//OUT << "filling mem with walking bit " << endl;
for(w = 0; w<size/sizeof(uint32_t); ++w)
h_buf[w] = base_value ^ (1<< (w%32));
}
void init_hbuf_linear_ramp(uint32_t *h_buf, size_t size)
{
uint32_t base_value = 0x3F4C5E6A; // 0xa55ad33d;
unsigned w;
ASSERT_NEQ(h_buf, (void*)0);
ASSERT_EQ(size % 4, 0U);
//OUT << "filling mem with linear ramp " << endl;
for(w = 0; w<size/sizeof(uint32_t); ++w)
h_buf[w] = w;
}
bool check_gdr_support(CUdevice dev)
{
#if CUDA_VERSION >= 11030
int drv_version;
ASSERTDRV(cuDriverGetVersion(&drv_version));
// Starting from CUDA 11.3, CUDA provides a way to query GPUDirect RDMA support.
if (drv_version >= 11030) {
int gdr_support = 0;
ASSERTDRV(cuDeviceGetAttribute(&gdr_support, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev));
if (!gdr_support)
print_dbg("This GPU does not support GPUDirect RDMA.\n");
return !!gdr_support;
}
#endif
// For older versions, we fall back to detecting this support with gdr_pin_buffer.
const size_t size = GPU_PAGE_SIZE;
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(gpu_mem_alloc(&mhandle, size, true, true));
d_A = mhandle.ptr;
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
int status = gdr_pin_buffer(g, d_A, size, 0, 0, &mh);
if (status != 0) {
print_dbg("error in gdr_pin_buffer with code=%d\n", status);
print_dbg("Your GPU might not support GPUDirect RDMA\n");
}
else
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
ASSERT_EQ(gdr_close(g), 0);
ASSERTDRV(gpu_mem_free(&mhandle));
return status == 0;
}
void print_histogram(double *lat_arr, int count, int *bin_arr, int num_bins, double min, double max)
{
int den = (max - min) / num_bins;
den = den > 0 ? den : 1;
for (int j = 0; j < num_bins; j++)
bin_arr[j] = 0;
for (int i = 0; i < count; i++) {
int idx = (int) ((lat_arr[i] - min) / den);
// clamp the largest sample(s) into the last bin
if (idx >= num_bins)
idx = num_bins - 1;
bin_arr[idx]++;
}
for (int j = 0; j < num_bins; j++) {
// bin j covers [min + j*den, min + (j+1)*den)
printf("[%lf\t-\t%lf]\t%d\n", min + (double)j * den, min + (double)(j + 1) * den, bin_arr[j]);
}
}
}
}

162
gdrcopy/tests/common.hpp Normal file
View File

@ -0,0 +1,162 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stdarg.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <cuda.h>
#include <cstring>
#include <map>
#include <gdrapi.h>
#include <gdrconfig.h>
#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof((x)) *)&(x))
#endif
#ifndef READ_ONCE
#define READ_ONCE(x) ACCESS_ONCE(x)
#endif
#ifndef WRITE_ONCE
#define WRITE_ONCE(x, v) (ACCESS_ONCE(x) = (v))
#endif
/**
* Memory barrier
*/
#if defined(GDRAPI_X86)
#define MB() asm volatile("mfence":::"memory")
#define SB() asm volatile("sfence":::"memory")
#define LB() asm volatile("lfence":::"memory")
#elif defined(GDRAPI_POWER)
#define MB() asm volatile("sync":::"memory")
#define SB() MB()
#define LB() MB()
#elif defined(GDRAPI_ARM64)
#define MB() asm volatile("dmb sy":::"memory")
#define SB() asm volatile("dmb st":::"memory")
#define LB() MB()
#else
#error "Compiling on an unsupported architecture."
#endif
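/*
 * A minimal usage sketch for the macros above (illustration only; the pointers
 * are assumed to refer to a CPU-visible, e.g. gdr_map()'ed, buffer): publish a
 * payload word before a flag, so that a peer polling the flag never observes
 * it set before the payload is written.
 */
static inline void example_publish_word(volatile unsigned int *payload,
                                        unsigned int value,
                                        volatile unsigned int *flag)
{
    WRITE_ONCE(*payload, value); // write the payload first
    SB();                        // order the payload store before the flag store
    WRITE_ONCE(*flag, 1);        // peer side: while (!READ_ONCE(*flag)) ; LB(); then read the payload
}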
/**
* Clock used for timing
*/
//#define MYCLOCK CLOCK_REALTIME
//#define MYCLOCK CLOCK_RAW_MONOTONIC
#define MYCLOCK CLOCK_MONOTONIC
#define EXIT_WAIVED 2
#define ASSERT(x) \
do \
{ \
if (!(x)) \
{ \
fprintf(stderr, "Assertion \"%s\" failed at %s:%d\n", #x, __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define ASSERTDRV(stmt) \
do \
{ \
CUresult result = (stmt); \
if (result != CUDA_SUCCESS) { \
const char *_err_name; \
cuGetErrorName(result, &_err_name); \
fprintf(stderr, "CUDA error: %s\n", _err_name); \
} \
ASSERT(CUDA_SUCCESS == result); \
} while (0)
#define ASSERT_EQ(P, V) ASSERT((P) == (V))
#define CHECK_EQ(P, V) ASSERT((P) == (V))
#define ASSERT_NEQ(P, V) ASSERT(!((P) == (V)))
#define BREAK_IF_NEQ(P, V) if((P) != (V)) break
#define BEGIN_CHECK do
#define END_CHECK while(0)
#define PAGE_ROUND_UP(x, n) (((x) + ((n) - 1)) & ~((n) - 1))
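// Note: PAGE_ROUND_UP assumes n is a power of two, e.g.
// PAGE_ROUND_UP(1, 65536) == 65536 and PAGE_ROUND_UP(65536, 65536) == 65536.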
namespace gdrcopy {
namespace test {
typedef struct gpuMemHandle
{
CUdeviceptr ptr; // aligned ptr if requested; otherwise, the same as unaligned_ptr.
union {
CUdeviceptr unaligned_ptr; // for tracking original ptr; may be unaligned.
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
CUmemGenericAllocationHandle handle;
#endif
};
size_t size;
size_t allocated_size;
} gpu_mem_handle_t;
typedef CUresult (*gpu_memalloc_fn_t)(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops);
typedef CUresult (*gpu_memfree_fn_t)(gpu_mem_handle_t *handle);
static inline gdr_t gdr_open_safe()
{
gdr_t g = gdr_open();
if (!g) {
fprintf(stderr, "gdr_open error: Is gdrdrv driver installed and loaded?\n");
exit(EXIT_FAILURE);
}
return g;
}
extern bool print_dbg_msg;
extern const char *testname;
void print_dbg(const char* fmt, ...);
CUresult gpu_mem_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops);
CUresult gpu_mem_free(gpu_mem_handle_t *handle);
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops);
CUresult gpu_vmm_free(gpu_mem_handle_t *handle);
static inline bool operator==(const gdr_mh_t &a, const gdr_mh_t &b) {
return a.h == b.h;
}
static const gdr_mh_t null_mh = {0};
int compare_buf(uint32_t *ref_buf, uint32_t *buf, size_t size);
void init_hbuf_walking_bit(uint32_t *h_buf, size_t size);
void init_hbuf_linear_ramp(uint32_t *h_buf, size_t size);
bool check_gdr_support(CUdevice dev);
void print_histogram(double *lat_arr, int count, int *bin_arr, int num_bins, double min, double max);
}
}

282
gdrcopy/tests/copybw.cpp Normal file
View File

@ -0,0 +1,282 @@
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <getopt.h>
#include <memory.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <iomanip>
#include <cuda.h>
using namespace std;
#include "gdrapi.h"
#include "common.hpp"
using namespace gdrcopy::test;
// manually tuned...
int num_write_iters = 10000;
int num_read_iters = 100;
size_t _size = 128*1024;
size_t copy_size = 0;
size_t copy_offset = 0;
int dev_id = 0;
void print_usage(const char *path)
{
cout << "Usage: " << path << " [-h][-s <size>][-c <size>][-o <offset>][-d <gpu>][-w <iters>][-r <iters>][-a <fn>]" << endl;
cout << endl;
cout << "Options:" << endl;
cout << " -h Print this help text" << endl;
cout << " -s <size> Buffer allocation size (default: " << _size << ")" << endl;
cout << " -c <size> Copy size (default: " << copy_size << ")" << endl;
cout << " -o <offset> Copy offset (default: " << copy_offset << ")" << endl;
cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
cout << " -w <iters> Number of write iterations (default: " << num_write_iters << ")" << endl;
cout << " -r <iters> Number of read iterations (default: " << num_read_iters << ")" << endl;
cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
}
void run_test(CUdeviceptr d_A, size_t size)
{
uint32_t *init_buf = NULL;
ASSERTDRV(cuMemAllocHost((void **)&init_buf, size));
ASSERT_NEQ(init_buf, (void*)0);
init_hbuf_walking_bit(init_buf, size);
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
BEGIN_CHECK {
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
BREAK_IF_NEQ(gdr_pin_buffer(g, d_A, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
void *map_d_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, size), 0);
cout << "map_d_ptr: " << map_d_ptr << endl;
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
cout << "info.va: " << hex << info.va << dec << endl;
cout << "info.mapped_size: " << info.mapped_size << endl;
cout << "info.page_size: " << info.page_size << endl;
cout << "info.mapped: " << info.mapped << endl;
cout << "info.wc_mapping: " << info.wc_mapping << endl;
// remember that mappings start on a 64KB boundary, so let's
// calculate the offset from the head of the mapping to the
// beginning of the buffer
int off = info.va - d_A;
cout << "page offset: " << off << endl;
uint32_t *buf_ptr = (uint32_t *)((char *)map_d_ptr + off);
cout << "user-space pointer:" << buf_ptr << endl;
// copy to GPU benchmark
cout << "writing test, size=" << copy_size << " offset=" << copy_offset << " num_iters=" << num_write_iters << endl;
struct timespec beg, end;
clock_gettime(MYCLOCK, &beg);
for (int iter=0; iter<num_write_iters; ++iter)
gdr_copy_to_mapping(mh, buf_ptr + copy_offset/4, init_buf, copy_size);
clock_gettime(MYCLOCK, &end);
double woMBps;
{
double byte_count = (double) copy_size * num_write_iters;
double dt_ms = (end.tv_nsec-beg.tv_nsec)/1000000.0 + (end.tv_sec-beg.tv_sec)*1000.0;
double Bps = byte_count / dt_ms * 1e3;
woMBps = Bps / 1024.0 / 1024.0;
cout << "write BW: " << woMBps << "MB/s" << endl;
}
compare_buf(init_buf, buf_ptr + copy_offset/4, copy_size);
// copy from GPU benchmark
cout << "reading test, size=" << copy_size << " offset=" << copy_offset << " num_iters=" << num_read_iters << endl;
clock_gettime(MYCLOCK, &beg);
for (int iter=0; iter<num_read_iters; ++iter)
gdr_copy_from_mapping(mh, init_buf, buf_ptr + copy_offset/4, copy_size);
clock_gettime(MYCLOCK, &end);
double roMBps;
{
double byte_count = (double) copy_size * num_read_iters;
double dt_ms = (end.tv_nsec-beg.tv_nsec)/1000000.0 + (end.tv_sec-beg.tv_sec)*1000.0;
double Bps = byte_count / dt_ms * 1e3;
roMBps = Bps / 1024.0 / 1024.0;
cout << "read BW: " << roMBps << "MB/s" << endl;
}
cout << "unmapping buffer" << endl;
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, size), 0);
cout << "unpinning buffer" << endl;
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
} END_CHECK;
cout << "closing gdrdrv" << endl;
ASSERT_EQ(gdr_close(g), 0);
}
int main(int argc, char *argv[])
{
gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
gpu_memfree_fn_t gfree_fn = gpu_mem_free;
while(1) {
int c;
c = getopt(argc, argv, "s:d:o:c:w:r:a:h");
if (c == -1)
break;
switch (c) {
case 's':
_size = strtol(optarg, NULL, 0);
break;
case 'c':
copy_size = strtol(optarg, NULL, 0);
break;
case 'o':
copy_offset = strtol(optarg, NULL, 0);
break;
case 'd':
dev_id = strtol(optarg, NULL, 0);
break;
case 'w':
num_write_iters = strtol(optarg, NULL, 0);
break;
case 'r':
num_read_iters = strtol(optarg, NULL, 0);
break;
case 'a':
if (strcmp(optarg, "cuMemAlloc") == 0) {
galloc_fn = gpu_mem_alloc;
gfree_fn = gpu_mem_free;
}
else if (strcmp(optarg, "cuMemCreate") == 0) {
galloc_fn = gpu_vmm_alloc;
gfree_fn = gpu_vmm_free;
}
else {
cerr << "Unrecognized fn argument" << endl;
exit(EXIT_FAILURE);
}
break;
case 'h':
print_usage(argv[0]);
exit(EXIT_SUCCESS);
default:
fprintf(stderr, "ERROR: invalid option\n");
exit(EXIT_FAILURE);
}
}
if (!copy_size)
copy_size = _size;
if (copy_offset % sizeof(uint32_t) != 0) {
fprintf(stderr, "ERROR: offset must be multiple of 4 bytes\n");
exit(EXIT_FAILURE);
}
if (copy_offset + copy_size > _size) {
fprintf(stderr, "ERROR: offset + copy size run past the end of the buffer\n");
exit(EXIT_FAILURE);
}
size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
ASSERTDRV(cuInit(0));
int n_devices = 0;
ASSERTDRV(cuDeviceGetCount(&n_devices));
CUdevice dev;
for (int n=0; n<n_devices; ++n) {
char dev_name[256];
int dev_pci_domain_id;
int dev_pci_bus_id;
int dev_pci_device_id;
ASSERTDRV(cuDeviceGet(&dev, n));
ASSERTDRV(cuDeviceGetName(dev_name, sizeof(dev_name) / sizeof(dev_name[0]), dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev));
cout << "GPU id:" << n << "; name: " << dev_name
<< "; Bus id: "
<< std::hex
<< std::setfill('0') << std::setw(4) << dev_pci_domain_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_bus_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_device_id
<< std::dec
<< endl;
}
cout << "selecting device " << dev_id << endl;
ASSERTDRV(cuDeviceGet(&dev, dev_id));
CUcontext dev_ctx;
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
cout << "testing size: " << _size << endl;
cout << "rounded size: " << size << endl;
ASSERT_EQ(check_gdr_support(dev), true);
if (galloc_fn == gpu_mem_alloc)
cout << "gpu alloc fn: cuMemAlloc" << endl;
else
cout << "gpu alloc fn: cuMemCreate" << endl;
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
cout << "device ptr: " << hex << d_A << dec << endl;
run_test(d_A, size);
ASSERTDRV(gfree_fn(&mhandle));
ASSERTDRV(cuDevicePrimaryCtxRelease(dev));
return 0;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

307
gdrcopy/tests/copylat.cpp Normal file
View File

@ -0,0 +1,307 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <getopt.h>
#include <memory.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <iomanip>
#include <cuda.h>
using namespace std;
#include "gdrapi.h"
#include "common.hpp"
using namespace gdrcopy::test;
// manually tuned...
int num_write_iters = 10000;
int num_read_iters = 100;
int dev_id = 0;
bool do_cumemcpy = false;
size_t _size = (size_t)1 << 24;
void print_usage(const char *path)
{
cout << "Usage: " << path << " [-h][-c][-s <size>][-d <gpu>][-w <iters>][-r <iters>][-a <fn>]" << endl;
cout << endl;
cout << "Options:" << endl;
cout << " -h Print this help text" << endl;
cout << " -c Also run cuMemcpy (default: no)" << endl;
cout << " -s <size> Buffer allocation size (default: " << _size << ")" << endl;
cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
cout << " -w <iters> Number of write iterations (default: " << num_write_iters << ")" << endl;
cout << " -r <iters> Number of read iterations (default: " << num_read_iters << ")" << endl;
cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
}
int main(int argc, char *argv[])
{
size_t copy_size = 1;
struct timespec beg, end;
double lat_us;
gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
gpu_memfree_fn_t gfree_fn = gpu_mem_free;
while(1) {
int c;
c = getopt(argc, argv, "s:d:w:r:a:hc");
if (c == -1)
break;
switch (c) {
case 's':
_size = strtol(optarg, NULL, 0);
break;
case 'd':
dev_id = strtol(optarg, NULL, 0);
break;
case 'w':
num_write_iters = strtol(optarg, NULL, 0);
break;
case 'r':
num_read_iters = strtol(optarg, NULL, 0);
break;
case 'a':
if (strcmp(optarg, "cuMemAlloc") == 0) {
galloc_fn = gpu_mem_alloc;
gfree_fn = gpu_mem_free;
}
else if (strcmp(optarg, "cuMemCreate") == 0) {
galloc_fn = gpu_vmm_alloc;
gfree_fn = gpu_vmm_free;
}
else {
cerr << "Unrecognized fn argument" << endl;
exit(EXIT_FAILURE);
}
break;
case 'c':
do_cumemcpy = true;
break;
case 'h':
print_usage(argv[0]);
exit(EXIT_SUCCESS);
default:
printf("ERROR: invalid option\n");
exit(EXIT_FAILURE);
}
}
size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
ASSERTDRV(cuInit(0));
int n_devices = 0;
ASSERTDRV(cuDeviceGetCount(&n_devices));
CUdevice dev;
for (int n=0; n<n_devices; ++n) {
char dev_name[256];
int dev_pci_domain_id;
int dev_pci_bus_id;
int dev_pci_device_id;
ASSERTDRV(cuDeviceGet(&dev, n));
ASSERTDRV(cuDeviceGetName(dev_name, sizeof(dev_name) / sizeof(dev_name[0]), dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev));
ASSERTDRV(cuDeviceGetAttribute(&dev_pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev));
cout << "GPU id:" << n << "; name: " << dev_name
<< "; Bus id: "
<< std::hex
<< std::setfill('0') << std::setw(4) << dev_pci_domain_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_bus_id
<< ":" << std::setfill('0') << std::setw(2) << dev_pci_device_id
<< std::dec
<< endl;
}
cout << "selecting device " << dev_id << endl;
ASSERTDRV(cuDeviceGet(&dev, dev_id));
CUcontext dev_ctx;
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
ASSERT_EQ(check_gdr_support(dev), true);
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
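    // Allocate the device buffer through the selected allocator; the returned
    // handle carries the device pointer plus whatever state gfree_fn needs to
    // release it at the end.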
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
cout << "device ptr: 0x" << hex << d_A << dec << endl;
cout << "allocated size: " << size << endl;
if (galloc_fn == gpu_mem_alloc)
cout << "gpu alloc fn: cuMemAlloc" << endl;
else
cout << "gpu alloc fn: cuMemCreate" << endl;
uint32_t *init_buf = NULL;
uint32_t *h_buf = NULL;
ASSERTDRV(cuMemAllocHost((void **)&init_buf, size));
ASSERT_NEQ(init_buf, (void*)0);
ASSERTDRV(cuMemAllocHost((void **)&h_buf, size));
ASSERT_NEQ(h_buf, (void*)0);
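    // Fill the source host buffer with a walking-bit test pattern (test helper).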
init_hbuf_walking_bit(init_buf, size);
if (do_cumemcpy) {
cout << endl;
cout << "cuMemcpy_H2D num iters for each size: " << num_write_iters << endl;
printf("Test \t\t Size(B) \t Avg.Time(us)\n");
BEGIN_CHECK {
// cuMemcpy H2D benchmark
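            // Methodology (same for every loop below): for each power-of-two
            // transfer size up to the allocated size, issue the copies
            // back-to-back and report elapsed_time / iterations as the average
            // per-call latency in microseconds.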
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_write_iters; ++iter) {
ASSERTDRV(cuMemcpy(d_A, (CUdeviceptr)init_buf, copy_size));
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("cuMemcpy_H2D \t %8zu \t %11.4f\n", copy_size, lat_us);
copy_size <<= 1;
}
} END_CHECK;
cout << endl;
cout << "cuMemcpy_D2H num iters for each size: " << num_read_iters << endl;
printf("Test \t\t Size(B) \t Avg.Time(us)\n");
BEGIN_CHECK {
// cuMemcpy D2H benchmark
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_read_iters; ++iter) {
ASSERTDRV(cuMemcpy((CUdeviceptr)h_buf, d_A, copy_size));
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("cuMemcpy_D2H \t %8zu \t %11.4f\n", copy_size, lat_us);
copy_size <<= 1;
}
} END_CHECK;
cout << endl;
}
cout << endl;
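    // Open the gdrdrv device (gdr_open_safe aborts if it cannot), then pin the
    // GPU buffer and map it into this process's address space for the
    // gdr_copy_* benchmarks below.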
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
BEGIN_CHECK {
// tokens are optional in CUDA 6.0
ASSERT_EQ(gdr_pin_buffer(g, d_A, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
void *map_d_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &map_d_ptr, size), 0);
cout << "map_d_ptr: " << map_d_ptr << endl;
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
cout << "info.va: " << hex << info.va << dec << endl;
cout << "info.mapped_size: " << info.mapped_size << endl;
cout << "info.page_size: " << info.page_size << endl;
cout << "info.mapped: " << info.mapped << endl;
cout << "info.wc_mapping: " << info.wc_mapping << endl;
// remember that mappings start on a 64KB boundary, so let's
// calculate the offset from the head of the mapping to the
// beginning of the buffer
        int off = d_A - info.va;
cout << "page offset: " << off << endl;
uint32_t *buf_ptr = (uint32_t *)((char *)map_d_ptr + off);
cout << "user-space pointer: " << buf_ptr << endl;
// gdr_copy_to_mapping benchmark
cout << endl;
cout << "gdr_copy_to_mapping num iters for each size: " << num_write_iters << endl;
cout << "WARNING: Measuring the API invocation overhead as observed by the CPU. Data might not be ordered all the way to the GPU internal visibility." << endl;
// For more information, see
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#sync-behavior
printf("Test \t\t\t Size(B) \t Avg.Time(us)\n");
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_write_iters; ++iter) {
gdr_copy_to_mapping(mh, buf_ptr, init_buf, copy_size);
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("gdr_copy_to_mapping \t %8zu \t %11.4f\n", copy_size, lat_us);
copy_size <<= 1;
}
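        // Memory barrier: keep the timed stores to the write-combined mapping
        // from being deferred past this point before the read-back benchmark runs.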
MB();
// gdr_copy_from_mapping benchmark
cout << endl;
cout << "gdr_copy_from_mapping num iters for each size: " << num_read_iters << endl;
printf("Test \t\t\t Size(B) \t Avg.Time(us)\n");
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_read_iters; ++iter)
gdr_copy_from_mapping(mh, h_buf, buf_ptr, copy_size);
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("gdr_copy_from_mapping \t %8zu \t %11.4f\n", copy_size, lat_us);
copy_size <<= 1;
}
cout << "unmapping buffer" << endl;
ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, size), 0);
cout << "unpinning buffer" << endl;
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
} END_CHECK;
cout << "closing gdrdrv" << endl;
ASSERT_EQ(gdr_close(g), 0);
ASSERTDRV(gfree_fn(&mhandle));
return 0;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/
