/*
 * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <ctype.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <pthread.h>
#include <time.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cuda.h>
#include <errno.h>
#include <sys/queue.h>

#include <iostream>
#include <string>
#include <vector>

using namespace std;

#include "gdrapi.h"
#include "gdrapi_internal.h"
#include "gdrconfig.h"
#include "common.hpp"
#include "testsuites/testsuite.hpp"

using namespace gdrcopy::test;
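
/*
 * The tests below all revolve around the canonical gdrcopy sequence. A
 * minimal sketch (error handling elided; gdr_open_safe is the test-suite
 * wrapper around gdr_open):
 *
 *   gdr_t g = gdr_open_safe();
 *   gdr_mh_t mh;
 *   gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh);     // pin the GPU pages
 *   void *bar_ptr;
 *   gdr_map(g, mh, &bar_ptr, size);                // map BAR1 into user space
 *   gdr_info_t info;
 *   gdr_get_info(g, mh, &info);
 *   char *buf = (char *)bar_ptr + (d_ptr - info.va);
 *   gdr_copy_to_mapping(mh, buf, src, size);       // CPU-driven copy to GPU
 *   gdr_unmap(g, mh, bar_ptr, size);
 *   gdr_unpin_buffer(g, mh);
 *   gdr_close(g);
 */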

volatile bool expecting_exception_signal = false;
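
// Tests set this flag, bracketed by MB() so the update is not reordered
// around the faulting access, before touching a mapping that is expected
// to fault. The handler below exits with success on an expected signal
// and only logs otherwise.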
void exception_signal_handle(int sig)
{
    if (expecting_exception_signal) {
        print_dbg("Got signal %d as expected\n", sig);
        exit(EXIT_SUCCESS);
    }
    print_dbg("Unexpectedly received exception signal %d\n", sig);
}
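
/*
 * Retain the device's primary context and make it current so that the
 * driver-API calls issued by each test have a valid context.
 */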
void init_cuda(int dev_id)
{
    CUdevice dev;
    CUcontext dev_ctx;
    ASSERTDRV(cuInit(0));
    ASSERTDRV(cuDeviceGet(&dev, dev_id));

    ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
    ASSERTDRV(cuCtxSetCurrent(dev_ctx));

    ASSERT_EQ(check_gdr_support(dev), true);
}

void finalize_cuda(int dev_id)
{
    CUdevice dev;
    ASSERTDRV(cuDeviceGet(&dev, dev_id));
    ASSERTDRV(cuDevicePrimaryCtxRelease(dev));
}

typedef void (*filter_fn_t)();

void null_filter()
{
    // NO-OP.
}

#if CUDA_VERSION >= 11000
/**
 * Waive the test if VMM is not supported.
 * Must be called after init_cuda.
 */
void vmm_filter()
{
    int version;
    ASSERTDRV(cuDriverGetVersion(&version));
    if (version < 11000)
        exit(EXIT_WAIVED);
}
#else
void vmm_filter()
{
    exit(EXIT_WAIVED);
}
#endif

/**
 * Sends the given file descriptor over the given socket.
 *
 * @param socket Unix-domain socket used to send the fd
 * @param fd the file descriptor to be sent
 * @return the sendmsg result
 *
 * @note socket should be (PF_UNIX, SOCK_DGRAM)
 */
int sendfd(int socket, int fd)
{
    char dummy = '$';
    struct msghdr msg;
    struct iovec iov;

    char cmsgbuf[CMSG_SPACE(sizeof(int))];

    iov.iov_base = &dummy;
    iov.iov_len = sizeof(dummy);

    msg.msg_name = NULL;
    msg.msg_namelen = 0;
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_flags = 0;
    msg.msg_control = cmsgbuf;
    msg.msg_controllen = CMSG_LEN(sizeof(int));

    struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(int));

    *(int*) CMSG_DATA(cmsg) = fd;

    int ret = sendmsg(socket, &msg, 0);

    if (ret == -1) {
        print_dbg("sendmsg failed with %s\n", strerror(errno));
    }

    return ret;
}

/**
 * Receives a file descriptor using the given socket.
 *
 * @param socket Unix-domain socket used for fd reception
 * @return the received file descriptor; -1 if failed
 *
 * @note socket should be (PF_UNIX, SOCK_DGRAM)
 */
int recvfd(int socket)
{
    int len;
    int fd;
    char buf[1];
    struct iovec iov;
    struct msghdr msg;
    struct cmsghdr *cmsg;
    char cms[CMSG_SPACE(sizeof(int))];

    iov.iov_base = buf;
    iov.iov_len = sizeof(buf);

    msg.msg_name = 0;
    msg.msg_namelen = 0;
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_flags = 0;
    msg.msg_control = (caddr_t) cms;
    msg.msg_controllen = sizeof cms;

    len = recvmsg(socket, &msg, 0);

    if (len < 0) {
        print_dbg("recvmsg failed with %s\n", strerror(errno));
        return -1;
    }

    if (len == 0) {
        print_dbg("recvmsg failed with no data\n");
        return -1;
    }

    cmsg = CMSG_FIRSTHDR(&msg);
    memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
    return fd;
}
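
/*
 * Usage sketch (illustrative only): the fd-passing tests below pair
 * sendfd/recvfd across a fork. The socketpair must be created before the
 * fork so that each process holds one end; the kernel duplicates the sent
 * descriptor into the receiving process.
 *
 *   int pair[2];
 *   socketpair(PF_UNIX, SOCK_DGRAM, 0, pair);
 *   if (fork() == 0) {
 *       close(pair[1]);
 *       int fd = recvfd(pair[0]);   // blocks until the parent sends
 *   } else {
 *       close(pair[0]);
 *       sendfd(pair[1], some_fd);   // some_fd is a placeholder
 *   }
 */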
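
/*
 * Minimal pin/unpin round trip: allocate GPU memory with the given
 * allocator, pin it with gdr_pin_buffer, then immediately unpin and tear
 * everything down. No BAR mapping is created here; mapping and data
 * movement are exercised by the data_validation tests below.
 */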
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void basic()
{
    expecting_exception_signal = false;
    MB();

    init_cuda(0);
    filter_fn();

    const size_t _size = 256*1024+16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);

    print_dbg("buffer size: %zu\n", size);
    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh = null_mh;
    CUdeviceptr d_ptr = d_A;

    // tokens are optional in CUDA 6.0
    // waive the test if GPUDirectRDMA is not enabled
    ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
    ASSERT_NEQ(mh, null_mh);
    ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
    ASSERT_EQ(gdr_close(g), 0);

    ASSERTDRV(gfree_fn(&mhandle));

    finalize_cuda(0);
}

GDRCOPY_TEST(basic_cumemalloc)
{
    basic<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(basic_vmmalloc)
{
    basic<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

GDRCOPY_TEST(basic_with_tokens)
{
    expecting_exception_signal = false;
    MB();

    init_cuda(0);

    const size_t _size = 256*1024+16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);

    print_dbg("buffer size: %zu\n", size);

    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    CUDA_POINTER_ATTRIBUTE_P2P_TOKENS tokens = {0,0};

    // Tokens do not work with cuMemCreate
    ASSERTDRV(gpu_mem_alloc(&mhandle, size, true, true));
    d_A = mhandle.ptr;
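
    // Query this allocation's P2P tokens (a deprecated but still supported
    // mechanism; see the token test near the end of this file). They are
    // passed to gdr_pin_buffer below in place of the usual {0, 0}.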
    ASSERTDRV(cuPointerGetAttribute(&tokens, CU_POINTER_ATTRIBUTE_P2P_TOKENS, d_A));

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh = null_mh;
    CUdeviceptr d_ptr = d_A;

    ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, tokens.p2pToken, tokens.vaSpaceToken, &mh), 0);
    ASSERT_NEQ(mh, null_mh);
    ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
    ASSERT_EQ(gdr_close(g), 0);

    ASSERTDRV(gpu_mem_free(&mhandle));

    finalize_cuda(0);
}

/**
 * This unit test ensures that gdrcopy returns an error when trying to map
 * unaligned addresses. In addition, it tests that mapping an address the
 * user has aligned by hand is successful.
 *
 * cuMemCreate + cuMemMap always return an aligned address. So, this test is
 * for cuMemAlloc only.
 */
GDRCOPY_TEST(basic_unaligned_mapping)
{
    expecting_exception_signal = false;
    MB();

    init_cuda(0);

    // Allocate a few bytes so that cuMemAlloc returns an unaligned address
    // in the next allocation. This behavior is observed in GPU Driver 410 and
    // above.
    const size_t fa_size = 4;
    CUdeviceptr d_fa;
    gpu_mem_handle_t fa_mhandle;
    ASSERTDRV(gpu_mem_alloc(&fa_mhandle, fa_size, true, true));
    d_fa = fa_mhandle.ptr;
    print_dbg("First allocation: d_fa=0x%llx, size=%zu\n", d_fa, fa_size);

    const size_t A_size = GPU_PAGE_SIZE + sizeof(int);

    const int retry = 10;
    int cnt = 0;

    CUdeviceptr d_A, d_A_boundary;
    gpu_mem_handle_t A_mhandle[retry];

    // Try until we get an unaligned address. Give up after retry times.
    for (cnt = 0; cnt < retry; ++cnt) {
        ASSERTDRV(gpu_mem_alloc(&A_mhandle[cnt], A_size, false, true));
        d_A = A_mhandle[cnt].ptr;
        d_A_boundary = d_A & GPU_PAGE_MASK;
        if (d_A != d_A_boundary) {
            ++cnt;
            break;
        }
    }
    print_dbg("Second allocation: d_A=0x%llx, size=%zu, GPU-page-boundary 0x%llx\n", d_A, A_size, d_A_boundary);
    if (d_A == d_A_boundary) {
        print_dbg("d_A is aligned. Waiving this test.\n");
        for (int i = 0; i < cnt; ++i)
            ASSERTDRV(gpu_mem_free(&A_mhandle[i]));

        exit(EXIT_WAIVED);
    }
    print_dbg("d_A is unaligned\n");

    gdr_t g = gdr_open_safe();

    // Try mapping with the unaligned address. This should fail.
    print_dbg("Try mapping d_A as is.\n");
    gdr_mh_t A_mh = null_mh;

    ASSERT_EQ(gdr_pin_buffer(g, d_A, A_size, 0, 0, &A_mh), 0);
    ASSERT_NEQ(A_mh, null_mh);

    void *A_bar_ptr = NULL;
    // Expect gdr_map to fail with the unaligned address
    ASSERT_NEQ(gdr_map(g, A_mh, &A_bar_ptr, A_size), 0);
    ASSERT_EQ(gdr_unpin_buffer(g, A_mh), 0);
    print_dbg("Mapping d_A failed as expected.\n");

    print_dbg("Align d_A and try mapping it again.\n");
    // In order to align d_A, we move to the next GPU page. The reason is that
    // the first GPU page may belong to another allocation.
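    // Worked example (assuming GPU_PAGE_SIZE == 64 KiB, i.e. 0x10000): if
    // d_A == 0x7f4200010, then d_aligned_A == 0x7f4210000, the offset is
    // 0xfff0 bytes, and the usable size shrinks by that amount.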
    CUdeviceptr d_aligned_A = PAGE_ROUND_UP(d_A, GPU_PAGE_SIZE);
    off_t aligned_A_offset = d_aligned_A - d_A;
    size_t aligned_A_size = A_size - aligned_A_offset;

    print_dbg("Pin and map aligned address: d_aligned_A=0x%llx, offset=%lld, size=%zu\n", d_aligned_A, (long long)aligned_A_offset, aligned_A_size);

    gdr_mh_t aligned_A_mh = null_mh;
    void *aligned_A_bar_ptr = NULL;
    ASSERT_EQ(gdr_pin_buffer(g, d_aligned_A, aligned_A_size, 0, 0, &aligned_A_mh), 0);
    ASSERT_NEQ(aligned_A_mh, null_mh);
    // Expect gdr_map to succeed
    ASSERT_EQ(gdr_map(g, aligned_A_mh, &aligned_A_bar_ptr, aligned_A_size), 0);

    // Test accessing the mapping
    int *aligned_A_map_ptr = (int *)aligned_A_bar_ptr;
    aligned_A_map_ptr[0] = 7;

    // The first allocation and d_A should share a GPU page. We should make
    // sure that freeing the first allocation does not accidentally unmap
    // d_aligned_A, as the d_aligned_A mapping starts from the next GPU page.
    gdr_mh_t fa_mh = null_mh;
    ASSERT_EQ(gdr_pin_buffer(g, d_fa, fa_size, 0, 0, &fa_mh), 0);
    ASSERT_NEQ(fa_mh, null_mh);

    void *fa_bar_ptr = NULL;
    ASSERT_EQ(gdr_map(g, fa_mh, &fa_bar_ptr, fa_size), 0);

    ASSERTDRV(gpu_mem_free(&fa_mhandle));

    // Test accessing aligned_A_map_ptr again. This should not cause a segmentation fault.
    aligned_A_map_ptr[0] = 9;

    ASSERT_EQ(gdr_unpin_buffer(g, aligned_A_mh), 0);
    ASSERT_EQ(gdr_close(g), 0);

    for (int i = 0; i < cnt; ++i)
        ASSERTDRV(gpu_mem_free(&A_mhandle[i]));

    finalize_cuda(0);
}

/**
 * This unit test catches issue-244
 * (https://github.com/NVIDIA/gdrcopy/issues/244). The bug occurs when the
 * first buffer is smaller than the GPU page size and the second buffer is
 * within the same page. We expect to be able to map the first buffer. The
 * second buffer cannot be mapped because it is not aligned.
 *
 * cuMemCreate + cuMemMap always return an aligned address. So, this test is
 * for cuMemAlloc only.
 */
GDRCOPY_TEST(basic_small_buffers_mapping)
{
    expecting_exception_signal = false;
    MB();

    init_cuda(0);

    const size_t fa_size = GPU_PAGE_SIZE;
    CUdeviceptr d_fa;
    gpu_mem_handle_t fa_mhandle;
    ASSERTDRV(gpu_mem_alloc(&fa_mhandle, fa_size, true, true));
    d_fa = fa_mhandle.ptr;
    print_dbg("Allocated d_fa=%#llx, size=%zu\n", d_fa, fa_size);

    const size_t buffer_size = sizeof(uint64_t);
    CUdeviceptr d_A[2];
    d_A[0] = d_fa;
    d_A[1] = d_fa + buffer_size;

    gdr_t g = gdr_open_safe();

    // Pin both buffers.
    print_dbg("Try pinning d_A[0] and d_A[1].\n");
    gdr_mh_t A_mh[2];
    A_mh[0] = null_mh;
    A_mh[1] = null_mh;

    ASSERT_EQ(gdr_pin_buffer(g, d_A[0], buffer_size, 0, 0, &A_mh[0]), 0);
    ASSERT_EQ(gdr_pin_buffer(g, d_A[1], buffer_size, 0, 0, &A_mh[1]), 0);
    ASSERT_NEQ(A_mh[0], null_mh);
    ASSERT_NEQ(A_mh[1], null_mh);

    void *A_bar_ptr[2];
    A_bar_ptr[0] = NULL;
    A_bar_ptr[1] = NULL;

    // Expect gdr_map to pass
    ASSERT_EQ(gdr_map(g, A_mh[0], &A_bar_ptr[0], buffer_size), 0);
    print_dbg("Mapping d_A[0] passed as expected.\n");

    // Expect gdr_map to fail due to the unaligned mapping
    ASSERT_NEQ(gdr_map(g, A_mh[1], &A_bar_ptr[1], buffer_size), 0);
    print_dbg("Mapping d_A[1] failed as expected.\n");

    ASSERT_EQ(gdr_unmap(g, A_mh[0], A_bar_ptr[0], buffer_size), 0);

    ASSERT_EQ(gdr_unpin_buffer(g, A_mh[0]), 0);
    ASSERT_EQ(gdr_unpin_buffer(g, A_mh[1]), 0);

    ASSERT_EQ(gdr_close(g), 0);

    ASSERTDRV(gpu_mem_free(&fa_mhandle));

    finalize_cuda(0);
}
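
/*
 * End-to-end data-validation pass: writes a walking-bit pattern through the
 * BAR mapping (both via plain CPU stores and via gdr_copy_to_mapping), reads
 * it back through cuMemcpyDtoH and gdr_copy_from_mapping, and repeats the
 * round trips at various dword and byte offsets to exercise the copy paths'
 * alignment handling.
 */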
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void data_validation()
{
    expecting_exception_signal = false;
    MB();

    init_cuda(0);
    filter_fn();

    const size_t _size = 256*1024+16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);

    print_dbg("buffer size: %zu\n", size);
    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
    ASSERTDRV(cuCtxSynchronize());

    uint32_t *init_buf = new uint32_t[size / sizeof(uint32_t)];
    uint32_t *copy_buf = new uint32_t[size / sizeof(uint32_t)];

    init_hbuf_walking_bit(init_buf, size);
    memset(copy_buf, 0xA5, size);

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh;

    CUdeviceptr d_ptr = d_A;

    // tokens are optional in CUDA 6.0
    // waive the test if GPUDirectRDMA is not enabled
    ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
    ASSERT_NEQ(mh, null_mh);

    gdr_info_t info;
    ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
    ASSERT(!info.mapped);

    void *bar_ptr = NULL;
    ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);

    ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
    ASSERT(info.mapped);
    int off = d_ptr - info.va;
    print_dbg("off: %d\n", off);

    uint32_t *buf_ptr = (uint32_t *)((char *)bar_ptr + off);

    print_dbg("check 1: MMIO CPU initialization + read back via cuMemcpy D->H\n");
    init_hbuf_walking_bit(buf_ptr, size);
    ASSERTDRV(cuMemcpyDtoH(copy_buf, d_ptr, size));
    ASSERT_EQ(compare_buf(init_buf, copy_buf, size), 0);
    memset(copy_buf, 0xA5, size);
    ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
    ASSERTDRV(cuCtxSynchronize());

    print_dbg("check 2: gdr_copy_to_bar() + read back via cuMemcpy D->H\n");
    gdr_copy_to_mapping(mh, buf_ptr, init_buf, size);
    ASSERTDRV(cuMemcpyDtoH(copy_buf, d_ptr, size));
    ASSERT_EQ(compare_buf(init_buf, copy_buf, size), 0);
    memset(copy_buf, 0xA5, size);
    ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
    ASSERTDRV(cuCtxSynchronize());

    print_dbg("check 3: gdr_copy_to_bar() + read back via gdr_copy_from_bar()\n");
    gdr_copy_to_mapping(mh, buf_ptr, init_buf, size);
    gdr_copy_from_mapping(mh, copy_buf, buf_ptr, size);
    ASSERT_EQ(compare_buf(init_buf, copy_buf, size), 0);
    memset(copy_buf, 0xA5, size);
    ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
    ASSERTDRV(cuCtxSynchronize());

    int offset_array[] = { 1, 2, 3, 4, 5, 6, 7, 11, 129, 1023 };
    const int num_offsets = (int)(sizeof(offset_array) / sizeof(offset_array[0]));

    for (int i = 0; i < num_offsets; ++i) {
        int extra_dwords = offset_array[i];
        int extra_off = extra_dwords * (int)sizeof(uint32_t);
        print_dbg("check 4.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d dwords offset on mapping\n", i, extra_dwords);
        gdr_copy_to_mapping(mh, buf_ptr + extra_dwords, init_buf, size - extra_off);
        gdr_copy_from_mapping(mh, copy_buf, buf_ptr + extra_dwords, size - extra_off);
        ASSERT_EQ(compare_buf(init_buf, copy_buf, size - extra_off), 0);
        memset(copy_buf, 0xA5, size);
        ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
        ASSERTDRV(cuCtxSynchronize());

        extra_off = offset_array[i];
        print_dbg("check 5.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d bytes offset on mapping\n", i, extra_off);
        gdr_copy_to_mapping(mh, (char*)buf_ptr + extra_off, init_buf, size - extra_off);
        gdr_copy_from_mapping(mh, copy_buf, (char*)buf_ptr + extra_off, size - extra_off);
        ASSERT_EQ(compare_buf(init_buf, copy_buf, size - extra_off), 0);
        memset(copy_buf, 0xA5, size);
        ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
        ASSERTDRV(cuCtxSynchronize());

        extra_dwords = offset_array[i];
        extra_off = extra_dwords * (int)sizeof(uint32_t);
        print_dbg("check 6.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d dwords offset on host buffer\n", i, extra_dwords);
        gdr_copy_to_mapping(mh, buf_ptr, init_buf + extra_dwords, size - extra_off);
        gdr_copy_from_mapping(mh, copy_buf + extra_dwords, buf_ptr, size - extra_off);
        ASSERT_EQ(compare_buf(init_buf + extra_dwords, copy_buf + extra_dwords, size - extra_off), 0);
        memset(copy_buf, 0xA5, size);
        ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
        ASSERTDRV(cuCtxSynchronize());

        extra_off = offset_array[i];
        print_dbg("check 7.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d bytes offset on host buffer\n", i, extra_off);
        gdr_copy_to_mapping(mh, buf_ptr, (char *)init_buf + extra_off, size - extra_off);
        gdr_copy_from_mapping(mh, (char *)copy_buf + extra_off, buf_ptr, size - extra_off);
        ASSERT_EQ(compare_buf((uint32_t *)((uintptr_t)init_buf + extra_off), (uint32_t *)((uintptr_t)copy_buf + extra_off), size - extra_off), 0);
        memset(copy_buf, 0xA5, size);
        ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
        ASSERTDRV(cuCtxSynchronize());

        extra_dwords = offset_array[i];
        extra_off = extra_dwords * (int)sizeof(uint32_t);
        print_dbg("check 8.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d dwords offset on both mapping and host buffer\n", i, extra_dwords);
        gdr_copy_to_mapping(mh, buf_ptr + extra_dwords, init_buf + extra_dwords, size - extra_off);
        gdr_copy_from_mapping(mh, copy_buf + extra_dwords, buf_ptr + extra_dwords, size - extra_off);
        ASSERT_EQ(compare_buf(init_buf + extra_dwords, copy_buf + extra_dwords, size - extra_off), 0);
        memset(copy_buf, 0xA5, size);
        ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
        ASSERTDRV(cuCtxSynchronize());

        extra_off = offset_array[i];
        print_dbg("check 9.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d bytes offset on both mapping and host buffer\n", i, extra_off);
        gdr_copy_to_mapping(mh, (char *)buf_ptr + extra_off, (char *)init_buf + extra_off, size - extra_off);
        gdr_copy_from_mapping(mh, (char *)copy_buf + extra_off, (char *)buf_ptr + extra_off, size - extra_off);
        ASSERT_EQ(compare_buf((uint32_t *)((uintptr_t)init_buf + extra_off), (uint32_t *)((uintptr_t)copy_buf + extra_off), size - extra_off), 0);
        memset(copy_buf, 0xA5, size);
        ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
        ASSERTDRV(cuCtxSynchronize());
    }

    print_dbg("unmapping\n");
    ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
    print_dbg("unpinning\n");
    ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);

    ASSERT_EQ(gdr_close(g), 0);

    delete[] copy_buf;
    delete[] init_buf;

    ASSERTDRV(gfree_fn(&mhandle));

    finalize_cuda(0);
}

GDRCOPY_TEST(data_validation_cumemalloc)
{
    data_validation<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(data_validation_vmmalloc)
{
    data_validation<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

/**
 * This unit test ensures that accessing a gdr_map'ed region is no longer
 * possible after gdr_close.
 *
 * Steps:
 * 1. Initialize CUDA and gdrcopy
 * 2. Do gdr_map(..., &bar_ptr, ...)
 * 3. Do gdr_close
 * 4. An attempt to access bar_ptr after step 3 should fail
 */
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_access_after_gdr_close()
{
    expecting_exception_signal = false;
    MB();

    struct sigaction act;
    act.sa_handler = exception_signal_handle;
    sigemptyset(&act.sa_mask);
    act.sa_flags = 0;
    sigaction(SIGBUS, &act, 0);

    srand(time(NULL));

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);

    int mydata = (rand() % 1000) + 1;

    init_cuda(0);
    filter_fn();

    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
    ASSERTDRV(cuCtxSynchronize());

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh;
    CUdeviceptr d_ptr = d_A;

    // tokens are optional in CUDA 6.0
    // waive the test if GPUDirectRDMA is not enabled
    ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
    ASSERT_NEQ(mh, null_mh);

    print_dbg("Mapping bar1\n");
    void *bar_ptr = NULL;
    ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);

    gdr_info_t info;
    ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
    int off = d_ptr - info.va;

    volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);

    // Write data
    print_dbg("Writing %d into buf_ptr[0]\n", mydata);
    buf_ptr[0] = mydata;

    print_dbg("Calling gdr_close\n");
    ASSERT_EQ(gdr_close(g), 0);

    print_dbg("Trying to read buf_ptr[0] after gdr_close\n");
    expecting_exception_signal = true;
    MB();
    int data_from_buf_ptr = buf_ptr[0];
    MB();
    expecting_exception_signal = false;
    MB();

    ASSERT_NEQ(data_from_buf_ptr, mydata);

    finalize_cuda(0);
}

GDRCOPY_TEST(invalidation_access_after_gdr_close_cumemalloc)
{
    invalidation_access_after_gdr_close<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_access_after_gdr_close_vmmalloc)
{
    invalidation_access_after_gdr_close<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

/**
 * This unit test ensures that accessing a gdr_map'ed region is no longer
 * possible after gpuMemFree.
 *
 * Steps:
 * 1. Initialize CUDA and gdrcopy
 * 2. Do gdr_map(..., &bar_ptr, ...)
 * 3. Do gpuMemFree
 * 4. An attempt to access bar_ptr after step 3 should fail
 */
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_access_after_free()
{
    // Waive this test until we provide a way to query whether persistent
    // mapping is being used.
    exit(EXIT_WAIVED);

    expecting_exception_signal = false;
    MB();

    struct sigaction act;
    act.sa_handler = exception_signal_handle;
    sigemptyset(&act.sa_mask);
    act.sa_flags = 0;
    sigaction(SIGBUS, &act, 0);

    srand(time(NULL));

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);

    int mydata = (rand() % 1000) + 1;

    init_cuda(0);
    filter_fn();

    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
    ASSERTDRV(cuCtxSynchronize());

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh;
    CUdeviceptr d_ptr = d_A;

    // tokens are optional in CUDA 6.0
    // waive the test if GPUDirectRDMA is not enabled
    ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
    ASSERT_NEQ(mh, null_mh);

    print_dbg("Mapping bar1\n");
    void *bar_ptr = NULL;
    ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);

    gdr_info_t info;
    ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
    int off = d_ptr - info.va;

    volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);

    // Write data
    print_dbg("Writing %d into buf_ptr[0]\n", mydata);
    buf_ptr[0] = mydata;

    print_dbg("Calling gpuMemFree\n");
    ASSERTDRV(gfree_fn(&mhandle));

    print_dbg("Trying to read buf_ptr[0] after gpuMemFree\n");
    expecting_exception_signal = true;
    MB();
    int data_from_buf_ptr = buf_ptr[0];
    MB();
    expecting_exception_signal = false;
    MB();

    ASSERT_NEQ(data_from_buf_ptr, mydata);

    ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
    ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
    ASSERT_EQ(gdr_close(g), 0);

    finalize_cuda(0);
}

GDRCOPY_TEST(invalidation_access_after_free_cumemalloc)
{
    invalidation_access_after_free<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_access_after_free_vmmalloc)
{
    invalidation_access_after_free<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

/**
 * This unit test ensures that gpuMemFree destroys only the mapping it
 * corresponds to.
 *
 * Steps:
 * 1. Initialize CUDA and gdrcopy
 * 2. cuMemAlloc(&d_A, ...); cuMemAlloc(&d_B, ...)
 * 3. Do gdr_map(..., &bar_ptr_A, ...) of d_A
 * 4. Do gdr_map(..., &bar_ptr_B, ...) of d_B
 * 5. Do gpuMemFree(d_A)
 * 6. Verify that bar_ptr_B is still accessible
 */
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_two_mappings()
{
    expecting_exception_signal = false;
    MB();

    srand(time(NULL));

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);

    int mydata = (rand() % 1000) + 1;

    init_cuda(0);
    filter_fn();

    CUdeviceptr d_A[2];
    gpu_mem_handle_t mhandle[2];

    for (int i = 0; i < 2; ++i) {
        ASSERTDRV(galloc_fn(&mhandle[i], size, true, true));
        d_A[i] = mhandle[i].ptr;
        ASSERTDRV(cuMemsetD8(d_A[i], 0x95, size));
    }
    ASSERTDRV(cuCtxSynchronize());

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh[2];

    volatile int *buf_ptr[2];
    void *bar_ptr[2];

    print_dbg("Mapping bar1\n");
    for (int i = 0; i < 2; ++i) {
        CUdeviceptr d_ptr = d_A[i];

        // tokens are optional in CUDA 6.0
        // waive the test if GPUDirectRDMA is not enabled
        ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh[i]), 0);
        ASSERT_NEQ(mh[i], null_mh);

        bar_ptr[i] = NULL;
        ASSERT_EQ(gdr_map(g, mh[i], &bar_ptr[i], size), 0);

        gdr_info_t info;
        ASSERT_EQ(gdr_get_info(g, mh[i], &info), 0);
        int off = d_ptr - info.va;

        buf_ptr[i] = (volatile int *)((char *)bar_ptr[i] + off);
    }

    // Write data
    print_dbg("Writing data to both mappings, %d and %d respectively\n", mydata, mydata + 1);
    buf_ptr[0][0] = mydata;
    buf_ptr[1][0] = mydata + 1;

    print_dbg("Validating that we can read the data back\n");
    ASSERT_EQ(buf_ptr[0][0], mydata);
    ASSERT_EQ(buf_ptr[1][0], mydata + 1);

    print_dbg("Calling gpuMemFree, thus destroying the first mapping\n");
    ASSERTDRV(gfree_fn(&mhandle[0]));

    print_dbg("Trying to read and validate the data from the second mapping after the first mapping has been destroyed\n");
    ASSERT_EQ(buf_ptr[1][0], mydata + 1);

    ASSERTDRV(gfree_fn(&mhandle[1]));

    for (int i = 0; i < 2; ++i) {
        ASSERT_EQ(gdr_unmap(g, mh[i], bar_ptr[i], size), 0);
        ASSERT_EQ(gdr_unpin_buffer(g, mh[i]), 0);
    }

    ASSERT_EQ(gdr_close(g), 0);

    finalize_cuda(0);
}

GDRCOPY_TEST(invalidation_two_mappings_cumemalloc)
{
    invalidation_two_mappings<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_two_mappings_vmmalloc)
{
    invalidation_two_mappings<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
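
/*
 * The fork-based tests below use two pipes as a simple rendezvous:
 * filedes_0 carries child-to-parent messages and filedes_1 carries
 * parent-to-child messages. Each side closes the pipe ends it does not
 * use, then blocks on read() until its peer signals.
 */
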
/**
 * This unit test is intended to check the security hole originating from not
 * doing invalidation correctly. In a nutshell, it ensures that the parent
 * process cannot spy on the child process.
 *
 * Steps:
 * 1. Fork the process
 * 2.C Child: Wait for the parent's signal before continuing
 *
 * 2.P Parent: Initialize CUDA and gdrcopy
 * 3.P Parent: Do gdr_map then gpuMemFree without gdr_unmap
 * 4.P Parent: Signal the child and wait for the child's signal
 *
 * 3.C Child: Initialize CUDA and gdrcopy
 * 4.C Child: Do gdr_map, signal the parent, and wait for the parent's signal
 *
 * 5.P Parent: Check whether it can access its gdr_map'ed data and compare
 *     with the data written by the child. If gdrdrv does not handle
 *     invalidation properly, the child's data will be leaked to the parent.
 */
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_fork_access_after_free()
{
    expecting_exception_signal = false;
    MB();

    int filedes_0[2];
    int filedes_1[2];
    int read_fd;
    int write_fd;
    ASSERT_NEQ(pipe(filedes_0), -1);
    ASSERT_NEQ(pipe(filedes_1), -1);

    srand(time(NULL));

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
    const char *myname;

    fflush(stdout);
    fflush(stderr);

    pid_t pid = fork();
    ASSERT(pid >= 0);

    myname = pid == 0 ? "child" : "parent";

    print_dbg("%s: Start\n", myname);

    if (pid == 0) {
        close(filedes_0[0]);
        close(filedes_1[1]);

        read_fd = filedes_1[0];
        write_fd = filedes_0[1];

        int cont = 0;

        do {
            print_dbg("%s: waiting for cont signal from parent\n", myname);
            ASSERT_EQ(read(read_fd, &cont, sizeof(int)), sizeof(int));
            print_dbg("%s: received cont signal %d from parent\n", myname, cont);
        } while (cont != 1);
    }
    else {
        close(filedes_0[1]);
        close(filedes_1[0]);

        read_fd = filedes_0[0];
        write_fd = filedes_1[1];

        struct sigaction act;
        act.sa_handler = exception_signal_handle;
        sigemptyset(&act.sa_mask);
        act.sa_flags = 0;
        sigaction(SIGBUS, &act, 0);
    }

    int mydata = (rand() % 1000) + 1;

    // Make sure that the parent's and the child's mydata are different.
    // Remember that we call srand before fork.
    if (pid == 0)
        mydata += 10;

    init_cuda(0);
    filter_fn();

    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
    ASSERTDRV(cuCtxSynchronize());

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh;

    CUdeviceptr d_ptr = d_A;

    // tokens are optional in CUDA 6.0
    // waive the test if GPUDirectRDMA is not enabled
    ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
    ASSERT_NEQ(mh, null_mh);

    void *bar_ptr = NULL;
    ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);

    gdr_info_t info;
    ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
    int off = d_ptr - info.va;

    volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);

    print_dbg("%s: writing buf_ptr[0] with %d\n", myname, mydata);
    buf_ptr[0] = mydata;

    if (pid == 0) {
        print_dbg("%s: signal parent that I have written\n", myname);
        ASSERT_EQ(write(write_fd, &mydata, sizeof(int)), sizeof(int));

        int cont = 0;
        print_dbg("%s: waiting for signal from parent before calling gpuMemFree\n", myname);
        do {
            ASSERT_NEQ(read(read_fd, &cont, sizeof(int)), -1);
        } while (cont != 1);
    }

    print_dbg("%s: reading buf_ptr[0] before gpuMemFree gets %d\n", myname, buf_ptr[0]);

    print_dbg("%s: calling gpuMemFree\n", myname);
    ASSERTDRV(gfree_fn(&mhandle));

    if (pid > 0) {
        int msg = 1;
        ASSERT_EQ(write(write_fd, &msg, sizeof(int)), sizeof(int));
        int child_data = 0;
        print_dbg("%s: waiting for child write signal\n", myname);
        do {
            ASSERT_EQ(read(read_fd, &child_data, sizeof(int)), sizeof(int));
        } while (child_data == 0);

        print_dbg("%s: trying to read buf_ptr[0]\n", myname);
        expecting_exception_signal = true;
        MB();
        int data_from_buf_ptr = buf_ptr[0];
        MB();
        expecting_exception_signal = false;
        MB();

        print_dbg("%s: reading buf_ptr[0] after child write gets %d\n", myname, data_from_buf_ptr);
        print_dbg("%s: child data is %d\n", myname, child_data);
        ASSERT_EQ(write(write_fd, &msg, sizeof(int)), sizeof(int));
        ASSERT_NEQ(child_data, data_from_buf_ptr);
    }

    ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
    ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);

    ASSERT_EQ(gdr_close(g), 0);

    finalize_cuda(0);
}

GDRCOPY_TEST(invalidation_fork_access_after_free_cumemalloc)
{
    invalidation_fork_access_after_free<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_fork_access_after_free_vmmalloc)
{
    invalidation_fork_access_after_free<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

/**
 * This unit test makes sure that child processes cannot spy on the parent
 * process if the parent forks without doing gdr_unmap first.
 *
 * Steps:
 * 1. Initialize CUDA and gdrcopy
 * 2. Do gdr_map
 * 3. Fork the process
 *
 * 4.P Parent: Wait for the child to exit
 *
 * 4.C Child: Attempt to access the gdr_map'ed data and compare with what the
 *     parent writes into that region. If gdrdrv does not invalidate the
 *     mapping correctly, the child can spy on the parent.
 */
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_fork_after_gdr_map()
{
    expecting_exception_signal = false;
    MB();

    int filedes_0[2];
    int filedes_1[2];
    int read_fd;
    int write_fd;
    ASSERT_NEQ(pipe(filedes_0), -1);
    ASSERT_NEQ(pipe(filedes_1), -1);

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
    const char *myname;

    init_cuda(0);
    filter_fn();

    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
    ASSERTDRV(cuCtxSynchronize());

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh;

    CUdeviceptr d_ptr = d_A;

    // tokens are optional in CUDA 6.0
    // waive the test if GPUDirectRDMA is not enabled
    ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
    ASSERT_NEQ(mh, null_mh);

    void *bar_ptr = NULL;
    ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);

    gdr_info_t info;
    ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
    int off = d_ptr - info.va;

    volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);

    fflush(stdout);
    fflush(stderr);

    pid_t pid = fork();
    ASSERT(pid >= 0);

    myname = pid == 0 ? "child" : "parent";

    print_dbg("%s: Start\n", myname);

    srand(time(NULL));

    int mynumber = rand() % 1000 + 1;

    if (pid == 0) {
        close(filedes_0[0]);
        close(filedes_1[1]);

        read_fd = filedes_1[0];
        write_fd = filedes_0[1];

        srand(rand());
        int cont = 0;

        do {
            print_dbg("%s: waiting for cont signal from parent\n", myname);
            ASSERT_EQ(read(read_fd, &cont, sizeof(int)), sizeof(int));
            print_dbg("%s: received cont signal %d from parent\n", myname, cont);
        } while (cont != 1);
    }
    else {
        close(filedes_0[1]);
        close(filedes_1[0]);

        read_fd = filedes_0[0];
        write_fd = filedes_1[1];
    }

    if (pid > 0) {
        print_dbg("%s: writing buf_ptr[0] with %d\n", myname, mynumber);
        buf_ptr[0] = mynumber;
    }

    if (pid == 0) {
        struct sigaction act;
        act.sa_handler = exception_signal_handle;
        sigemptyset(&act.sa_mask);
        act.sa_flags = 0;
        sigaction(SIGBUS, &act, 0);
        sigaction(SIGSEGV, &act, 0);

        expecting_exception_signal = true;
        MB();
    }
    print_dbg("%s: trying to read buf_ptr[0]\n", myname);
    int data_from_buf_ptr = buf_ptr[0];
    print_dbg("%s: reading buf_ptr[0] gets %d\n", myname, data_from_buf_ptr);
    if (pid == 0) {
        MB();
        expecting_exception_signal = false;
        MB();
        print_dbg("%s: should not be able to read buf_ptr[0] anymore!! aborting!!\n", myname);
        exit(EXIT_FAILURE);
    }

    if (pid > 0) {
        print_dbg("%s: signaling child\n", myname);
        int msg = 1;
        ASSERT_EQ(write(write_fd, &msg, sizeof(int)), sizeof(int));
        print_dbg("%s: waiting for child to exit\n", myname);
        // The child should exit because of SIGBUS
        int child_exit_status = -EINVAL;
        ASSERT(wait(&child_exit_status) == pid);
        ASSERT_EQ(child_exit_status, EXIT_SUCCESS);
        print_dbg("%s: trying to read buf_ptr[0] after child exits\n", myname);
        data_from_buf_ptr = buf_ptr[0];
        print_dbg("%s: reading buf_ptr[0] after child exits gets %d\n", myname, data_from_buf_ptr);
        ASSERT_EQ(data_from_buf_ptr, mynumber);
        ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
        ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
        ASSERTDRV(gfree_fn(&mhandle));
        ASSERT_EQ(gdr_close(g), 0);
    }

    finalize_cuda(0);
}

GDRCOPY_TEST(invalidation_fork_after_gdr_map_cumemalloc)
{
    invalidation_fork_after_gdr_map<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_fork_after_gdr_map_vmmalloc)
{
    invalidation_fork_after_gdr_map<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

/**
 * This unit test ensures that the child cannot do gdr_map on what the parent
 * has prepared with gdr_pin_buffer. This situation emulates the parent
 * forgetting that it has called gdr_pin_buffer without gdr_map before
 * forking.
 *
 * Steps:
 * 1. Initialize CUDA and gdrcopy
 * 2. Do gdr_pin_buffer
 * 3. Fork the process
 *
 * 4.P Parent: Wait for the child to exit
 *
 * 4.C Child: Attempt to do gdr_map on the parent's pinned buffer. gdrdrv is
 *     expected to prevent this case so that the child process cannot spy on
 *     the parent's GPU data.
 */
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_fork_child_gdr_map_parent()
{
    expecting_exception_signal = false;
    MB();

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
    const char *myname;

    init_cuda(0);
    filter_fn();

    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
    ASSERTDRV(cuCtxSynchronize());

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh;

    CUdeviceptr d_ptr = d_A;

    // tokens are optional in CUDA 6.0
    // waive the test if GPUDirectRDMA is not enabled
    ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
    ASSERT_NEQ(mh, null_mh);

    fflush(stdout);
    fflush(stderr);

    pid_t pid = fork();
    ASSERT(pid >= 0);

    myname = pid == 0 ? "child" : "parent";

    print_dbg("%s: Start\n", myname);

    if (pid == 0) {
        void *bar_ptr = NULL;
        print_dbg("%s: attempting to gdr_map parent's pinned GPU memory\n", myname);
        ASSERT_NEQ(gdr_map(g, mh, &bar_ptr, size), 0);
        print_dbg("%s: cannot do gdr_map as expected\n", myname);
    }
    else {
        int child_exit_status = -EINVAL;
        ASSERT(wait(&child_exit_status) == pid);
        ASSERT_EQ(child_exit_status, EXIT_SUCCESS);

        ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
        ASSERTDRV(gfree_fn(&mhandle));
        ASSERT_EQ(gdr_close(g), 0);

        finalize_cuda(0);
    }
}

GDRCOPY_TEST(invalidation_fork_child_gdr_map_parent_cumemalloc)
{
    invalidation_fork_child_gdr_map_parent<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_fork_child_gdr_map_parent_vmmalloc)
{
    invalidation_fork_child_gdr_map_parent<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

/**
 * This unit test verifies that gpuMemFree in one process does not
 * unintentionally invalidate mappings in other processes.
 *
 * Steps:
 * 1. Fork
 *
 * 2.P Parent: Init CUDA and gdrcopy, and do gdr_map.
 * 3.P Parent: Wait for the child's signal.
 *
 * 2.C Child: Init CUDA and gdrcopy, and do gdr_map.
 * 3.C Child: Do gpuMemFree. This should unmap the gdr_map'ed region.
 * 4.C Child: Signal the parent.
 *
 * 4.P Parent: Verify that it can still access its gdr_map'ed region. If
 *     gdrdrv is not implemented correctly, it might invalidate the parent's
 *     mapping as well.
 */
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_fork_map_and_free()
{
    expecting_exception_signal = false;
    MB();

    int filedes_0[2];
    int filedes_1[2];
    int read_fd;
    int write_fd;
    ASSERT_NEQ(pipe(filedes_0), -1);
    ASSERT_NEQ(pipe(filedes_1), -1);

    srand(time(NULL));

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
    const char *myname;

    fflush(stdout);
    fflush(stderr);

    pid_t pid = fork();
    ASSERT(pid >= 0);

    myname = pid == 0 ? "child" : "parent";

    print_dbg("%s: Start\n", myname);

    if (pid == 0) {
        close(filedes_0[0]);
        close(filedes_1[1]);

        read_fd = filedes_1[0];
        write_fd = filedes_0[1];

        srand(rand());
    }
    else {
        close(filedes_0[1]);
        close(filedes_1[0]);

        read_fd = filedes_0[0];
        write_fd = filedes_1[1];
    }

    int mydata = (rand() % 1000) + 1;

    init_cuda(0);
    filter_fn();

    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
    ASSERTDRV(cuCtxSynchronize());

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh;

    CUdeviceptr d_ptr = d_A;

    // tokens are optional in CUDA 6.0
    // waive the test if GPUDirectRDMA is not enabled
    ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
    ASSERT_NEQ(mh, null_mh);

    void *bar_ptr = NULL;
    ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);

    gdr_info_t info;
    ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
    int off = d_ptr - info.va;

    volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);

    print_dbg("%s: writing buf_ptr[0] with %d\n", myname, mydata);
    buf_ptr[0] = mydata;

    if (pid == 0) {
        print_dbg("%s: calling gpuMemFree\n", myname);
        ASSERTDRV(gfree_fn(&mhandle));

        print_dbg("%s: signal parent that I have called gpuMemFree\n", myname);
        int msg = 1;
        ASSERT_EQ(write(write_fd, &msg, sizeof(int)), sizeof(int));
    }
    else {
        int cont = 0;
        do {
            print_dbg("%s: waiting for signal from child\n", myname);
            ASSERT_EQ(read(read_fd, &cont, sizeof(int)), sizeof(int));
            print_dbg("%s: received cont signal %d from child\n", myname, cont);
        } while (cont == 0);

        print_dbg("%s: trying to read buf_ptr[0]\n", myname);
        int data_from_buf_ptr = buf_ptr[0];
        print_dbg("%s: reading buf_ptr[0] gets %d\n", myname, data_from_buf_ptr);
        ASSERT_EQ(data_from_buf_ptr, mydata);
    }

    ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
    ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);

    if (pid > 0)
        ASSERTDRV(gfree_fn(&mhandle));

    ASSERT_EQ(gdr_close(g), 0);

    finalize_cuda(0);
}

GDRCOPY_TEST(invalidation_fork_map_and_free_cumemalloc)
{
    invalidation_fork_map_and_free<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_fork_map_and_free_vmmalloc)
{
    invalidation_fork_map_and_free<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

/**
 * Process A can intentionally share an fd with process B through a Unix
 * socket. This method may lead to sharing gdrcopy mappings. Since CUDA
 * contexts are not sharable between processes, gdrcopy is also expected to
 * be unsharable. This unit test verifies that a gdr_open fd shared from
 * another process is not usable.
 *
 * Steps:
 * 1. Fork
 *
 * 2.P Parent: Init CUDA and gdrcopy.
 * 3.P Parent: Share the gdr_open fd with the child through a Unix socket.
 *
 * 2.C Child: Init CUDA.
 * 3.C Child: Receive the fd from the parent.
 * 4.C Child: Attempt to do gdr_pin_buffer using this fd. gdrdrv should not
 *     allow it.
 */
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_unix_sock_shared_fd_gdr_pin_buffer()
{
    expecting_exception_signal = false;
    MB();

    pid_t pid;
    int pair[2];
    int fd = -1;

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);

    ASSERT_EQ(socketpair(PF_UNIX, SOCK_DGRAM, 0, pair), 0);

    fflush(stdout);
    fflush(stderr);

    pid = fork();
    ASSERT(pid >= 0);
    const char *myname = pid == 0 ? "child" : "parent";

    print_dbg("%s: Start\n", myname);

    init_cuda(0);
    filter_fn();

    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
    ASSERTDRV(cuCtxSynchronize());

    CUdeviceptr d_ptr = d_A;

    if (pid == 0) {
        close(pair[1]);

        print_dbg("%s: Receiving fd from parent via unix socket\n", myname);
        fd = recvfd(pair[0]);
        ASSERT(fd >= 0);

        print_dbg("%s: Got fd %d\n", myname, fd);

        print_dbg("%s: Converting fd to gdr_t\n", myname);
        struct gdr _g;
        _g.fd = fd;
        gdr_t g = &_g;

        print_dbg("%s: Trying to do gdr_pin_buffer with the received fd\n", myname);
        gdr_mh_t mh;
        ASSERT_NEQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
        print_dbg("%s: Cannot do gdr_pin_buffer with the received fd, as expected\n", myname);
    }
    else {
        close(pair[0]);

        print_dbg("%s: Calling gdr_open\n", myname);
        gdr_t g = gdr_open_safe();

        fd = g->fd;
        print_dbg("%s: Extracted fd %d from gdr_t\n", myname, fd);

        print_dbg("%s: Sending fd to child via unix socket\n", myname);
        ASSERT(sendfd(pair[1], fd) >= 0);

        print_dbg("%s: Waiting for child to finish\n", myname);
        int child_exit_status = -EINVAL;
        ASSERT(wait(&child_exit_status) == pid);
        ASSERT_EQ(child_exit_status, EXIT_SUCCESS);
    }

    finalize_cuda(0);
}

GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_pin_buffer_cumemalloc)
{
    invalidation_unix_sock_shared_fd_gdr_pin_buffer<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_pin_buffer_vmmalloc)
{
    invalidation_unix_sock_shared_fd_gdr_pin_buffer<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

/**
 * Process A can intentionally share an fd with process B through a Unix
 * socket. This method may lead to sharing gdrcopy mappings. Since CUDA
 * contexts are not sharable between processes, gdrcopy is also expected to
 * be unsharable. This unit test verifies that a gdr_open fd shared from
 * another process is not usable.
 *
 * Steps:
 * 1. Fork
 *
 * 2.P Parent: Init CUDA and gdrcopy, and do gdr_pin_buffer
 * 3.P Parent: Share the gdr_open fd with the child through a Unix socket.
 * 4.P Parent: Also share the handle returned from gdr_pin_buffer with the
 *     child.
 *
 * 2.C Child: Init CUDA.
 * 3.C Child: Receive the fd and handle from the parent.
 * 4.C Child: Attempt to do gdr_map using this fd and handle. gdrdrv should
 *     not allow it.
 */
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_unix_sock_shared_fd_gdr_map()
{
    expecting_exception_signal = false;
    MB();

    int filedes_0[2];
    int filedes_1[2];
    int read_fd;
    int write_fd;
    ASSERT_NEQ(pipe(filedes_0), -1);
    ASSERT_NEQ(pipe(filedes_1), -1);

    pid_t pid;
    int pair[2];
    int fd = -1;

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);

    ASSERT_EQ(socketpair(PF_UNIX, SOCK_DGRAM, 0, pair), 0);

    fflush(stdout);
    fflush(stderr);

    pid = fork();
    ASSERT(pid >= 0);
    const char *myname = pid == 0 ? "child" : "parent";

    print_dbg("%s: Start\n", myname);
    if (pid == 0) {
        close(filedes_0[0]);
        close(filedes_1[1]);

        read_fd = filedes_1[0];
        write_fd = filedes_0[1];

        srand(rand());
    }
    else {
        close(filedes_0[1]);
        close(filedes_1[0]);

        read_fd = filedes_0[0];
        write_fd = filedes_1[1];
    }

    init_cuda(0);
    filter_fn();

    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;
    ASSERTDRV(galloc_fn(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
    ASSERTDRV(cuCtxSynchronize());

    CUdeviceptr d_ptr = d_A;

    if (pid == 0) {
        close(pair[1]);

        print_dbg("%s: Receiving fd from parent via unix socket\n", myname);
        fd = recvfd(pair[0]);
        ASSERT(fd >= 0);

        print_dbg("%s: Got fd %d\n", myname, fd);

        print_dbg("%s: Converting fd to gdr_t\n", myname);
        struct gdr _g;
        _g.fd = fd;
        gdr_t g = &_g;

        print_dbg("%s: Receiving gdr_memh_t from parent\n", myname);
        gdr_memh_t memh;
        ASSERT_EQ(read(read_fd, &memh, sizeof(gdr_memh_t)), sizeof(gdr_memh_t));
        print_dbg("%s: Got handle 0x%lx\n", myname, memh.handle);

        print_dbg("%s: Converting gdr_memh_t to gdr_mh_t\n", myname);
        gdr_mh_t mh;
        mh.h = (unsigned long)(&memh);

        print_dbg("%s: Attempting gdr_map\n", myname);
        void *bar_ptr = NULL;
        ASSERT_NEQ(gdr_map(g, mh, &bar_ptr, size), 0);
        print_dbg("%s: Cannot do gdr_map, as expected\n", myname);
    }
    else {
        close(pair[0]);

        print_dbg("%s: Calling gdr_open\n", myname);
        gdr_t g = gdr_open_safe();

        print_dbg("%s: Calling gdr_pin_buffer\n", myname);
        gdr_mh_t mh;
        ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
        ASSERT_NEQ(mh, null_mh);

        fd = g->fd;
        print_dbg("%s: Extracted fd %d from gdr_t\n", myname, fd);

        print_dbg("%s: Sending fd to child via unix socket\n", myname);
        ASSERT(sendfd(pair[1], fd) >= 0);

        gdr_memh_t *memh = (gdr_memh_t *)mh.h;
        print_dbg("%s: Extracted gdr_memh_t with handle 0x%lx from gdr_mh_t\n", myname, memh->handle);

        print_dbg("%s: Sending gdr_memh_t to child\n", myname);
        ASSERT_EQ(write(write_fd, memh, sizeof(gdr_memh_t)), sizeof(gdr_memh_t));

        print_dbg("%s: Waiting for child to finish\n", myname);
        int child_exit_status = -EINVAL;
        ASSERT(wait(&child_exit_status) == pid);
        ASSERT_EQ(child_exit_status, EXIT_SUCCESS);
    }

    finalize_cuda(0);
}

GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_map_cumemalloc)
{
    invalidation_unix_sock_shared_fd_gdr_map<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_map_vmmalloc)
{
    invalidation_unix_sock_shared_fd_gdr_map<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

/**
 * Although the use of P2P tokens has been marked as deprecated, CUDA still
 * supports it. This unit test ensures that process A cannot access GPU
 * memory of process B by using tokens, which can be generated by brute
 * force.
 *
 * Steps:
 * 1. Fork the process
 *
 * 2.P Parent: Allocate GPU memory and get tokens.
 * 3.P Parent: Send the cuMemAlloc'd ptr and the tokens to the child.
 * 4.P Parent: Wait for the child to exit.
 *
 * 2.C Child: Wait for the ptr and tokens from the parent.
 * 3.C Child: Attempt gdr_pin_buffer with the ptr and tokens. We expect
 *     gdr_pin_buffer to fail.
 */
GDRCOPY_TEST(invalidation_fork_child_gdr_pin_parent_with_tokens)
{
    expecting_exception_signal = false;
    MB();

    int filedes_0[2];
    int filedes_1[2];
    int read_fd;
    int write_fd;
    ASSERT_NEQ(pipe(filedes_0), -1);
    ASSERT_NEQ(pipe(filedes_1), -1);

    const size_t _size = sizeof(int) * 16;
    const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
    const char *myname;

    fflush(stdout);
    fflush(stderr);

    CUdeviceptr d_A;
    CUDA_POINTER_ATTRIBUTE_P2P_TOKENS tokens = {0,0};

    pid_t pid = fork();
    ASSERT(pid >= 0);

    myname = pid == 0 ? "child" : "parent";

    print_dbg("%s: Start\n", myname);

    if (pid == 0) {
        close(filedes_0[0]);
        close(filedes_1[1]);

        read_fd = filedes_1[0];
        write_fd = filedes_0[1];

        gdr_t g = gdr_open_safe();

        ASSERT_EQ(read(read_fd, &d_A, sizeof(CUdeviceptr)), sizeof(CUdeviceptr));
        ASSERT_EQ(read(read_fd, &tokens, sizeof(CUDA_POINTER_ATTRIBUTE_P2P_TOKENS)), sizeof(CUDA_POINTER_ATTRIBUTE_P2P_TOKENS));

        print_dbg("%s: Received from parent tokens.p2pToken %llu, tokens.vaSpaceToken %u\n", myname, tokens.p2pToken, tokens.vaSpaceToken);

        gdr_mh_t mh;

        CUdeviceptr d_ptr = d_A;

        ASSERT_NEQ(gdr_pin_buffer(g, d_ptr, size, tokens.p2pToken, tokens.vaSpaceToken, &mh), 0);
    }
    else {
        close(filedes_0[1]);
        close(filedes_1[0]);

        read_fd = filedes_0[0];
        write_fd = filedes_1[1];

        init_cuda(0);

        gpu_mem_handle_t mhandle;
        ASSERTDRV(gpu_mem_alloc(&mhandle, size, true, true));
        d_A = mhandle.ptr;

        ASSERTDRV(cuPointerGetAttribute(&tokens, CU_POINTER_ATTRIBUTE_P2P_TOKENS, d_A));

        print_dbg("%s: CUDA generated tokens.p2pToken %llu, tokens.vaSpaceToken %u\n", myname, tokens.p2pToken, tokens.vaSpaceToken);

        ASSERT_EQ(write(write_fd, &d_A, sizeof(CUdeviceptr)), sizeof(CUdeviceptr));
        ASSERT_EQ(write(write_fd, &tokens, sizeof(CUDA_POINTER_ATTRIBUTE_P2P_TOKENS)), sizeof(CUDA_POINTER_ATTRIBUTE_P2P_TOKENS));

        int child_exit_status = -EINVAL;
        ASSERT(wait(&child_exit_status) == pid);
        ASSERT_EQ(child_exit_status, EXIT_SUCCESS);

        ASSERTDRV(gpu_mem_free(&mhandle));

        finalize_cuda(0);
    }
}
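
/*
 * Shared state for the multi-threaded tests below. thr_fun_setup pins and
 * maps the buffer, thr_fun_teardown unmaps and unpins it; when the two run
 * in separate threads, the barrier orders teardown after setup. The
 * concurrent variant copies this struct so that each thread works on its
 * own handle and mapping while sharing the same gdr_t and device buffer.
 */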
struct mt_test_info {
    gpu_mem_handle_t mhandle;
    CUdeviceptr d_buf;
    void *mapped_d_buf;
    size_t size;
    gdr_t g;
    gdr_mh_t mh;
    bool use_barrier;
    pthread_barrier_t barrier;
    gpu_memfree_fn_t gfree_fn;
};

void *thr_fun_setup(void *data)
{
    mt_test_info *pt = static_cast<mt_test_info*>(data);
    ASSERT(pt);
    print_dbg("pinning\n");
    ASSERT_EQ(gdr_pin_buffer(pt->g, pt->d_buf, pt->size, 0, 0, &pt->mh), 0);
    ASSERT_NEQ(pt->mh, null_mh);
    print_dbg("mapping\n");
    ASSERT_EQ(gdr_map(pt->g, pt->mh, &pt->mapped_d_buf, pt->size), 0);
    if (pt->use_barrier)
        pthread_barrier_wait(&pt->barrier);
    return NULL;
}

void *thr_fun_teardown(void *data)
{
    mt_test_info *pt = static_cast<mt_test_info*>(data);
    ASSERT(pt);
    if (pt->use_barrier)
        pthread_barrier_wait(&pt->barrier);
    print_dbg("unmapping\n");
    ASSERT_EQ(gdr_unmap(pt->g, pt->mh, pt->mapped_d_buf, pt->size), 0);
    pt->mapped_d_buf = 0;
    print_dbg("unpinning\n");
    ASSERT_EQ(gdr_unpin_buffer(pt->g, pt->mh), 0);
    pt->mh = null_mh;
    return NULL;
}

void *thr_fun_combined(void *data)
{
    mt_test_info *pt = static_cast<mt_test_info*>(data);
    ASSERT(pt);
    ASSERT(!pt->use_barrier);
    thr_fun_setup(data);
    thr_fun_teardown(data);
    return NULL;
}

void *thr_fun_cleanup(void *data)
{
    mt_test_info *pt = static_cast<mt_test_info*>(data);
    ASSERT(pt);
    ASSERT_EQ(gdr_close(pt->g), 0);
    pt->g = 0;
    ASSERTDRV(pt->gfree_fn(&pt->mhandle));
    pt->d_buf = 0;
    return NULL;
}

template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void basic_child_thread_pins_buffer()
{
    const size_t _size = GPU_PAGE_SIZE * 16;
    mt_test_info t;
    memset(&t, 0, sizeof(mt_test_info));
    t.size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);

    init_cuda(0);
    filter_fn();

    t.gfree_fn = gfree_fn;

    ASSERTDRV(galloc_fn(&t.mhandle, t.size, true, true));
    t.d_buf = t.mhandle.ptr;

    ASSERTDRV(cuMemsetD8(t.d_buf, 0xA5, t.size));
    ASSERTDRV(cuCtxSynchronize());

    t.g = gdr_open_safe();
    {
        pthread_t tid;
        t.use_barrier = false;
        print_dbg("spawning single child thread\n");
        ASSERT_EQ(pthread_create(&tid, NULL, thr_fun_combined, &t), 0);
        ASSERT_EQ(pthread_join(tid, NULL), 0);
    }
    {
        pthread_t tid[2];
        ASSERT_EQ(pthread_barrier_init(&t.barrier, NULL, 2), 0);
        t.use_barrier = true;
        print_dbg("spawning two child threads, splitting setup and teardown\n");
        ASSERT_EQ(pthread_create(&tid[0], NULL, thr_fun_setup, &t), 0);
        ASSERT_EQ(pthread_create(&tid[1], NULL, thr_fun_teardown, &t), 0);
        ASSERT_EQ(pthread_join(tid[0], NULL), 0);
        ASSERT_EQ(pthread_join(tid[1], NULL), 0);
    }
    {
        pthread_t tid[2];
        t.use_barrier = false;
        mt_test_info t2 = t;
        print_dbg("spawning two child threads, concurrently pinning and mapping the same buffer\n");
        ASSERT_EQ(pthread_create(&tid[0], NULL, thr_fun_combined, &t), 0);
        ASSERT_EQ(pthread_create(&tid[1], NULL, thr_fun_combined, &t2), 0);
        ASSERT_EQ(pthread_join(tid[0], NULL), 0);
        ASSERT_EQ(pthread_join(tid[1], NULL), 0);
    }
    {
        pthread_t tid;
        print_dbg("spawning cleanup child thread\n");
        ASSERT_EQ(pthread_create(&tid, NULL, thr_fun_cleanup, &t), 0);
        ASSERT_EQ(pthread_join(tid, NULL), 0);
    }
    finalize_cuda(0);
}

GDRCOPY_TEST(basic_child_thread_pins_buffer_cumemalloc)
{
    basic_child_thread_pins_buffer<gpu_mem_alloc, gpu_mem_free, null_filter>();
}

#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(basic_child_thread_pins_buffer_vmmalloc)
{
    basic_child_thread_pins_buffer<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif

void print_usage(const char *path)
{
    cout << "Usage: " << path << " [-h][-v][-s][-l][-t <test>]" << endl;
    cout << endl;
    cout << "Options:" << endl;
    cout << "  -h         Print this help text." << endl;
    cout << "  -v         Increase report verbosity." << endl;
    cout << "  -s         DON'T print summary report." << endl;
    cout << "  -l         List all available tests." << endl;
    cout << "  -t <test>  Run the specified test only." << endl;
}

void print_all_tests()
{
    vector<string> tests;
    gdrcopy::testsuite::get_all_test_names(tests);
    cout << "List of all available tests:" << endl;
    for (vector<string>::iterator it = tests.begin(); it != tests.end(); ++it)
        cout << "  " << *it << endl;
}

int main(int argc, char *argv[])
{
    int c;
    bool print_summary = true;
    int status;
    vector<string> tests;

    while ((c = getopt(argc, argv, "hvslt:")) != -1) {
        switch (c) {
        case 'h':
            print_usage(argv[0]);
            return EXIT_SUCCESS;
        case 'v':
            gdrcopy::test::print_dbg_msg = true;
            break;
        case 's':
            print_summary = false;
            break;
        case 'l':
            print_all_tests();
            return EXIT_SUCCESS;
        case 't':
            tests.emplace_back(optarg);
            break;
        default:
            cerr << "Invalid option" << endl;
            return EXIT_FAILURE;
        }
    }

    if (tests.size() > 0)
        status = gdrcopy::testsuite::run_tests(print_summary, tests);
    else
        status = gdrcopy::testsuite::run_all_tests(print_summary);
    if (status) {
        cerr << "Error: Encountered an error or a test failure with status=" << status << endl;
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

/*
 * Local variables:
 * c-indent-level: 4
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */