sglang_v0.5.2/gdrcopy/tests/sanity.cpp

/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <ctype.h>
#include <signal.h>
#include <stdlib.h>
#include <memory.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cuda.h>
#include <errno.h>
#include <sys/queue.h>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
#include "gdrapi.h"
#include "gdrapi_internal.h"
#include "gdrconfig.h"
#include "common.hpp"
#include "testsuites/testsuite.hpp"
using namespace gdrcopy::test;
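// Some tests deliberately trigger an exception signal (e.g., by touching a
// BAR1 mapping after it has been revoked). The handler below treats the signal
// as success only while this flag is set. MB(), a memory-barrier macro from
// the test helpers, is used around flag updates so they are not reordered
// across the faulting access.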
volatile bool expecting_exception_signal = false;
void exception_signal_handle(int sig)
{
if (expecting_exception_signal) {
print_dbg("Get signal %d as expected\n", sig);
exit(EXIT_SUCCESS);
}
print_dbg("Unexpectedly get exception signal");
}
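// Bring up CUDA on the requested device: retain the primary context, make it
// current, and assert that the device reports GPUDirect RDMA support.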
void init_cuda(int dev_id)
{
CUdevice dev;
CUcontext dev_ctx;
ASSERTDRV(cuInit(0));
ASSERTDRV(cuDeviceGet(&dev, dev_id));
ASSERTDRV(cuDevicePrimaryCtxRetain(&dev_ctx, dev));
ASSERTDRV(cuCtxSetCurrent(dev_ctx));
ASSERT_EQ(check_gdr_support(dev), true);
}
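// Release the primary context retained by init_cuda.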
void finalize_cuda(int dev_id)
{
CUdevice dev;
ASSERTDRV(cuDeviceGet(&dev, dev_id));
ASSERTDRV(cuDevicePrimaryCtxRelease(dev));
}
typedef void (*filter_fn_t)();
void null_filter()
{
// NO-OP.
}
#if CUDA_VERSION >= 11000
/**
* Waive the test if VMM is not supported.
* Must be called after init_cuda.
*/
void vmm_filter()
{
int version;
ASSERTDRV(cuDriverGetVersion(&version));
if (version < 11000)
exit(EXIT_WAIVED);
}
#else
void vmm_filter()
{
exit(EXIT_WAIVED);
}
#endif
/**
* Sends the given file descriptor via the given socket
*
* @param socket to be used for fd sending
* @param fd to be sent
* @return sendmsg result
*
* @note socket should be (PF_UNIX, SOCK_DGRAM)
*/
int sendfd(int socket, int fd)
{
char dummy = '$';
struct msghdr msg;
struct iovec iov;
char cmsgbuf[CMSG_SPACE(sizeof(int))];
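// The fd travels as SCM_RIGHTS ancillary data; the one-byte dummy payload is
// there only because sendmsg() requires at least one iovec entry.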
iov.iov_base = &dummy;
iov.iov_len = sizeof(dummy);
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
msg.msg_control = cmsgbuf;
msg.msg_controllen = CMSG_LEN(sizeof(int));
struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*(int*) CMSG_DATA(cmsg) = fd;
int ret = sendmsg(socket, &msg, 0);
if (ret == -1) {
print_dbg("sendmsg failed with %s", strerror(errno));
}
return ret;
}
/**
* Receives file descriptor using given socket
*
* @param socket to be used for fd reception
* @return received file descriptor; -1 if failed
*
* @note socket should be (PF_UNIX, SOCK_DGRAM)
*/
int recvfd(int socket)
{
int len;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = sizeof(buf);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
msg.msg_control = (caddr_t) cms;
msg.msg_controllen = sizeof cms;
len = recvmsg(socket, &msg, 0);
if (len < 0) {
print_dbg("recvmsg failed with %s", strerror(errno));
return -1;
}
if (len == 0) {
print_dbg("recvmsg failed no data");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
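/**
* Minimal pin/unpin round trip: allocate GPU memory with galloc_fn, pin it
* through gdrdrv, then unpin, close the handle, and free the memory.
*/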
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void basic()
{
expecting_exception_signal = false;
MB();
init_cuda(0);
filter_fn();
const size_t _size = 256*1024+16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
print_dbg("buffer size: %zu\n", size);
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
gdr_t g = gdr_open_safe();
gdr_mh_t mh = null_mh;
CUdeviceptr d_ptr = d_A;
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
ASSERT_EQ(gdr_close(g), 0);
ASSERTDRV(gfree_fn(&mhandle));
finalize_cuda(0);
}
GDRCOPY_TEST(basic_cumemalloc)
{
basic<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(basic_vmmalloc)
{
basic<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
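/**
* Same flow as basic(), but pins the buffer with the legacy P2P tokens
* obtained from cuPointerGetAttribute instead of passing zero tokens.
*/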
GDRCOPY_TEST(basic_with_tokens)
{
expecting_exception_signal = false;
MB();
init_cuda(0);
const size_t _size = 256*1024+16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
print_dbg("buffer size: %zu\n", size);
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
CUDA_POINTER_ATTRIBUTE_P2P_TOKENS tokens = {0,0};
// P2P tokens do not work with cuMemCreate, so use the cuMemAlloc path here
ASSERTDRV(gpu_mem_alloc(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuPointerGetAttribute(&tokens, CU_POINTER_ATTRIBUTE_P2P_TOKENS, d_A));
gdr_t g = gdr_open_safe();
gdr_mh_t mh = null_mh;
CUdeviceptr d_ptr = d_A;
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, tokens.p2pToken, tokens.vaSpaceToken, &mh), 0);
ASSERT_NEQ(mh, null_mh);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
ASSERT_EQ(gdr_close(g), 0);
ASSERTDRV(gpu_mem_free(&mhandle));
finalize_cuda(0);
}
/**
* This unit test ensures that gdrcopy returns an error when trying to map
* unaligned addresses. In addition, it tests that mapping addresses the user
* has aligned by hand is successful.
*
* cuMemCreate + cuMemMap always return an aligned address. So, this test is
* for cuMemAlloc only.
*
*/
GDRCOPY_TEST(basic_unaligned_mapping)
{
expecting_exception_signal = false;
MB();
init_cuda(0);
// Allocate a few bytes so that cuMemAlloc returns an unaligned address
// in the next allocation. This behavior is observed in GPU Driver 410 and
// above.
const size_t fa_size = 4;
CUdeviceptr d_fa;
gpu_mem_handle_t fa_mhandle;
ASSERTDRV(gpu_mem_alloc(&fa_mhandle, fa_size, true, true));
d_fa = fa_mhandle.ptr;
print_dbg("First allocation: d_fa=0x%llx, size=%zu\n", d_fa, fa_size);
const size_t A_size = GPU_PAGE_SIZE + sizeof(int);
const int retry = 10;
int cnt = 0;
CUdeviceptr d_A, d_A_boundary;
gpu_mem_handle_t A_mhandle[retry];
// Try until we get an unaligned address. Give up after retry attempts.
for (cnt = 0; cnt < retry; ++cnt) {
ASSERTDRV(gpu_mem_alloc(&A_mhandle[cnt], A_size, false, true));
d_A = A_mhandle[cnt].ptr;
d_A_boundary = d_A & GPU_PAGE_MASK;
if (d_A != d_A_boundary) {
++cnt;
break;
}
}
print_dbg("Second allocation: d_A=0x%llx, size=%zu, GPU-page-boundary 0x%llx\n", d_A, A_size, d_A_boundary);
if (d_A == d_A_boundary) {
print_dbg("d_A is aligned. Waiving this test.\n");
for (int i = 0; i < cnt; ++i)
ASSERTDRV(gpu_mem_free(&A_mhandle[i]));
exit(EXIT_WAIVED);
}
print_dbg("d_A is unaligned\n");
gdr_t g = gdr_open_safe();
// Try mapping with unaligned address. This should fail.
print_dbg("Try mapping d_A as is.\n");
gdr_mh_t A_mh = null_mh;
ASSERT_EQ(gdr_pin_buffer(g, d_A, A_size, 0, 0, &A_mh), 0);
ASSERT_NEQ(A_mh, null_mh);
void *A_bar_ptr = NULL;
// Expect gdr_map to fail with unaligned address
ASSERT_NEQ(gdr_map(g, A_mh, &A_bar_ptr, A_size), 0);
ASSERT_EQ(gdr_unpin_buffer(g, A_mh), 0);
print_dbg("Mapping d_A failed as expected.\n");
print_dbg("Align d_A and try mapping it again.\n");
// In order to align d_A, we move to the next GPU page. The reason is that
// the first GPU page may belong to another allocation.
CUdeviceptr d_aligned_A = PAGE_ROUND_UP(d_A, GPU_PAGE_SIZE);
off_t aligned_A_offset = d_aligned_A - d_A;
size_t aligned_A_size = A_size - aligned_A_offset;
print_dbg("Pin and map aligned address: d_aligned_A=0x%llx, offset=%lld, size=%zu\n", d_aligned_A, aligned_A_offset, aligned_A_size);
gdr_mh_t aligned_A_mh = null_mh;
void *aligned_A_bar_ptr = NULL;
ASSERT_EQ(gdr_pin_buffer(g, d_aligned_A, aligned_A_size, 0, 0, &aligned_A_mh), 0);
ASSERT_NEQ(aligned_A_mh, null_mh);
// Expect gdr_map to succeed
ASSERT_EQ(gdr_map(g, aligned_A_mh, &aligned_A_bar_ptr, aligned_A_size), 0);
// Test accessing the mapping
int *aligned_A_map_ptr = (int *)aligned_A_bar_ptr;
aligned_A_map_ptr[0] = 7;
// The first allocation and d_A should share a GPU page. We should make
// sure that freeing the first allocation would not accidentally unmap
// d_aligned_A as the d_aligned_A mapping starts from the next GPU page.
gdr_mh_t fa_mh = null_mh;
ASSERT_EQ(gdr_pin_buffer(g, d_fa, fa_size, 0, 0, &fa_mh), 0);
ASSERT_NEQ(fa_mh, null_mh);
void *fa_bar_ptr = NULL;
ASSERT_EQ(gdr_map(g, fa_mh, &fa_bar_ptr, fa_size), 0);
ASSERTDRV(gpu_mem_free(&fa_mhandle));
// Test accessing aligned_A_map_ptr again. This should not cause segmentation fault.
aligned_A_map_ptr[0] = 9;
ASSERT_EQ(gdr_unpin_buffer(g, aligned_A_mh), 0);
ASSERT_EQ(gdr_close(g), 0);
for (int i = 0; i < cnt; ++i)
ASSERTDRV(gpu_mem_free(&A_mhandle[i]));
finalize_cuda(0);
}
/**
* This unit test is for catching issue-244
* (https://github.com/NVIDIA/gdrcopy/issues/244). The bug occurs when the
* first buffer is smaller than the GPU page size and the second buffer is
* within the same page. We expect to be able to map the first buffer. The
* second buffer cannot be mapped because it is not aligned.
*
* cuMemCreate + cuMemMap always return an aligned address. So, this test is
* for cuMemAlloc only.
*
*/
GDRCOPY_TEST(basic_small_buffers_mapping)
{
expecting_exception_signal = false;
MB();
init_cuda(0);
const size_t fa_size = GPU_PAGE_SIZE;
CUdeviceptr d_fa;
gpu_mem_handle_t fa_mhandle;
ASSERTDRV(gpu_mem_alloc(&fa_mhandle, fa_size, true, true));
d_fa = fa_mhandle.ptr;
print_dbg("Allocated d_fa=%#llx, size=%zu\n", d_fa, fa_size);
const size_t buffer_size = sizeof(uint64_t);
CUdeviceptr d_A[2];
d_A[0] = d_fa;
d_A[1] = d_fa + buffer_size;
gdr_t g = gdr_open_safe();
// Pin both buffers.
print_dbg("Try pinning d_A[0] and d_A[1].\n");
gdr_mh_t A_mh[2];
A_mh[0] = null_mh;
A_mh[1] = null_mh;
ASSERT_EQ(gdr_pin_buffer(g, d_A[0], buffer_size, 0, 0, &A_mh[0]), 0);
ASSERT_EQ(gdr_pin_buffer(g, d_A[1], buffer_size, 0, 0, &A_mh[1]), 0);
ASSERT_NEQ(A_mh[0], null_mh);
ASSERT_NEQ(A_mh[1], null_mh);
void *A_bar_ptr[2];
A_bar_ptr[0] = NULL;
A_bar_ptr[1] = NULL;
// Expect gdr_map to pass
ASSERT_EQ(gdr_map(g, A_mh[0], &A_bar_ptr[0], buffer_size), 0);
print_dbg("Mapping d_A[0] passed as expected.\n");
// Expect gdr_map to fail due to unaligned mapping
ASSERT_NEQ(gdr_map(g, A_mh[1], &A_bar_ptr[1], buffer_size), 0);
print_dbg("Mapping d_A[1] failed as expected.\n");
ASSERT_EQ(gdr_unmap(g, A_mh[0], A_bar_ptr[0], buffer_size), 0);
ASSERT_EQ(gdr_unpin_buffer(g, A_mh[0]), 0);
ASSERT_EQ(gdr_unpin_buffer(g, A_mh[1]), 0);
ASSERT_EQ(gdr_close(g), 0);
ASSERTDRV(gpu_mem_free(&fa_mhandle));
finalize_cuda(0);
}
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void data_validation()
{
expecting_exception_signal = false;
MB();
init_cuda(0);
filter_fn();
const size_t _size = 256*1024+16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
print_dbg("buffer size: %zu\n", size);
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
uint32_t *init_buf = new uint32_t[size / sizeof(uint32_t)];
uint32_t *copy_buf = new uint32_t[size / sizeof(uint32_t)];
init_hbuf_walking_bit(init_buf, size);
memset(copy_buf, 0xA5, size);
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
CUdeviceptr d_ptr = d_A;
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
ASSERT(!info.mapped);
void *bar_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
ASSERT(info.mapped);
int off = d_ptr - info.va;
print_dbg("off: %d\n", off);
uint32_t *buf_ptr = (uint32_t *)((char *)bar_ptr + off);
print_dbg("check 1: MMIO CPU initialization + read back via cuMemcpy D->H\n");
init_hbuf_walking_bit(buf_ptr, size);
ASSERTDRV(cuMemcpyDtoH(copy_buf, d_ptr, size));
ASSERT_EQ(compare_buf(init_buf, copy_buf, size), 0);
memset(copy_buf, 0xA5, size);
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
print_dbg("check 2: gdr_copy_to_bar() + read back via cuMemcpy D->H\n");
gdr_copy_to_mapping(mh, buf_ptr, init_buf, size);
ASSERTDRV(cuMemcpyDtoH(copy_buf, d_ptr, size));
ASSERT_EQ(compare_buf(init_buf, copy_buf, size), 0);
memset(copy_buf, 0xA5, size);
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
print_dbg("check 3: gdr_copy_to_bar() + read back via gdr_copy_from_bar()\n");
gdr_copy_to_mapping(mh, buf_ptr, init_buf, size);
gdr_copy_from_mapping(mh, copy_buf, buf_ptr, size);
ASSERT_EQ(compare_buf(init_buf, copy_buf, size), 0);
memset(copy_buf, 0xA5, size);
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
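// Sweep a few dword and byte offsets so gdr_copy_to/from_mapping are
// exercised with different (mis)alignments of the BAR mapping and the host
// buffer.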
int offset_array[] = { 1, 2, 3, 4, 5, 6, 7, 11, 129, 1023 };
const int num_offsets = (int)(sizeof(offset_array) / sizeof(offset_array[0]));
for (int i = 0; i < num_offsets; ++i) {
int extra_dwords = offset_array[i];
int extra_off = extra_dwords * sizeof(uint32_t);
print_dbg("check 4.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d dwords offset on mapping\n", i, extra_dwords);
gdr_copy_to_mapping(mh, buf_ptr + extra_dwords, init_buf, size - extra_off);
gdr_copy_from_mapping(mh, copy_buf, buf_ptr + extra_dwords, size - extra_off);
ASSERT_EQ(compare_buf(init_buf, copy_buf, size - extra_off), 0);
memset(copy_buf, 0xA5, size);
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
extra_off = offset_array[i];
print_dbg("check 5.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d bytes offset on mapping\n", i, extra_off);
gdr_copy_to_mapping(mh, (char*)buf_ptr + extra_off, init_buf, size - extra_off);
gdr_copy_from_mapping(mh, copy_buf, (char*)buf_ptr + extra_off, size - extra_off);
ASSERT_EQ(compare_buf(init_buf, copy_buf, size - extra_off), 0);
memset(copy_buf, 0xA5, size);
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
extra_dwords = offset_array[i];
extra_off = extra_dwords * sizeof(uint32_t);
print_dbg("check 6.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d dwords offset on host buffer\n", i, extra_dwords);
gdr_copy_to_mapping(mh, buf_ptr, init_buf + extra_dwords, size - extra_off);
gdr_copy_from_mapping(mh, copy_buf + extra_dwords, buf_ptr, size - extra_off);
ASSERT_EQ(compare_buf(init_buf + extra_dwords, copy_buf + extra_dwords, size - extra_off), 0);
memset(copy_buf, 0xA5, size);
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
extra_off = offset_array[i];
print_dbg("check 7.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d bytes offset on host buffer\n", i, extra_off);
gdr_copy_to_mapping(mh, buf_ptr, (char *)init_buf + extra_off, size - extra_off);
gdr_copy_from_mapping(mh, (char *)copy_buf + extra_off, buf_ptr, size - extra_off);
ASSERT_EQ(compare_buf((uint32_t *)((uintptr_t)init_buf + extra_off), (uint32_t *)((uintptr_t)copy_buf + extra_off), size - extra_off), 0);
memset(copy_buf, 0xA5, size);
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
extra_dwords = offset_array[i];
extra_off = extra_dwords * sizeof(uint32_t);
print_dbg("check 8.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d dwords offset on both mapping and host buffer\n", i, extra_dwords);
gdr_copy_to_mapping(mh, buf_ptr + extra_dwords, init_buf + extra_dwords, size - extra_off);
gdr_copy_from_mapping(mh, copy_buf + extra_dwords, buf_ptr + extra_dwords, size - extra_off);
ASSERT_EQ(compare_buf(init_buf + extra_dwords, copy_buf + extra_dwords, size - extra_off), 0);
memset(copy_buf, 0xA5, size);
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
extra_off = offset_array[i];
print_dbg("check 9.%d: gdr_copy_to_bar() + read back via gdr_copy_from_bar() + %d bytes offset on both mapping and host buffer\n", i, extra_off);
gdr_copy_to_mapping(mh, (char *)buf_ptr + extra_off, (char *)init_buf + extra_off, size - extra_off);
gdr_copy_from_mapping(mh, (char *)copy_buf + extra_off, (char *)buf_ptr + extra_off, size - extra_off);
ASSERT_EQ(compare_buf((uint32_t *)((uintptr_t)init_buf + extra_off), (uint32_t *)((uintptr_t)copy_buf + extra_off), size - extra_off), 0);
memset(copy_buf, 0xA5, size);
ASSERTDRV(cuMemsetD8(d_A, 0xA5, size));
ASSERTDRV(cuCtxSynchronize());
}
print_dbg("unmapping\n");
ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
print_dbg("unpinning\n");
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
ASSERT_EQ(gdr_close(g), 0);
delete [] copy_buf;
delete [] init_buf;
ASSERTDRV(gfree_fn(&mhandle));
finalize_cuda(0);
}
GDRCOPY_TEST(data_validation_cumemalloc)
{
data_validation<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(data_validation_vmmalloc)
{
data_validation<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* This unit test ensures that accessing a gdr_map'ed region is no longer possible
* after gdr_close.
*
* Step:
* 1. Initialize CUDA and gdrcopy
* 2. Do gdr_map(..., &bar_ptr, ...)
* 3. Do gdr_close
* 4. Attempting to access bar_ptr after step 3 should fail
*/
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_access_after_gdr_close()
{
expecting_exception_signal = false;
MB();
struct sigaction act;
act.sa_handler = exception_signal_handle;
sigemptyset(&act.sa_mask);
act.sa_flags = 0;
sigaction(SIGBUS, &act, 0);
srand(time(NULL));
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
int mydata = (rand() % 1000) + 1;
init_cuda(0);
filter_fn();
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
ASSERTDRV(cuCtxSynchronize());
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
CUdeviceptr d_ptr = d_A;
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
print_dbg("Mapping bar1\n");
void *bar_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
int off = d_ptr - info.va;
volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);
// Write data
print_dbg("Writing %d into buf_ptr[0]\n", mydata);
buf_ptr[0] = mydata;
print_dbg("Calling gdr_close\n");
ASSERT_EQ(gdr_close(g), 0);
print_dbg("Trying to read buf_ptr[0] after gdr_close\n");
expecting_exception_signal = true;
MB();
int data_from_buf_ptr = buf_ptr[0];
MB();
expecting_exception_signal = false;
MB();
ASSERT_NEQ(data_from_buf_ptr, mydata);
finalize_cuda(0);
}
GDRCOPY_TEST(invalidation_access_after_gdr_close_cumemalloc)
{
invalidation_access_after_gdr_close<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_access_after_gdr_close_vmmalloc)
{
invalidation_access_after_gdr_close<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* This unit test ensures that accessing a gdr_map'ed region is no longer possible
* after gpuMemFree.
*
* Step:
* 1. Initialize CUDA and gdrcopy
* 2. Do gdr_map(..., &bar_ptr, ...)
* 3. Do gpuMemFree
* 4. Attempting to access bar_ptr after step 3 should fail
*/
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_access_after_free()
{
// Waive this test until we provide a way to query whether persistent
// mapping is being used.
exit(EXIT_WAIVED);
expecting_exception_signal = false;
MB();
struct sigaction act;
act.sa_handler = exception_signal_handle;
sigemptyset(&act.sa_mask);
act.sa_flags = 0;
sigaction(SIGBUS, &act, 0);
srand(time(NULL));
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
int mydata = (rand() % 1000) + 1;
init_cuda(0);
filter_fn();
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
ASSERTDRV(cuCtxSynchronize());
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
CUdeviceptr d_ptr = d_A;
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
print_dbg("Mapping bar1\n");
void *bar_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
int off = d_ptr - info.va;
volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);
// Write data
print_dbg("Writing %d into buf_ptr[0]\n", mydata);
buf_ptr[0] = mydata;
print_dbg("Calling gpuMemFree\n");
ASSERTDRV(gfree_fn(&mhandle));
print_dbg("Trying to read buf_ptr[0] after gpuMemFree\n");
expecting_exception_signal = true;
MB();
int data_from_buf_ptr = buf_ptr[0];
MB();
expecting_exception_signal = false;
MB();
ASSERT_NEQ(data_from_buf_ptr, mydata);
ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
ASSERT_EQ(gdr_close(g), 0);
finalize_cuda(0);
}
GDRCOPY_TEST(invalidation_access_after_free_cumemalloc)
{
invalidation_access_after_free<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_access_after_free_vmmalloc)
{
invalidation_access_after_free<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* This unit test ensures that gpuMemFree destroys only the mapping that
* corresponds to the freed allocation.
*
* Step:
* 1. Initialize CUDA and gdrcopy
* 2. cuMemAlloc(&d_A, ...); cuMemAlloc(&d_B, ...)
* 3. Do gdr_map(..., &bar_ptr_A, ...) of d_A
* 4. Do gdr_map(..., &bar_ptr_B, ...) of d_B
* 5. Do gpuMemFree(d_A)
* 6. Verify that bar_ptr_B is still accessible
*/
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_two_mappings()
{
expecting_exception_signal = false;
MB();
srand(time(NULL));
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
int mydata = (rand() % 1000) + 1;
init_cuda(0);
filter_fn();
CUdeviceptr d_A[2];
gpu_mem_handle_t mhandle[2];
for (int i = 0; i < 2; ++i) {
ASSERTDRV(galloc_fn(&mhandle[i], size, true, true));
d_A[i] = mhandle[i].ptr;
ASSERTDRV(cuMemsetD8(d_A[i], 0x95, size));
}
ASSERTDRV(cuCtxSynchronize());
gdr_t g = gdr_open_safe();
gdr_mh_t mh[2];
volatile int *buf_ptr[2];
void *bar_ptr[2];
print_dbg("Mapping bar1\n");
for (int i = 0; i < 2; ++i) {
CUdeviceptr d_ptr = d_A[i];
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh[i]), 0);
ASSERT_NEQ(mh[i], null_mh);
bar_ptr[i] = NULL;
ASSERT_EQ(gdr_map(g, mh[i], &bar_ptr[i], size), 0);
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh[i], &info), 0);
int off = d_ptr - info.va;
buf_ptr[i] = (volatile int *)((char *)bar_ptr[i] + off);
}
// Write data
print_dbg("Writing data to both mappings %d and %d respectively\n", mydata, mydata + 1);
buf_ptr[0][0] = mydata;
buf_ptr[1][0] = mydata + 1;
print_dbg("Validating that we can read the data back\n");
ASSERT_EQ(buf_ptr[0][0], mydata);
ASSERT_EQ(buf_ptr[1][0], mydata + 1);
print_dbg("gpuMemFree and thus destroying the first mapping\n");
ASSERTDRV(gfree_fn(&mhandle[0]));
print_dbg("Trying to read and validate the data from the second mapping after the first mapping has been destroyed\n");
ASSERT_EQ(buf_ptr[1][0], mydata + 1);
ASSERTDRV(gfree_fn(&mhandle[1]));
for (int i = 0; i < 2; ++i) {
ASSERT_EQ(gdr_unmap(g, mh[i], bar_ptr[i], size), 0);
ASSERT_EQ(gdr_unpin_buffer(g, mh[i]), 0);
}
ASSERT_EQ(gdr_close(g), 0);
finalize_cuda(0);
}
GDRCOPY_TEST(invalidation_two_mappings_cumemalloc)
{
invalidation_two_mappings<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_two_mappings_vmmalloc)
{
invalidation_two_mappings<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* This unit test is intended to check the security hole that originates from not
* doing invalidation correctly. In a nutshell, it ensures that the parent
* process cannot spy on the child process.
*
* Step:
* 1. Fork the process
* 2.C Child: Waiting for parent's signal before continuing
*
* 2.P Parent: Initialize CUDA and gdrcopy
* 3.P Parent: Do gdr_map then gpuMemFree without gdr_unmap
* 4.P Parent: Signal child and wait for child's signal
*
* 3.C Child: Initialize CUDA and gdrcopy
* 4.C Child: Do gdr_map, signal parent, and wait for parent's signal
*
* 5.P Parent: Check whether it can still access its gdr_map'ed data and
* compare it with the data written by the child. If gdrdrv does not handle
* invalidation properly, the child's data will be leaked to the parent.
*/
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_fork_access_after_free()
{
expecting_exception_signal = false;
MB();
int filedes_0[2];
int filedes_1[2];
int read_fd;
int write_fd;
ASSERT_NEQ(pipe(filedes_0), -1);
ASSERT_NEQ(pipe(filedes_1), -1);
srand(time(NULL));
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
const char *myname;
fflush(stdout);
fflush(stderr);
pid_t pid = fork();
ASSERT(pid >= 0);
myname = pid == 0 ? "child" : "parent";
print_dbg("%s: Start\n", myname);
if (pid == 0) {
close(filedes_0[0]);
close(filedes_1[1]);
read_fd = filedes_1[0];
write_fd = filedes_0[1];
int cont = 0;
do {
print_dbg("%s: waiting for cont signal from parent\n", myname);
ASSERT_EQ(read(read_fd, &cont, sizeof(int)), sizeof(int));
print_dbg("%s: receive cont signal %d from parent\n", myname, cont);
} while (cont != 1);
}
else {
close(filedes_0[1]);
close(filedes_1[0]);
read_fd = filedes_0[0];
write_fd = filedes_1[1];
struct sigaction act;
act.sa_handler = exception_signal_handle;
sigemptyset(&act.sa_mask);
act.sa_flags = 0;
sigaction(SIGBUS, &act, 0);
}
int mydata = (rand() % 1000) + 1;
// Make sure that parent's and child's mydata are different.
// Remember that we do srand before fork.
if (pid == 0)
mydata += 10;
init_cuda(0);
filter_fn();
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
ASSERTDRV(cuCtxSynchronize());
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
CUdeviceptr d_ptr = d_A;
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
void *bar_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
int off = d_ptr - info.va;
volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);
print_dbg("%s: writing buf_ptr[0] with %d\n", myname, mydata);
buf_ptr[0] = mydata;
if (pid == 0) {
print_dbg("%s: signal parent that I have written\n", myname);
ASSERT_EQ(write(write_fd, &mydata, sizeof(int)), sizeof(int));
int cont = 0;
print_dbg("%s: waiting for signal from parent before calling gpuMemFree\n", myname);
do {
ASSERT_NEQ(read(read_fd, &cont, sizeof(int)), -1);
} while (cont != 1);
}
print_dbg("%s: read buf_ptr[0] before gpuMemFree get %d\n", myname, buf_ptr[0]);
print_dbg("%s: calling gpuMemFree\n", myname);
ASSERTDRV(gfree_fn(&mhandle));
if (pid > 0) {
int msg = 1;
ASSERT_EQ(write(write_fd, &msg, sizeof(int)), sizeof(int));
int child_data = 0;
print_dbg("%s: waiting for child write signal\n", myname);
do {
ASSERT_EQ(read(read_fd, &child_data, sizeof(int)), sizeof(int));
} while (child_data == 0);
print_dbg("%s: trying to read buf_ptr[0]\n", myname);
expecting_exception_signal = true;
MB();
int data_from_buf_ptr = buf_ptr[0];
MB();
expecting_exception_signal = false;
MB();
print_dbg("%s: read buf_ptr[0] after child write get %d\n", myname, data_from_buf_ptr);
print_dbg("%s: child data is %d\n", myname, child_data);
ASSERT_EQ(write(write_fd, &msg, sizeof(int)), sizeof(int));
ASSERT_NEQ(child_data, data_from_buf_ptr);
}
ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
ASSERT_EQ(gdr_close(g), 0);
finalize_cuda(0);
}
GDRCOPY_TEST(invalidation_fork_access_after_free_cumemalloc)
{
invalidation_fork_access_after_free<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_fork_access_after_free_vmmalloc)
{
invalidation_fork_access_after_free<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* This unit test makes sure that child processes cannot spy on the parent
* process if the parent forks without doing gdr_unmap first.
*
* Step:
* 1. Initialize CUDA and gdrcopy
* 2. Do gdr_map
* 3. Fork the process
*
* 4.P Parent: Waiting for child to exit
*
* 4.C Child: Attempt to access the gdr_map'ed data and compare it with what
* the parent writes into that region. If gdrdrv does not invalidate the
* mapping correctly, the child can spy on the parent.
*/
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_fork_after_gdr_map()
{
expecting_exception_signal = false;
MB();
int filedes_0[2];
int filedes_1[2];
int read_fd;
int write_fd;
ASSERT_NEQ(pipe(filedes_0), -1);
ASSERT_NEQ(pipe(filedes_1), -1);
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
const char *myname;
init_cuda(0);
filter_fn();
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
ASSERTDRV(cuCtxSynchronize());
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
CUdeviceptr d_ptr = d_A;
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
void *bar_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
int off = d_ptr - info.va;
volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);
fflush(stdout);
fflush(stderr);
pid_t pid = fork();
ASSERT(pid >= 0);
myname = pid == 0 ? "child" : "parent";
print_dbg("%s: Start\n", myname);
srand(time(NULL));
int mynumber = rand() % 1000 + 1;
if (pid == 0) {
close(filedes_0[0]);
close(filedes_1[1]);
read_fd = filedes_1[0];
write_fd = filedes_0[1];
srand(rand());
int cont = 0;
do {
print_dbg("%s: waiting for cont signal from parent\n", myname);
ASSERT_EQ(read(read_fd, &cont, sizeof(int)), sizeof(int));
print_dbg("%s: receive cont signal %d from parent\n", myname, cont);
} while (cont != 1);
}
else {
close(filedes_0[1]);
close(filedes_1[0]);
read_fd = filedes_0[0];
write_fd = filedes_1[1];
}
if (pid > 0) {
print_dbg("%s: writing buf_ptr[0] with %d\n", myname, mynumber);
buf_ptr[0] = mynumber;
}
if (pid == 0) {
struct sigaction act;
act.sa_handler = exception_signal_handle;
sigemptyset(&act.sa_mask);
act.sa_flags = 0;
sigaction(SIGBUS, &act, 0);
sigaction(SIGSEGV, &act, 0);
expecting_exception_signal = true;
MB();
}
print_dbg("%s: trying to read buf_ptr[0]\n", myname);
int data_from_buf_ptr = buf_ptr[0];
print_dbg("%s: read buf_ptr[0] get %d\n", myname, data_from_buf_ptr);
if (pid == 0) {
MB();
expecting_exception_signal = false;
MB();
print_dbg("%s: should not be able to read buf_ptr[0] anymore!! aborting!!\n", myname);
exit(EXIT_FAILURE);
}
if (pid > 0) {
print_dbg("%s: signaling child\n", myname);
int msg = 1;
ASSERT_EQ(write(write_fd, &msg, sizeof(int)), sizeof(int));
print_dbg("%s: waiting for child to exit\n", myname);
// The child should exit via its signal handler (SIGBUS/SIGSEGV), which calls exit(EXIT_SUCCESS)
int child_exit_status = -EINVAL;
ASSERT(wait(&child_exit_status) == pid);
ASSERT_EQ(child_exit_status, EXIT_SUCCESS);
print_dbg("%s: trying to read buf_ptr[0] after child exits\n", myname);
data_from_buf_ptr = buf_ptr[0];
print_dbg("%s: read buf_ptr[0] after child exits get %d\n", myname, data_from_buf_ptr);
ASSERT_EQ(data_from_buf_ptr, mynumber);
ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
ASSERTDRV(gfree_fn(&mhandle));
ASSERT_EQ(gdr_close(g), 0);
}
finalize_cuda(0);
}
GDRCOPY_TEST(invalidation_fork_after_gdr_map_cumemalloc)
{
invalidation_fork_after_gdr_map<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_fork_after_gdr_map_vmmalloc)
{
invalidation_fork_after_gdr_map<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* This unit test ensures that a child cannot do gdr_map on what the parent has
* prepared with gdr_pin_buffer. This emulates the situation where the parent
* has called gdr_pin_buffer, but not gdr_map, before forking.
*
* Step:
* 1. Initialize CUDA and gdrcopy
* 2. Do gdr_pin_buffer
* 3. Fork the process
*
* 4.P Parent: Waiting for child to exit
*
* 4.C Child: Attempt to do gdr_map on the parent's pinned buffer. gdrdrv is
* expected to prevent this case so that the child process cannot spy on
* the parent's GPU data.
*/
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_fork_child_gdr_map_parent()
{
expecting_exception_signal = false;
MB();
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
const char *myname;
init_cuda(0);
filter_fn();
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
ASSERTDRV(cuCtxSynchronize());
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
CUdeviceptr d_ptr = d_A;
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
fflush(stdout);
fflush(stderr);
pid_t pid = fork();
ASSERT(pid >= 0);
myname = pid == 0 ? "child" : "parent";
print_dbg("%s: Start\n", myname);
if (pid == 0) {
void *bar_ptr = NULL;
print_dbg("%s: attempting to gdr_map parent's pinned GPU memory\n", myname);
ASSERT_NEQ(gdr_map(g, mh, &bar_ptr, size), 0);
print_dbg("%s: cannot do gdr_map as expected\n", myname);
}
else {
int child_exit_status = -EINVAL;
ASSERT(wait(&child_exit_status) == pid);
ASSERT_EQ(child_exit_status, EXIT_SUCCESS);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
ASSERTDRV(gfree_fn(&mhandle));
ASSERT_EQ(gdr_close(g), 0);
finalize_cuda(0);
}
}
GDRCOPY_TEST(invalidation_fork_child_gdr_map_parent_cumemalloc)
{
invalidation_fork_child_gdr_map_parent<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_fork_child_gdr_map_parent_vmmalloc)
{
invalidation_fork_child_gdr_map_parent<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* This unit test verifies that gpuMemFree of one process will not
* unintentionally invalidate mappings in other processes.
*
* Step:
* 1. Fork
*
* 2.P Parent: Init CUDA and gdrcopy, and do gdr_map.
* 3.P Parent: Wait for child's signal.
*
* 2.C Child: Init CUDA and gdrcopy, and do gdr_map.
* 3.C Child: Do gpuMemFree. This should unmap the gdr_map'ed region.
* 4.C Child: Signal parent.
*
* 4.P Parent: Verify that it can still access its gdr_map'ed region. If gdrdrv
* is not implemented correctly, it might invalidate the parent's mapping as
* well.
*/
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_fork_map_and_free()
{
expecting_exception_signal = false;
MB();
int filedes_0[2];
int filedes_1[2];
int read_fd;
int write_fd;
ASSERT_NEQ(pipe(filedes_0), -1);
ASSERT_NEQ(pipe(filedes_1), -1);
srand(time(NULL));
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
const char *myname;
fflush(stdout);
fflush(stderr);
pid_t pid = fork();
ASSERT(pid >= 0);
myname = pid == 0 ? "child" : "parent";
print_dbg("%s: Start\n", myname);
if (pid == 0) {
close(filedes_0[0]);
close(filedes_1[1]);
read_fd = filedes_1[0];
write_fd = filedes_0[1];
srand(rand());
}
else {
close(filedes_0[1]);
close(filedes_1[0]);
read_fd = filedes_0[0];
write_fd = filedes_1[1];
}
int mydata = (rand() % 1000) + 1;
init_cuda(0);
filter_fn();
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
ASSERTDRV(cuCtxSynchronize());
gdr_t g = gdr_open_safe();
gdr_mh_t mh;
CUdeviceptr d_ptr = d_A;
// tokens are optional in CUDA 6.0
// waive the test if GPUDirect RDMA is not enabled
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
void *bar_ptr = NULL;
ASSERT_EQ(gdr_map(g, mh, &bar_ptr, size), 0);
gdr_info_t info;
ASSERT_EQ(gdr_get_info(g, mh, &info), 0);
int off = d_ptr - info.va;
volatile int *buf_ptr = (volatile int *)((char *)bar_ptr + off);
print_dbg("%s: writing buf_ptr[0] with %d\n", myname, mydata);
buf_ptr[0] = mydata;
if (pid == 0) {
print_dbg("%s: calling gpuMemFree\n", myname);
ASSERTDRV(gfree_fn(&mhandle));
print_dbg("%s: signal parent that I have called gpuMemFree\n", myname);
int msg = 1;
ASSERT_EQ(write(write_fd, &msg, sizeof(int)), sizeof(int));
}
else {
int cont = 0;
do {
print_dbg("%s: waiting for signal from child\n", myname);
ASSERT_EQ(read(read_fd, &cont, sizeof(int)), sizeof(int));
print_dbg("%s: received cont signal %d from child\n", myname, cont);
} while (cont == 0);
print_dbg("%s: trying to read buf_ptr[0]\n", myname);
int data_from_buf_ptr = buf_ptr[0];
print_dbg("%s: read buf_ptr[0] get %d\n", myname, data_from_buf_ptr);
ASSERT_EQ(data_from_buf_ptr, mydata);
}
ASSERT_EQ(gdr_unmap(g, mh, bar_ptr, size), 0);
ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
if (pid > 0)
ASSERTDRV(gfree_fn(&mhandle));
ASSERT_EQ(gdr_close(g), 0);
finalize_cuda(0);
}
GDRCOPY_TEST(invalidation_fork_map_and_free_cumemalloc)
{
invalidation_fork_map_and_free<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_fork_map_and_free_vmmalloc)
{
invalidation_fork_map_and_free<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* Process A can intentionally share an fd with Process B through a unix
* socket. This mechanism could be used to share gdrcopy mappings. Since CUDA
* contexts are not shareable between processes, gdrcopy handles are expected
* to be unshareable as well. This unit test verifies that a gdr_open fd shared
* from another process is not usable.
*
* Step:
* 1. Fork
*
* 2.P Parent: Init CUDA and gdrcopy.
* 3.P Parent: Share gdr_open's fd to child through unix socket.
*
* 2.C Child: Init CUDA.
* 3.C Child: Receive the fd from parent.
* 4.C Child: Attempt to do gdr_pin_buffer using this fd. gdrdrv should not
* allow it.
*/
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_unix_sock_shared_fd_gdr_pin_buffer()
{
expecting_exception_signal = false;
MB();
pid_t pid;
int pair[2];
int fd = -1;
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
ASSERT_EQ(socketpair(PF_UNIX, SOCK_DGRAM, 0, pair), 0);
fflush(stdout);
fflush(stderr);
pid = fork();
ASSERT(pid >= 0);
const char *myname = pid == 0 ? "child" : "parent";
print_dbg("%s: Start\n", myname);
init_cuda(0);
filter_fn();
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
ASSERTDRV(cuCtxSynchronize());
CUdeviceptr d_ptr = d_A;
if (pid == 0) {
close(pair[1]);
print_dbg("%s: Receiving fd from parent via unix socket\n", myname);
fd = recvfd(pair[0]);
ASSERT(fd >= 0);
print_dbg("%s: Got fd %d\n", myname, fd);
print_dbg("%s: Converting fd to gdr_t\n", myname);
struct gdr _g;
_g.fd = fd;
gdr_t g = &_g;
print_dbg("%s: Trying to do gdr_pin_buffer with the received fd\n", myname);
gdr_mh_t mh;
ASSERT_NEQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
print_dbg("%s: Cannot do gdr_pin_buffer with the received fd as expected\n", myname);
}
else {
close(pair[0]);
print_dbg("%s: Calling gdr_open\n", myname);
gdr_t g = gdr_open_safe();
fd = g->fd;
print_dbg("%s: Extracted fd from gdr_t got fd %d\n", myname, fd);
print_dbg("%s: Sending fd to child via unix socket\n", myname);
ASSERT(sendfd(pair[1], fd) >= 0);
print_dbg("%s: Waiting for child to finish\n", myname);
int child_exit_status = -EINVAL;
ASSERT(wait(&child_exit_status) == pid);
ASSERT_EQ(child_exit_status, EXIT_SUCCESS);
}
finalize_cuda(0);
}
GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_pin_buffer_cumemalloc)
{
invalidation_unix_sock_shared_fd_gdr_pin_buffer<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_pin_buffer_vmmalloc)
{
invalidation_unix_sock_shared_fd_gdr_pin_buffer<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* Process A can intentionally share an fd with Process B through a unix
* socket. This mechanism could be used to share gdrcopy mappings. Since CUDA
* contexts are not shareable between processes, gdrcopy handles are expected
* to be unshareable as well. This unit test verifies that a gdr_open fd shared
* from another process is not usable.
*
* Step:
* 1. Fork
*
* 2.P Parent: Init CUDA and gdrcopy, and do gdr_pin_buffer
* 3.P Parent: Share gdr_open's fd to child through unix socket.
* 4.P Parent: Also share the handle returned from gdr_pin_buffer with child.
*
* 2.C Child: Init CUDA.
* 3.C Child: Receive the fd and handle from parent.
* 4.C Child: Attempt to do gdr_map using this fd and handle. gdrdrv should not
* allow it.
*/
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void invalidation_unix_sock_shared_fd_gdr_map()
{
expecting_exception_signal = false;
MB();
int filedes_0[2];
int filedes_1[2];
int read_fd;
int write_fd;
ASSERT_NEQ(pipe(filedes_0), -1);
ASSERT_NEQ(pipe(filedes_1), -1);
pid_t pid;
int pair[2];
int fd = -1;
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
ASSERT_EQ(socketpair(PF_UNIX, SOCK_DGRAM, 0, pair), 0);
fflush(stdout);
fflush(stderr);
pid = fork();
ASSERT(pid >= 0);
const char *myname = pid == 0 ? "child" : "parent";
print_dbg("%s: Start\n", myname);
if (pid == 0) {
close(filedes_0[0]);
close(filedes_1[1]);
read_fd = filedes_1[0];
write_fd = filedes_0[1];
srand(rand());
}
else {
close(filedes_0[1]);
close(filedes_1[0]);
read_fd = filedes_0[0];
write_fd = filedes_1[1];
}
init_cuda(0);
filter_fn();
CUdeviceptr d_A;
gpu_mem_handle_t mhandle;
ASSERTDRV(galloc_fn(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuMemsetD8(d_A, 0x95, size));
ASSERTDRV(cuCtxSynchronize());
CUdeviceptr d_ptr = d_A;
if (pid == 0) {
close(pair[1]);
print_dbg("%s: Receiving fd from parent via unix socket\n", myname);
fd = recvfd(pair[0]);
ASSERT(fd >= 0);
print_dbg("%s: Got fd %d\n", myname, fd);
print_dbg("%s: Converting fd to gdr_t\n", myname);
struct gdr _g;
_g.fd = fd;
gdr_t g = &_g;
print_dbg("%s: Receiving gdr_memh_t from parent\n", myname);
gdr_memh_t memh;
ASSERT_EQ(read(read_fd, &memh, sizeof(gdr_memh_t)), sizeof(gdr_memh_t));
print_dbg("%s: Got handle 0x%lx\n", myname, memh.handle);
print_dbg("%s: Converting gdr_memh_t to gdr_mh_t\n", myname);
gdr_mh_t mh;
mh.h = (unsigned long)(&memh);
print_dbg("%s: Attempting gdr_map\n", myname);
void *bar_ptr = NULL;
ASSERT_NEQ(gdr_map(g, mh, &bar_ptr, size), 0);
print_dbg("%s: Cannot do gdr_map as expected\n", myname);
}
else {
close(pair[0]);
print_dbg("%s: Calling gdr_open\n", myname);
gdr_t g = gdr_open_safe();
print_dbg("%s: Calling gdr_pin_buffer\n", myname);
gdr_mh_t mh;
ASSERT_EQ(gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh), 0);
ASSERT_NEQ(mh, null_mh);
fd = g->fd;
print_dbg("%s: Extracted fd from gdr_t got fd %d\n", myname, fd);
print_dbg("%s: Sending fd to child via unix socket\n", myname);
ASSERT(sendfd(pair[1], fd) >= 0);
gdr_memh_t *memh = (gdr_memh_t *)mh.h;
print_dbg("%s: Extracted gdr_memh_t from gdr_mh_t got handle 0x%lx\n", myname, memh->handle);
print_dbg("%s: Sending gdr_memh_t to child\n", myname);
ASSERT_EQ(write(write_fd, memh, sizeof(gdr_memh_t)), sizeof(gdr_memh_t));
print_dbg("%s: Waiting for child to finish\n", myname);
int child_exit_status = -EINVAL;
ASSERT(wait(&child_exit_status) == pid);
ASSERT_EQ(child_exit_status, EXIT_SUCCESS);
}
finalize_cuda(0);
}
GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_map_cumemalloc)
{
invalidation_unix_sock_shared_fd_gdr_map<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_map_vmmalloc)
{
invalidation_unix_sock_shared_fd_gdr_map<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
/**
* Although the use of P2P tokens has been marked as deprecated, CUDA still
* supports them. This unit test ensures that Process A cannot access the GPU
* memory of Process B by using tokens, which could be generated by brute force.
*
* Step:
* 1. Fork the process
*
* 2.P Parent: Allocate GPU memory and get tokens.
* 3.P Parent: Send the cuMemAlloc'd ptr and the tokens to Child.
* 4.P Parent: Waiting for Child to exit.
*
* 2.C Child: Waiting for ptr and tokens from Parent
* 3.C Child: Attempt gdr_pin_buffer with the ptr and tokens. We expect
* gdr_pin_buffer to fail.
*/
GDRCOPY_TEST(invalidation_fork_child_gdr_pin_parent_with_tokens)
{
expecting_exception_signal = false;
MB();
int filedes_0[2];
int filedes_1[2];
int read_fd;
int write_fd;
ASSERT_NEQ(pipe(filedes_0), -1);
ASSERT_NEQ(pipe(filedes_1), -1);
const size_t _size = sizeof(int) * 16;
const size_t size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
const char *myname;
fflush(stdout);
fflush(stderr);
CUdeviceptr d_A;
CUDA_POINTER_ATTRIBUTE_P2P_TOKENS tokens = {0,0};
pid_t pid = fork();
ASSERT(pid >= 0);
myname = pid == 0 ? "child" : "parent";
print_dbg("%s: Start\n", myname);
if (pid == 0) {
close(filedes_0[0]);
close(filedes_1[1]);
read_fd = filedes_1[0];
write_fd = filedes_0[1];
gdr_t g = gdr_open_safe();
ASSERT_EQ(read(read_fd, &d_A, sizeof(CUdeviceptr)), sizeof(CUdeviceptr));
ASSERT_EQ(read(read_fd, &tokens, sizeof(CUDA_POINTER_ATTRIBUTE_P2P_TOKENS)), sizeof(CUDA_POINTER_ATTRIBUTE_P2P_TOKENS));
print_dbg("%s: Received from parent tokens.p2pToken %llu, tokens.vaSpaceToken %u\n", myname, tokens.p2pToken, tokens.vaSpaceToken);
gdr_mh_t mh;
CUdeviceptr d_ptr = d_A;
ASSERT_NEQ(gdr_pin_buffer(g, d_ptr, size, tokens.p2pToken, tokens.vaSpaceToken, &mh), 0);
}
else {
close(filedes_0[1]);
close(filedes_1[0]);
read_fd = filedes_0[0];
write_fd = filedes_1[1];
init_cuda(0);
gpu_mem_handle_t mhandle;
ASSERTDRV(gpu_mem_alloc(&mhandle, size, true, true));
d_A = mhandle.ptr;
ASSERTDRV(cuPointerGetAttribute(&tokens, CU_POINTER_ATTRIBUTE_P2P_TOKENS, d_A));
print_dbg("%s: CUDA generated tokens.p2pToken %llu, tokens.vaSpaceToken %u\n", myname, tokens.p2pToken, tokens.vaSpaceToken);
ASSERT_EQ(write(write_fd, &d_A, sizeof(CUdeviceptr)), sizeof(CUdeviceptr));
ASSERT_EQ(write(write_fd, &tokens, sizeof(CUDA_POINTER_ATTRIBUTE_P2P_TOKENS)), sizeof(CUDA_POINTER_ATTRIBUTE_P2P_TOKENS));
int child_exit_status = -EINVAL;
ASSERT(wait(&child_exit_status) == pid);
ASSERT_EQ(child_exit_status, EXIT_SUCCESS);
ASSERTDRV(gpu_mem_free(&mhandle));
finalize_cuda(0);
}
}
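/**
* Shared state for the multi-threaded tests below. One thread may perform the
* pin+map (thr_fun_setup) while another performs the unmap+unpin
* (thr_fun_teardown); the pthread barrier orders the two when use_barrier is
* set.
*/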
struct mt_test_info {
gpu_mem_handle_t mhandle;
CUdeviceptr d_buf;
void *mapped_d_buf;
size_t size;
gdr_t g;
gdr_mh_t mh;
bool use_barrier;
pthread_barrier_t barrier;
gpu_memfree_fn_t gfree_fn;
};
void *thr_fun_setup(void *data)
{
mt_test_info *pt = static_cast<mt_test_info*>(data);
ASSERT(pt);
print_dbg("pinning\n");
ASSERT_EQ(gdr_pin_buffer(pt->g, pt->d_buf, pt->size, 0, 0, &pt->mh), 0);
ASSERT_NEQ(pt->mh, null_mh);
print_dbg("mapping\n");
ASSERT_EQ(gdr_map(pt->g, pt->mh, &pt->mapped_d_buf, pt->size), 0);
if (pt->use_barrier)
pthread_barrier_wait(&pt->barrier);
return NULL;
}
void *thr_fun_teardown(void *data)
{
mt_test_info *pt = static_cast<mt_test_info*>(data);
ASSERT(pt);
if (pt->use_barrier)
pthread_barrier_wait(&pt->barrier);
print_dbg("unmapping\n");
ASSERT_EQ(gdr_unmap(pt->g, pt->mh, pt->mapped_d_buf, pt->size), 0);
pt->mapped_d_buf = 0;
print_dbg("unpinning\n");
ASSERT_EQ(gdr_unpin_buffer(pt->g, pt->mh), 0);
pt->mh = null_mh;
return NULL;
}
void *thr_fun_combined(void *data)
{
mt_test_info *pt = static_cast<mt_test_info*>(data);
ASSERT(pt);
ASSERT(!pt->use_barrier);
thr_fun_setup(data);
thr_fun_teardown(data);
return NULL;
}
void *thr_fun_cleanup(void *data)
{
mt_test_info *pt = static_cast<mt_test_info*>(data);
ASSERT(pt);
ASSERT_EQ(gdr_close(pt->g), 0);
pt->g = 0;
ASSERTDRV(pt->gfree_fn(&pt->mhandle));
pt->d_buf = 0;
return NULL;
}
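/**
* Exercises gdrcopy from child threads: one thread doing the full
* pin/map/unmap/unpin cycle, two threads splitting setup and teardown, two
* threads working concurrently on the same buffer, and finally a thread that
* closes the gdr handle and frees the GPU buffer.
*/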
template <gpu_memalloc_fn_t galloc_fn, gpu_memfree_fn_t gfree_fn, filter_fn_t filter_fn>
void basic_child_thread_pins_buffer()
{
const size_t _size = GPU_PAGE_SIZE * 16;
mt_test_info t;
memset(&t, 0, sizeof(mt_test_info));
t.size = PAGE_ROUND_UP(_size, GPU_PAGE_SIZE);
init_cuda(0);
filter_fn();
t.gfree_fn = gfree_fn;
ASSERTDRV(galloc_fn(&t.mhandle, t.size, true, true));
t.d_buf = t.mhandle.ptr;
ASSERTDRV(cuMemsetD8(t.d_buf, 0xA5, t.size));
ASSERTDRV(cuCtxSynchronize());
t.g = gdr_open_safe();
{
pthread_t tid;
t.use_barrier = false;
print_dbg("spawning single child thread\n");
ASSERT_EQ(pthread_create(&tid, NULL, thr_fun_combined, &t), 0);
ASSERT_EQ(pthread_join(tid, NULL), 0);
}
{
pthread_t tid[2];
ASSERT_EQ(pthread_barrier_init(&t.barrier, NULL, 2), 0);
t.use_barrier = true;
print_dbg("spawning two children threads, splitting setup and teardown\n");
ASSERT_EQ(pthread_create(&tid[0], NULL, thr_fun_setup, &t), 0);
ASSERT_EQ(pthread_create(&tid[1], NULL, thr_fun_teardown, &t), 0);
ASSERT_EQ(pthread_join(tid[0], NULL), 0);
ASSERT_EQ(pthread_join(tid[1], NULL), 0);
}
{
pthread_t tid[2];
t.use_barrier = false;
mt_test_info t2 = t;
print_dbg("spawning two children threads, concurrently pinning and mapping the same buffer\n");
ASSERT_EQ(pthread_create(&tid[0], NULL, thr_fun_combined, &t), 0);
ASSERT_EQ(pthread_create(&tid[1], NULL, thr_fun_combined, &t2), 0);
ASSERT_EQ(pthread_join(tid[0], NULL), 0);
ASSERT_EQ(pthread_join(tid[1], NULL), 0);
}
{
pthread_t tid;
print_dbg("spawning cleanup child thread\n");
ASSERT_EQ(pthread_create(&tid, NULL, thr_fun_cleanup, &t), 0);
ASSERT_EQ(pthread_join(tid, NULL), 0);
}
finalize_cuda(0);
}
GDRCOPY_TEST(basic_child_thread_pins_buffer_cumemalloc)
{
basic_child_thread_pins_buffer<gpu_mem_alloc, gpu_mem_free, null_filter>();
}
#if CUDA_VERSION >= 11000
// VMM with GDR support is available from CUDA 11.0
GDRCOPY_TEST(basic_child_thread_pins_buffer_vmmalloc)
{
basic_child_thread_pins_buffer<gpu_vmm_alloc, gpu_vmm_free, vmm_filter>();
}
#endif
void print_usage(const char *path)
{
cout << "Usage: " << path << " [-h][-v][-s][-l][-t <test>]" << endl;
cout << endl;
cout << "Options:" << endl;
cout << " -h Print this help text." << endl;
cout << " -v Increase report verbosity." << endl;
cout << " -s DON'T print summary report." << endl;
cout << " -l List all available tests." << endl;
cout << " -t <test> Run the specified test only." << endl;
}
void print_all_tests()
{
vector<string> tests;
gdrcopy::testsuite::get_all_test_names(tests);
cout << "List of all available tests:" << endl;
for (vector<string>::iterator it = tests.begin(); it != tests.end(); ++it)
cout << " " << *it << endl;
}
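// Illustrative invocations (assuming the binary is named "sanity"): running
// "./sanity" executes every registered test; "./sanity -v -t basic_cumemalloc"
// runs a single test with verbose output.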
int main(int argc, char *argv[])
{
int c;
bool print_summary = true;
int status;
vector<string> tests;
while ((c = getopt(argc, argv, "hvslt:")) != -1) {
switch (c) {
case 'h':
print_usage(argv[0]);
return EXIT_SUCCESS;
case 'v':
gdrcopy::test::print_dbg_msg = true;
break;
case 's':
print_summary = false;
break;
case 'l':
print_all_tests();
return EXIT_SUCCESS;
case 't':
tests.emplace_back(optarg);
break;
default:
cerr << "Invalid option" << endl;
return EXIT_FAILURE;
}
}
if (tests.size() > 0)
status = gdrcopy::testsuite::run_tests(print_summary, tests);
else
status = gdrcopy::testsuite::run_all_tests(print_summary);
if (status) {
cerr << "Error: Encountered an error or a test failure with status=" << status << endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/