/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

// NOTE(review): the header names of the nine #include directives below are
// missing from this copy of the file. They are kept exactly as found; restore
// them from the upstream source before building.
#include #include #include #include #include #include #include #include #include

using namespace std;

#include "gdrapi.h"
#include "common.hpp"

using namespace gdrcopy::test;

/**
 * GPU half of the ping-pong exchange.
 *
 * The kernel first stores 1 to *h_buf. Then, for every sequence number from 1
 * up to num_iters - 1, it spins until *d_buf holds that number, advances the
 * sequence number by one and stores the new value to *h_buf. With
 * num_iters <= 1 only the initial store is performed.
 *
 * @param d_buf      Flag polled by the kernel; it is written by the peer.
 * @param h_buf      Flag written by the kernel; it is polled by the peer.
 * @param num_iters  Sequence number at which the exchange stops.
 *
 * NOTE(review): h_buf is presumably host memory polled by the CPU, while
 * __threadfence_block() only orders accesses as observed by threads of the
 * same block. Confirm whether __threadfence_system() is what is intended.
 */
__global__ void pp_kernel(uint32_t *d_buf, uint32_t *h_buf, uint32_t num_iters)
{
    uint32_t seq = 1;

    // Tell the peer that the kernel is up and running.
    WRITE_ONCE(*h_buf, seq);
    __threadfence_block();

    while (seq < num_iters) {
        uint32_t seen;

        // Busy-wait for the peer's "ping" carrying the current sequence number.
        do {
            seen = READ_ONCE(*d_buf);
        } while (seen != seq);
        __threadfence_block();

        // Answer with the next sequence number ("pong").
        seq += 1;
        WRITE_ONCE(*h_buf, seq);
        __threadfence_block();
    }
}

// Command-line configurable settings; the initial values are the defaults
// reported by the usage text.
static int dev_id = 0;
static uint32_t num_iters = 1000;
static unsigned int timeout = 10; // in s

// Counter value before checking timeout.
static unsigned long int timeout_check_threshold = 1000000UL;
// Number of check_timeout() calls made since the clock was last sampled.
static unsigned long int timeout_counter = 0;

/**
 * Print the command-line help text to stdout.
 *
 * @param path  Program name shown in the usage line (main() passes argv[0]).
 */
static void print_usage(const char *path)
{
    // -d, -t, -u and -a all take an argument (see the getopt string in
    // main()), so the usage text names a placeholder for each of them.
    cout << "Usage: " << path << " [-h][-d <gpu>][-t <iters>][-u <timeout>][-a <fn>]" << endl;
    cout << endl;
    cout << "Options:" << endl;
    cout << " -h Print this help text" << endl;
    cout << " -d <gpu> GPU ID (default: " << dev_id << ")" << endl;
    cout << " -t <iters> Number of iterations (default: " << num_iters << ")" << endl;
    cout << " -u <timeout> Timeout in second. 0 to disable. (default: " << timeout << ")" << endl;
    cout << " -a <fn> GPU buffer allocation function (default: cuMemAlloc)" << endl;
    cout << " Choices: cuMemAlloc, cuMemCreate" << endl;
}

/**
 * Return time difference in us.
 *
 * @param start  Earlier timestamp.
 * @param end    Later timestamp.
 * @return end - start, in microseconds.
 */
static inline double time_diff(struct timespec start, struct timespec end)
{
    return (double)((end.tv_nsec - start.tv_nsec) / 1000.0 + (end.tv_sec - start.tv_sec) * 1000000.0);
}

/**
 * Abort the process when a busy-wait has been running for too long.
 *
 * Meant to be called from inside a polling loop. To keep the loop cheap, the
 * clock is only sampled once every timeout_check_threshold calls. When more
 * than timeout_us microseconds have elapsed since start, an error and the
 * status of cuStreamQuery(0) are written to stderr and abort() is called.
 *
 * @param start       Time at which the wait began.
 * @param timeout_us  Allowed duration in microseconds; a value that is not
 *                    greater than 0 disables the check.
 */
static inline void check_timeout(struct timespec start, double timeout_us)
{
    CUresult status;
    const char *cu_status_name = NULL;
    struct timespec now;
    double time_used_us;

    if (!(timeout_us > 0))
        return;

    ++timeout_counter;
    if (timeout_counter < timeout_check_threshold)
        return;

    clock_gettime(MYCLOCK, &now);
    time_used_us = time_diff(start, now);
    if (time_used_us > timeout_us) {
        cerr << "ERROR: TIMEOUT!!!" << endl;
        status = cuStreamQuery(0);
        // cuGetErrorName() leaves the name pointer NULL for codes it does not
        // recognize; never stream a NULL char pointer.
        if (cuGetErrorName(status, &cu_status_name) != CUDA_SUCCESS || cu_status_name == NULL)
            cu_status_name = "unknown";
        cerr << "cuStreamQuery(0) returned " << cu_status_name << endl;
        abort();
    }
    timeout_counter = 0;
}

/**
 * Measure the CPU <-> GPU ping-pong round-trip latency.
 *
 * Parses the command line, launches pp_kernel, exchanges num_iters sequence
 * numbers with it (CPU writes through gdr_copy_to_mapping(), kernel answers
 * through h_buf), prints the average latency per iteration and releases the
 * resources.
 *
 * @return 0 on success. Invalid options exit with EXIT_FAILURE; failed
 *         ASSERT* checks and timeouts are handled by the respective helpers.
 */
int main(int argc, char *argv[])
{
    uint32_t *d_buf = NULL;
    uint32_t *h_buf = NULL;
    CUdeviceptr d_buf_cuptr;
    CUdeviceptr h_buf_cuptr;
    gpu_mem_handle_t mhandle;
    struct timespec beg, end;
    double lat_us;
    double timeout_us;
    gpu_memalloc_fn_t galloc_fn = gpu_mem_alloc;
    gpu_memfree_fn_t gfree_fn = gpu_mem_free;

    while(1) {
        int c;
        c = getopt(argc, argv, "d:t:u:a:h");
        if (c == -1)
            break;

        switch (c) {
        case 'd':
            dev_id = strtol(optarg, NULL, 0);
            break;
        case 't':
            num_iters = strtol(optarg, NULL, 0);
            break;
        case 'u':
            timeout = strtol(optarg, NULL, 0);
            break;
        case 'a':
            if (strcmp(optarg, "cuMemAlloc") == 0) {
                galloc_fn = gpu_mem_alloc;
                gfree_fn = gpu_mem_free;
            }
            else if (strcmp(optarg, "cuMemCreate") == 0) {
                galloc_fn = gpu_vmm_alloc;
                gfree_fn = gpu_vmm_free;
            }
            else {
                cerr << "Unrecognized fn argument" << endl;
                exit(EXIT_FAILURE);
            }
            break;
        case 'h':
            print_usage(argv[0]);
            exit(EXIT_SUCCESS);
        default:
            printf("ERROR: invalid option\n");
            exit(EXIT_FAILURE);
        }
    }

    // The option is given in seconds; check_timeout() works in microseconds.
    timeout_us = timeout * 1000000.0;

    ASSERTDRV(cuInit(0));

    int n_devices = 0;
    ASSERTDRV(cuDeviceGetCount(&n_devices));

    CUdevice dev;
    // NOTE(review): this copy of the file is missing the text between the
    // device-enumeration loop header and the arguments of the pp_kernel
    // launch. That span presumably contains device/context setup, buffer
    // allocation, the gdr open/pin/map calls that define g, mh and map_d_ptr,
    // and the opening of the checked block closed by END_CHECK below. The
    // damaged fragment is kept exactly as found; restore it from the upstream
    // source before building.
    for (int n=0; n>>((uint32_t *)d_buf_cuptr, (uint32_t *)h_buf_cuptr, num_iters);

        // Catching any potential errors. CUDA_ERROR_NOT_READY means pp_kernel
        // is running. We expect to see this status instead of CUDA_SUCCESS
        // because pp_kernel must wait for signal from CPU, which occurs after
        // this line.
        ASSERT_EQ(cuStreamQuery(0), CUDA_ERROR_NOT_READY);

        uint32_t i = 1;

        // Wait for pp_kernel to be ready before starting the time measurement.
        clock_gettime(MYCLOCK, &beg);
        while (READ_ONCE(*h_buf) != i) {
            check_timeout(beg, timeout_us);
        }
        LB();

        // Restart the timer for measurement.
        clock_gettime(MYCLOCK, &beg);
        while (i < num_iters) {
            // Send exactly one uint32_t. The size must be that of the value,
            // not of the pointer: sizeof(d_buf) is the pointer size and would
            // read past the end of i on 64-bit targets.
            gdr_copy_to_mapping(mh, d_buf, &i, sizeof(*d_buf));
            SB();
            ++i;
            // Spin until the kernel acknowledges with the next sequence number.
            while (READ_ONCE(*h_buf) != i) {
                check_timeout(beg, timeout_us);
            }
            LB();
        }

        // The end timestamp is taken once, after the kernel has been
        // synchronized (an earlier sample taken before the synchronization
        // was overwritten here and has been dropped).
        ASSERTDRV(cuStreamSynchronize(0));
        clock_gettime(MYCLOCK, &end);

        lat_us = time_diff(beg, end) / (double)num_iters;

        cout << "Round-trip latency per iteration is " << lat_us << " us" << endl;

        cout << "unmapping buffer" << endl;
        ASSERT_EQ(gdr_unmap(g, mh, map_d_ptr, sizeof(*d_buf)), 0);

        cout << "unpinning buffer" << endl;
        ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);
    } END_CHECK;

    cout << "closing gdrdrv" << endl;
    ASSERT_EQ(gdr_close(g), 0);

    ASSERTDRV(cuMemFreeHost(h_buf));
    ASSERTDRV(gfree_fn(&mhandle));

    return 0;
}

/*
 * Local variables:
 * c-indent-level: 4
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */