/*
 * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */

/*
 * Stream latency benchmark (2 PEs): for each power-of-two message size it
 * interleaves a spin kernel with an NVSHMEM transfer and reports per-iteration
 * latency for (a) the stream-ordered nvshmemx_*_on_stream call and (b) the
 * host-blocking nvshmem_* call.
 *
 * NOTE(review): the original #include header names and the kernel launch
 * configurations were stripped from this file (angle brackets lost); both
 * have been reconstructed from the symbols used below — confirm against the
 * upstream NVSHMEM perftest source.
 */

#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cmath>
#include <cuda.h>
#include <cuda_runtime.h>
#include "utils.h"

#define DEFAULT_ITERS 10
#define DEFAULT_MIN_MSG_SIZE 1
#define DEFAULT_MAX_MSG_SIZE (128 * 1024 * 1024)

/* Direction of the transfer relative to the measuring PE (PE 0):
 * PUSH uses putmem toward the peer, PULL uses getmem from the peer. */
typedef enum { PUSH = 0, PULL = 1 } putget_dir_t;

/* Busy-spin for at least `ncycles` device clock cycles, then store the
 * observed cycle count into *data_d_local so the loop has an observable
 * side effect and cannot be optimized away. */
__global__ void test_kernel(void *data_d_local, long long int ncycles) {
    long long int sclk = clock64();
    long long int cyc = 0;
    while (cyc < ncycles) {
        cyc = clock64() - sclk;
    }
    *(long long int *)data_d_local = cyc;
}

/*
 * Run `iter` rounds of (spin kernel + transfer of `sizeBytes`) in two styles
 * and return the elapsed times via CUDA events:
 *   *ms1 — stream-ordered variant (nvshmemx_putmem/getmem_on_stream)
 *   *ms2 — host-blocking variant (nvshmem_putmem/getmem)
 * `ng`/`nb` are the grid/block dimensions for test_kernel; `sev`/`eev` are
 * caller-owned start/end events recorded on `strm`.
 * Returns 0 (CUDA_CHECK aborts on error).
 */
int lat(void *data_d, void *data_d_local, int sizeBytes, int pe, int iter, putget_dir_t dir,
        cudaStream_t strm, cudaEvent_t sev, cudaEvent_t eev, float *ms1, float *ms2, int ng,
        int nb, long long int ncycles) {
    int status = 0;
    int peer = !pe; /* exactly two PEs, so the peer is the other one */

    if (dir == PUSH) {
        /* Stream-ordered put: kernel and put are both enqueued on strm, so
         * stream order alone guarantees the put sees the kernel's output. */
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            test_kernel<<<ng, nb, 0, strm>>>(data_d_local, ncycles);
            nvshmemx_putmem_on_stream((void *)data_d, (void *)data_d_local, sizeBytes, peer,
                                      strm);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms1, sev, eev));

        /* Host-blocking put: the stream must be drained before each put so
         * the host-side transfer reads a fully written source buffer. */
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            test_kernel<<<ng, nb, 0, strm>>>(data_d_local, ncycles);
            CUDA_CHECK(cudaStreamSynchronize(strm));
            nvshmem_putmem((void *)data_d, (void *)data_d_local, sizeBytes, peer);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms2, sev, eev));
    } else {
        /* Stream-ordered get, then the kernel consumes/overwrites the
         * destination buffer in stream order. */
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            nvshmemx_getmem_on_stream((void *)data_d_local, (void *)data_d, sizeBytes, peer,
                                      strm);
            test_kernel<<<ng, nb, 0, strm>>>(data_d_local, ncycles);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms1, sev, eev));

        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            nvshmem_getmem((void *)data_d_local, (void *)data_d, sizeBytes,
                           peer); // shmem_getmem is blocking, so nvshmem_quiet is not needed
            /* NOTE(review): no stream sync between the kernel of iteration i
             * and the getmem of iteration i+1 — both touch data_d_local; the
             * original code measured it this way, so behavior is preserved. */
            test_kernel<<<ng, nb, 0, strm>>>(data_d_local, ncycles);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms2, sev, eev));
    }

    return status;
}

int main(int argc, char *argv[]) {
    int status = 0;
    int mype, npes;
    char *data_d = NULL, *data_d_local = NULL;
    uint64_t *size_array = NULL;
    double *offs_latency_array = NULL;
    double *ons_latency_array = NULL;
    int num_entries;
    int i;
    putget_dir_t dir = PUSH;
    int iter = DEFAULT_ITERS;
    int min_msg_size = DEFAULT_MIN_MSG_SIZE;
    int max_msg_size = DEFAULT_MAX_MSG_SIZE;
    int nb = 1, nt = 32;        /* grid blocks / threads per block for test_kernel */
    long long int ncycles = 1;  /* spin duration of test_kernel in clock cycles */
    /* Declared (and zeroed) before any `goto finalize` so the cleanup label
     * never destroys an uninitialized stream (the original declared it after
     * the early gotos, leaving garbage at `finalize`). */
    cudaStream_t strm = 0;

    init_wrapper(&argc, &argv);

    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();
    if (npes != 2) {
        fprintf(stderr, "This test requires exactly two processes \n");
        status = -1;
        goto finalize;
    }

    while (1) {
        int c;
        /* NOTE(review): "i:" is accepted here but has no case below, so -i
         * falls through to the help text — preserved as-is. */
        c = getopt(argc, argv, "s:S:n:i:d:b:t:c:h");
        if (c == -1) break;

        switch (c) {
            case 's':
                min_msg_size = strtol(optarg, NULL, 0);
                break;
            case 'S':
                max_msg_size = strtol(optarg, NULL, 0);
                break;
            case 'n':
                iter = strtol(optarg, NULL, 0);
                break;
            case 'd':
                dir = (putget_dir_t)strtol(optarg, NULL, 0);
                break;
            case 'b':
                nb = strtol(optarg, NULL, 0);
                break;
            case 't':
                nt = strtol(optarg, NULL, 0);
                break;
            case 'c':
                ncycles = strtol(optarg, NULL, 0);
                break;
            default:
            case 'h':
                printf(
                    "-n [Iterations] -S [Max message size] -s [Min message size] -i [Put/Get issue type : ON_STREAM(0) otherwise 1] -d [Direction of copy : PUSH(0) or PULL(1)] -b [# blocks] \
-t [# threads] -c [# cycles to wait in the the kernel]\n");
                goto finalize;
        }
    }

    /* One table entry per power-of-two size in [min_msg_size, max_msg_size]. */
    num_entries =
        floor(std::log2((float)max_msg_size)) - floor(std::log2((float)min_msg_size)) + 1;
    size_array = (uint64_t *)calloc(sizeof(uint64_t), num_entries);
    if (!size_array) {
        status = -1;
        goto finalize;
    }

    offs_latency_array = (double *)calloc(sizeof(double), num_entries);
    if (!offs_latency_array) {
        status = -1;
        goto finalize;
    }

    ons_latency_array = (double *)calloc(sizeof(double), num_entries);
    if (!ons_latency_array) {
        status = -1;
        goto finalize;
    }

    data_d = (char *)nvshmem_malloc(max_msg_size);
    CUDA_CHECK(cudaMemset(data_d, 0, max_msg_size));

    data_d_local = (char *)nvshmem_malloc(max_msg_size);
    /* Bug fix: the original memset data_d a second time here, leaving
     * data_d_local uninitialized. */
    CUDA_CHECK(cudaMemset(data_d_local, 0, max_msg_size));

    CUDA_CHECK(cudaStreamCreateWithFlags(&strm, cudaStreamNonBlocking));
    CUDA_CHECK(cudaDeviceSynchronize());

    if (mype == 0) {
        float ms1, ms2;
        cudaEvent_t sev, eev;
        CUDA_CHECK(cudaEventCreate(&sev));
        CUDA_CHECK(cudaEventCreate(&eev));

        i = 0;
        for (int size = min_msg_size; size <= max_msg_size; size *= 2) {
            size_array[i] = size;

            lat(data_d, data_d_local, size, mype, iter, dir, strm, sev, eev, &ms1, &ms2, nb, nt,
                ncycles);

            /* ms -> us per iteration */
            ons_latency_array[i] = ms1 / iter * 1000;
            offs_latency_array[i] = ms2 / iter * 1000;
            i++;
        }

        print_table_basic("Stream_Latency", "with _on_stream", "size (Bytes)", "latency", "us",
                          '-', size_array, ons_latency_array, i);
        print_table_basic("Stream_Latency", "without _on_stream", "size (Bytes)", "latency", "us",
                          '-', size_array, offs_latency_array, i);

        CUDA_CHECK(cudaEventDestroy(sev));
        CUDA_CHECK(cudaEventDestroy(eev));
        nvshmem_barrier_all();
    } else {
        /* PE 1 is the passive target; just meet PE 0 at the barrier. */
        nvshmem_barrier_all();
    }

finalize:
    if (strm) CUDA_CHECK(cudaStreamDestroy(strm));
    if (data_d) nvshmem_free(data_d);
    if (size_array) free(size_array);
    if (ons_latency_array) free(ons_latency_array);
    if (offs_latency_array) free(offs_latency_array);
    if (data_d_local) nvshmem_free(data_d_local);

    finalize_wrapper();

    return status;
}