/*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */

/* NOTE(review): the original header names were stripped from the include
 * list; this set is reconstructed from the symbols used below (printf,
 * malloc/free, memset, uint64_t, CUDA runtime). The NVSHMEM host/device
 * APIs and the benchmark globals (num_blocks, threads_per_block, iters,
 * warmup_iters, min_size, max_size, step_factor, bidirectional, B_TO_GB,
 * MS_TO_S, CUDA_CHECK, ...) are presumably provided via "utils.h" —
 * confirm against the perftest harness. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <cuda_runtime.h>

#include "utils.h"

/*
 * Bandwidth kernel: each block issues non-blocking block-scoped gets
 * (nvshmemx_double_get_nbi_block) for its slice of the buffer, `iter`
 * times, from the peer PE (peer = !pe, so exactly 2 PEs are assumed).
 *
 * Inter-block synchronization protocol (grid-wide, without cooperative
 * launch): counter_d[0] is an arrival counter bumped by thread 0 of every
 * block; the last block to arrive for iteration i (arrival index
 * gridDim.x*(i+1)-1) publishes progress by incrementing counter_d[1],
 * which all other blocks spin on. counter_d must be zeroed by the host
 * before each launch. After the loop, the same handshake runs once more
 * and the releasing block calls nvshmem_quiet() to complete all
 * outstanding nbi transfers before the kernel exits.
 *
 * Preconditions: len is divisible by gridDim.x (host launches with sizes
 * that are multiples of the block count); counter_d points to two zeroed
 * unsigned ints in device memory.
 */
__global__ void bw(double *data_d, volatile unsigned int *counter_d, int len, int pe, int iter) {
    int i, peer;
    unsigned int counter;
    int tid = (threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z);
    int bid = blockIdx.x;
    int nblocks = gridDim.x;

    peer = !pe;

    for (i = 0; i < iter; i++) {
        /* Symmetric heap: the same offset addresses both the local
         * destination and the remote source on the peer PE. */
        nvshmemx_double_get_nbi_block(data_d + (bid * (len / nblocks)),
                                      data_d + (bid * (len / nblocks)), len / nblocks, peer);

        /* synchronizing across blocks */
        __syncthreads();

        if (!tid) {
            __threadfence(); /* make this block's work visible before arrival */
            counter = atomicInc((unsigned int *)counter_d, UINT_MAX);
            if (counter == (gridDim.x * (i + 1) - 1)) {
                /* last block of this iteration releases everyone */
                *(counter_d + 1) += 1;
            }
            while (*(counter_d + 1) != i + 1)
                ;
        }
        __syncthreads();
    }

    /* Final grid-wide handshake: the last arriving block drains all
     * outstanding nbi gets with nvshmem_quiet() before release.
     * Here i == iter, so the release value is iter + 1. */
    __syncthreads();

    if (!tid) {
        __threadfence();
        counter = atomicInc((unsigned int *)counter_d, UINT_MAX);
        if (counter == (gridDim.x * (i + 1) - 1)) {
            nvshmem_quiet();
            *(counter_d + 1) += 1;
        }
        while (*(counter_d + 1) != i + 1)
            ;
    }
}

/*
 * Host driver: measures uni- or bidirectional nvshmem get bandwidth
 * between exactly two PEs, sweeping message sizes from min_size to
 * max_size by step_factor. For each size it runs a warmup launch, then a
 * timed launch bracketed by CUDA events, and converts elapsed time to
 * GB/s. In bidirectional mode, per-PE bandwidths are summed with
 * nvshmem_double_sum_reduce. PE 0 prints the result table.
 */
int main(int argc, char *argv[]) {
    int mype, npes;
    double *data_d = NULL;
    unsigned int *counter_d = NULL;

    read_args(argc, argv);

    int max_blocks = num_blocks, max_threads = threads_per_block;
    int array_size, i;
    void **h_tables = NULL; /* NULL so the early-exit path can skip free_tables */
    uint64_t *h_size_arr;
    double *h_bw = NULL, *h_bw_total = NULL;
    double *d_bw = NULL, *d_bw_sum = NULL;
    int iter = iters;
    int skip = warmup_iters;
    float milliseconds;
    cudaEvent_t start, stop;

    init_wrapper(&argc, &argv);

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();

    if (npes != 2) {
        fprintf(stderr, "This test requires exactly two processes \n");
        goto finalize;
    }

    data_d = (double *)nvshmem_malloc(max_size);
    CUDA_CHECK(cudaMemset(data_d, 0, max_size));

    array_size = max_size_log;
    alloc_tables(&h_tables, 2, array_size);
    h_size_arr = (uint64_t *)h_tables[0];
    h_bw = (double *)h_tables[1];

    if (bidirectional) {
        h_bw_total = (double *)malloc(sizeof(double) * array_size);
        if (!h_bw_total) {
            fprintf(stderr, "Error: Unable to malloc on the host.\n");
            exit(1);
        }
        memset(h_bw_total, 0, sizeof(double) * array_size);

        /* Allocate on GPU. */
        CUDA_CHECK(cudaMalloc((void **)&d_bw, sizeof(double)));
        CUDA_CHECK(cudaMalloc((void **)&d_bw_sum, sizeof(double)));
    }

    CUDA_CHECK(cudaMalloc((void **)&counter_d, sizeof(unsigned int) * 2));
    CUDA_CHECK(cudaMemset(counter_d, 0, sizeof(unsigned int) * 2));

    CUDA_CHECK(cudaDeviceSynchronize());

    /* PE 0 always drives; PE 1 also drives in bidirectional mode. */
    if (bidirectional || mype == 0) {
        i = 0;
        for (int size = min_size; size <= max_size; size *= step_factor) {
            h_size_arr[i] = size;

            /* Warmup run; counters must be zeroed before every launch. */
            CUDA_CHECK(cudaMemset(counter_d, 0, sizeof(unsigned int) * 2));
            bw<<<max_blocks, max_threads>>>(data_d, counter_d, size / sizeof(double), mype, skip);
            CUDA_CHECK(cudaDeviceSynchronize());

            /* Timed run. */
            CUDA_CHECK(cudaMemset(counter_d, 0, sizeof(unsigned int) * 2));
            CUDA_CHECK(cudaEventRecord(start));
            bw<<<max_blocks, max_threads>>>(data_d, counter_d, size / sizeof(double), mype, iter);
            CUDA_CHECK(cudaEventRecord(stop));
            CUDA_CHECK(cudaGetLastError());
            CUDA_CHECK(cudaEventSynchronize(stop));
            CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

            h_bw[i] = size / (milliseconds * (B_TO_GB / (iter * MS_TO_S)));
            nvshmem_barrier_all();

            /* Sum all h_bw of each PE for bidirectional mode. */
            if (bidirectional) {
                CUDA_CHECK(cudaMemcpy(d_bw, &h_bw[i], sizeof(double), cudaMemcpyHostToDevice));
                nvshmem_double_sum_reduce(NVSHMEM_TEAM_WORLD, d_bw_sum, d_bw, 1);
                CUDA_CHECK(
                    cudaMemcpy(&h_bw_total[i], d_bw_sum, sizeof(double), cudaMemcpyDeviceToHost));
            }
            i++;
        }
    } else {
        /* Passive PE in unidirectional mode: match the driver's barriers. */
        for (int size = min_size; size <= max_size; size *= step_factor) {
            nvshmem_barrier_all();
        }
    }

    if (mype == 0) {
        double *p_h_bw_tmp = bidirectional ? h_bw_total : h_bw;
        const char *const test_name = bidirectional ? "shmem_get_bw_bidi" : "shmem_get_bw_uni";
        print_table_basic(test_name, "None", "size (Bytes)", "BW", "GB/sec", '+', h_size_arr,
                          p_h_bw_tmp, i);
    }

finalize:
    if (data_d) nvshmem_free(data_d);
    if (h_bw_total) free(h_bw_total);
    if (d_bw) cudaFree(d_bw);
    if (d_bw_sum) cudaFree(d_bw_sum);
    if (counter_d) cudaFree(counter_d);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    if (h_tables) free_tables(h_tables, 2);
    finalize_wrapper();

    return 0;
}