/*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */

#ifndef COLL_TEST_H
#define COLL_TEST_H

/* Standard, CUDA, and NVSHMEM headers used by the macros in this file. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "utils.h"
#include <nvshmem.h>
#include <nvshmemx.h>

#define MAX_SKIP 16
#define MAX_ITERS 128
#define MAX_NPES 128
#define BARRIER_MAX_ITERS 1000
#define FCOLLECT_MAX_ITERS 1024
#define BARRIER_MAX_SKIP 10

extern int coll_max_iters;

/* Source/destination buffer sizes in bytes for each collective. alltoall and
 * the fcollect destination scale with the number of PEs; the broadcast
 * buffers and the fcollect source hold num_elems elements regardless of npes. */
#define alltoall_src_size(DATATYPE, num_elems, npes) (sizeof(DATATYPE) * num_elems * npes)
#define alltoall_dest_size(DATATYPE, num_elems, npes) (sizeof(DATATYPE) * num_elems * npes)
#define fcollect_src_size(DATATYPE, num_elems, npes) (sizeof(DATATYPE) * num_elems)
#define fcollect_dest_size(DATATYPE, num_elems, npes) (sizeof(DATATYPE) * num_elems * npes)
#define broadcast_src_size(DATATYPE, num_elems, npes) (sizeof(DATATYPE) * num_elems)
#define broadcast_dest_size(DATATYPE, num_elems, npes) (sizeof(DATATYPE) * num_elems)

/* Thin wrappers that give every collective an identical argument list so the
 * RUN_* macros below can token-paste the collective name. Arguments a given
 * collective does not take (e.g. root for fcollect and alltoall) are ignored. */
#define call_shmem_broadcast(TYPENAME, TYPE, team, d_dest, d_source, num_elems, root) \
    do {                                                                              \
        nvshmem_##TYPENAME##_broadcast(team, d_dest, d_source, num_elems, root);      \
    } while (0)

#define call_shmem_fcollect(TYPENAME, TYPE, team, d_dest, d_source, num_elems, root) \
    do {                                                                             \
        nvshmem_##TYPENAME##_fcollect(team, d_dest, d_source, num_elems);            \
    } while (0)

#define call_shmem_alltoall(TYPENAME, TYPE, team, d_dest, d_source, num_elems, root) \
    do {                                                                             \
        nvshmem_##TYPENAME##_alltoall(team, d_dest, d_source, num_elems);            \
    } while (0)

#define call_shmem_broadcast_on_stream(TYPENAME, TYPE, team, d_dest, d_source, num_elems, root, \
                                       stream)                                                  \
    do {                                                                                        \
        nvshmemx_##TYPENAME##_broadcast_on_stream(team, d_dest, d_source, num_elems, root,      \
                                                  stream);                                      \
    } while (0)

#define call_shmem_fcollect_on_stream(TYPENAME, TYPE, team, d_dest, d_source, num_elems, root, \
                                      stream)                                                  \
    do {                                                                                       \
        nvshmemx_##TYPENAME##_fcollect_on_stream(team, d_dest, d_source, num_elems, stream);   \
    } while (0)

#define call_shmem_alltoall_on_stream(TYPENAME, TYPE, team, d_dest, d_source, num_elems, root, \
                                      stream)                                                  \
    do {                                                                                       \
        nvshmemx_##TYPENAME##_alltoall_on_stream(team, d_dest, d_source, num_elems, stream);   \
    } while (0)
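/*
 * Illustrative expansion (a sketch, not part of the test suite): with
 * TYPENAME = int32 and TYPE = int32_t,
 *
 *     call_shmem_broadcast(int32, int32_t, NVSHMEM_TEAM_WORLD, d_dest, d_source, nelems, 0);
 *
 * expands to
 *
 *     do { nvshmem_int32_broadcast(NVSHMEM_TEAM_WORLD, d_dest, d_source, nelems, 0); } while (0);
 */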
/* Sweep message sizes from min_size to max_size (multiplying by step_factor)
 * and time the on-stream collective with CUDA events. Each timed iteration's
 * latency is stored in latency_array[array_index] in microseconds, and the
 * corresponding message size in bytes in size_array[array_index]. min_size,
 * max_size, step_factor, iters, and warmup_iters are read from the caller's
 * scope; COLL, h_source, and h_dest are accepted for uniformity but unused. */
#define RUN_COLL_ON_STREAM(coll, COLL, TYPENAME, TYPE, d_source, h_source, d_dest, h_dest, npes, \
                           root, stream, size_array, latency_array)                              \
    do {                                                                                         \
        int array_index = 0;                                                                     \
        size_t min_elems, max_elems;                                                             \
        if (strcmp(#coll, "broadcast") == 0) {                                                   \
            min_elems = max(static_cast<size_t>(1), min_size / sizeof(TYPE));                    \
            max_elems = max(static_cast<size_t>(1), max_size / sizeof(TYPE));                    \
        } else {                                                                                 \
            min_elems = max(static_cast<size_t>(1), min_size / (npes * sizeof(TYPE)));           \
            max_elems = max(static_cast<size_t>(1), max_size / (npes * sizeof(TYPE)));           \
        }                                                                                        \
        for (size_t num_elems = min_elems; num_elems <= max_elems; num_elems *= step_factor) {   \
            float latency = 0;                                                                   \
            cudaEvent_t t_start, t_stop;                                                         \
            CUDA_CHECK(cudaEventCreate(&t_start));                                               \
            CUDA_CHECK(cudaEventCreate(&t_stop));                                                \
            int latency_iters = 0;                                                               \
            auto lat_idx_array = latency_array[array_index];                                     \
            nvshmem_barrier_all();                                                               \
            for (int iter = 0; iter < warmup_iters; iter++) {                                    \
                call_shmem_##coll##_on_stream(TYPENAME, TYPE, NVSHMEM_TEAM_WORLD, d_dest,        \
                                              d_source, num_elems, root, stream);                \
            }                                                                                    \
            CUDA_CHECK(cudaStreamSynchronize(stream));                                           \
            nvshmem_barrier_all();                                                               \
            for (int iter = 0; iter < iters; iter++) {                                           \
                CUDA_CHECK(cudaEventRecord(t_start, stream));                                    \
                call_shmem_##coll##_on_stream(TYPENAME, TYPE, NVSHMEM_TEAM_WORLD, d_dest,        \
                                              d_source, num_elems, root, stream);                \
                CUDA_CHECK(cudaEventRecord(t_stop, stream));                                     \
                CUDA_CHECK(cudaStreamSynchronize(stream));                                       \
                CUDA_CHECK(cudaEventElapsedTime(&latency, t_start, t_stop));                     \
                lat_idx_array[latency_iters] = latency * 1e+3; /* ms -> us */                    \
                latency_iters++;                                                                 \
            }                                                                                    \
            CUDA_CHECK(cudaEventDestroy(t_start));                                               \
            CUDA_CHECK(cudaEventDestroy(t_stop));                                                \
            nvshmem_barrier_all();                                                               \
            if (strcmp(#coll, "alltoall") == 0 || strcmp(#coll, "fcollect") == 0)                \
                size_array[array_index] = num_elems * sizeof(TYPE) * npes;                       \
            else                                                                                 \
                size_array[array_index] = num_elems * sizeof(TYPE);                              \
            array_index++;                                                                       \
        }                                                                                        \
    } while (0)

/* Same sweep for on-stream reductions. Warmup and timed iterations share one
 * loop; timing starts once iter reaches warmup_iters. Unlike
 * RUN_COLL_ON_STREAM, min_elems, max_elems, and npes must be declared in the
 * caller's scope. */
#define RUN_RDXN(coll, TYPENAME, TYPE, OP, team, d_source, d_dest, size_array, latency_array, \
                 stream)                                                                      \
    do {                                                                                      \
        cudaEvent_t start_event, stop_event;                                                  \
        CUDA_CHECK(cudaEventCreate(&start_event));                                            \
        CUDA_CHECK(cudaEventCreate(&stop_event));                                             \
        float ms = 0.0f;                                                                      \
        if (strcmp(#coll, "reduce") == 0) {                                                   \
            min_elems = max(static_cast<size_t>(1), min_size / sizeof(TYPE));                 \
            max_elems = max(static_cast<size_t>(1), max_size / sizeof(TYPE));                 \
        } else {                                                                              \
            min_elems = max(static_cast<size_t>(1), min_size / (npes * sizeof(TYPE)));        \
            max_elems = max(static_cast<size_t>(1), max_size / (npes * sizeof(TYPE)));        \
        }                                                                                     \
        int idx = 0;                                                                          \
        for (size_t num_elems = min_elems; num_elems <= max_elems; num_elems *= step_factor) {\
            nvshmemx_barrier_all_on_stream(stream);                                           \
            for (int iter = 0; iter < iters + warmup_iters; iter++) {                         \
                if (iter >= warmup_iters) CUDA_CHECK(cudaEventRecord(start_event, stream));   \
                nvshmemx_##TYPENAME##_##OP##_##coll##_on_stream(                              \
                    team, (TYPE *)d_dest, (const TYPE *)d_source, num_elems, stream);         \
                if (iter >= warmup_iters) {                                                   \
                    CUDA_CHECK(cudaEventRecord(stop_event, stream));                          \
                    CUDA_CHECK(cudaStreamSynchronize(stream));                                \
                    CUDA_CHECK(cudaEventElapsedTime(&ms, start_event, stop_event));           \
                    latency_array[idx][iter - warmup_iters] = ms * 1e+3; /* ms -> us */       \
                }                                                                             \
            }                                                                                 \
            if (strcmp(#coll, "reduce") == 0)                                                 \
                size_array[idx] = num_elems * sizeof(TYPE);                                   \
            else                                                                              \
                size_array[idx] = num_elems * npes * sizeof(TYPE);                            \
            idx++;                                                                            \
        }                                                                                     \
        CUDA_CHECK(cudaEventDestroy(start_event));                                            \
        CUDA_CHECK(cudaEventDestroy(stop_event));                                             \
    } while (0)
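/*
 * Usage sketch (hypothetical driver, not part of this header). It assumes the
 * scope-captured globals named above (min_size, max_size, step_factor, iters,
 * warmup_iters) and the CUDA_CHECK error-checking macro expected from utils.h;
 * the buffer sizes and latency_array's element type are illustrative.
 *
 *     size_t size_array[MAX_ITERS];
 *     double *latency_array[MAX_ITERS];  // one buffer of `iters` samples per message size
 *     size_t min_elems, max_elems;       // consumed by RUN_RDXN
 *     cudaStream_t stream;
 *     CUDA_CHECK(cudaStreamCreate(&stream));
 *
 *     int npes = nvshmem_n_pes();
 *     int32_t *d_source = (int32_t *)nvshmem_malloc(max_size);
 *     int32_t *d_dest = (int32_t *)nvshmem_malloc(max_size * npes);
 *
 *     // The COLL tag and host buffers are accepted but unused by the macro body.
 *     RUN_COLL_ON_STREAM(broadcast, BROADCAST, int32, int32_t, d_source, NULL, d_dest, NULL,
 *                        npes, 0, stream, size_array, latency_array);
 *     RUN_RDXN(reduce, int32, int32_t, sum, NVSHMEM_TEAM_WORLD, d_source, d_dest,
 *              size_array, latency_array, stream);
 */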
/* Dispatch a reduction benchmark over every operator valid for bitwise
 * (integer) datatypes. reduce_op.type is read from the caller's scope; the
 * num_elems argument is accepted for uniformity but unused, since RUN_RDXN
 * computes its own element-count range. */
#define RUN_RDXN_BITWISE_DATATYPE(coll, TYPENAME, TYPE, team, d_source, d_dest, num_elems, stream, \
                                  size_array, latency_array)                                       \
    switch (reduce_op.type) {                                                                      \
        case NVSHMEM_SUM:                                                                          \
            RUN_RDXN(coll, TYPENAME, TYPE, sum, team, d_source, d_dest, size_array,                \
                     latency_array, stream);                                                       \
            break;                                                                                 \
        case NVSHMEM_MIN:                                                                          \
            RUN_RDXN(coll, TYPENAME, TYPE, min, team, d_source, d_dest, size_array,                \
                     latency_array, stream);                                                       \
            break;                                                                                 \
        case NVSHMEM_MAX:                                                                          \
            RUN_RDXN(coll, TYPENAME, TYPE, max, team, d_source, d_dest, size_array,                \
                     latency_array, stream);                                                       \
            break;                                                                                 \
        case NVSHMEM_PROD:                                                                         \
            RUN_RDXN(coll, TYPENAME, TYPE, prod, team, d_source, d_dest, size_array,               \
                     latency_array, stream);                                                       \
            break;                                                                                 \
        case NVSHMEM_AND:                                                                          \
            RUN_RDXN(coll, TYPENAME, TYPE, and, team, d_source, d_dest, size_array,                \
                     latency_array, stream);                                                       \
            break;                                                                                 \
        case NVSHMEM_OR:                                                                           \
            RUN_RDXN(coll, TYPENAME, TYPE, or, team, d_source, d_dest, size_array,                 \
                     latency_array, stream);                                                       \
            break;                                                                                 \
        case NVSHMEM_XOR:                                                                          \
            RUN_RDXN(coll, TYPENAME, TYPE, xor, team, d_source, d_dest, size_array,                \
                     latency_array, stream);                                                       \
            break;                                                                                 \
    }

/* Same dispatch restricted to the operators valid for all (including
 * floating-point) datatypes. */
#define RUN_RDXN_DATATYPE(coll, TYPENAME, TYPE, team, d_source, d_dest, num_elems, stream, \
                          size_array, latency_array)                                       \
    switch (reduce_op.type) {                                                              \
        case NVSHMEM_SUM:                                                                  \
            RUN_RDXN(coll, TYPENAME, TYPE, sum, team, d_source, d_dest, size_array,        \
                     latency_array, stream);                                               \
            break;                                                                         \
        case NVSHMEM_MIN:                                                                  \
            RUN_RDXN(coll, TYPENAME, TYPE, min, team, d_source, d_dest, size_array,        \
                     latency_array, stream);                                               \
            break;                                                                         \
        case NVSHMEM_MAX:                                                                  \
            RUN_RDXN(coll, TYPENAME, TYPE, max, team, d_source, d_dest, size_array,        \
                     latency_array, stream);                                               \
            break;                                                                         \
        case NVSHMEM_PROD:                                                                 \
            RUN_RDXN(coll, TYPENAME, TYPE, prod, team, d_source, d_dest, size_array,       \
                     latency_array, stream);                                               \
            break;                                                                         \
    }

#endif /* COLL_TEST_H */