/*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */

#define CUMODULE_NAME "reduction_latency.cubin"

#include "utils.h"
#include "coll_test.h"

#define LARGEST_DT int64_t

#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
extern "C" {
#endif

/*
 * CALL_RDXN(TG_PRE, TG, TYPENAME, TYPE, OP, THREAD_COMP, ELEM_COMP)
 *
 * Emits two definitions for one (thread group, data type, operation) combination:
 *
 *   call_test_<TYPENAME>_<OP>_reduce_kern<TG>_cubin()
 *       Host launcher used on the cubin path. Resolves the kernel by name through
 *       init_test_case_kernel() and starts it with cuLaunchCooperativeKernel() on a
 *       (num_blocks x 1 x 1) grid of (num_tpb x 1 x 1) blocks with no dynamic shared memory.
 *
 *   test_<TYPENAME>_<OP>_reduce_kern<TG>()
 *       Kernel. Threads of block 0 whose threadIdx.x is below THREAD_COMP call
 *       nvshmem<TG_PRE>_<TYPENAME>_<OP>_reduce<TG>() `iter` times, but only when
 *       nelems < ELEM_COMP; all other threads do nothing.
 *
 * NOTE(review): the kernel takes `int nelems` while the host argument lists in RUN_ITERS_OP
 * pass the address of a size_t. That only reads the intended value on little-endian hosts
 * and for counts that fit in an int - confirm before changing either side.
 */
#define CALL_RDXN(TG_PRE, TG, TYPENAME, TYPE, OP, THREAD_COMP, ELEM_COMP) \
 \
    void call_test_##TYPENAME##_##OP##_reduce_kern##TG##_cubin( \
        int num_blocks, int num_tpb, cudaStream_t stream, void **arglist) { \
        CUfunction kern_fn; \
 \
        init_test_case_kernel(&kern_fn, \
                              NVSHMEMI_TEST_STRINGIFY(test_##TYPENAME##_##OP##_reduce_kern##TG)); \
        CU_CHECK(cuLaunchCooperativeKernel(kern_fn, num_blocks, 1, 1, num_tpb, 1, 1, 0, stream, \
                                           arglist)); \
    } \
 \
    __global__ void test_##TYPENAME##_##OP##_reduce_kern##TG( \
        nvshmem_team_t team, TYPE *dest, const TYPE *source, int nelems, int iter) { \
        if (!blockIdx.x && (threadIdx.x < THREAD_COMP) && (nelems < ELEM_COMP)) { \
            for (int i = 0; i < iter; i++) { \
                nvshmem##TG_PRE##_##TYPENAME##_##OP##_reduce##TG(team, dest, source, nelems); \
            } \
        } \
    }

/*
 * Launches one reduction kernel, either through the cubin launcher generated by CALL_RDXN
 * (when use_cubin is set) or through nvshmemx_collective_launch(). Expects an int `status`
 * in the enclosing scope; terminates the process if the collective launch fails.
 */
#define CALL_RDXN_KERNEL(TYPENAME, OP, TG, BLOCKS, THREADS, ARG_LIST, STREAM) \
    do { \
        if (use_cubin) { \
            call_test_##TYPENAME##_##OP##_reduce_kern##TG##_cubin(BLOCKS, THREADS, STREAM, \
                                                                   ARG_LIST); \
        } else { \
            status = nvshmemx_collective_launch( \
                (const void *)test_##TYPENAME##_##OP##_reduce_kern##TG, BLOCKS, THREADS, \
                ARG_LIST, 0, STREAM); \
            if (status != NVSHMEMX_SUCCESS) { \
                fprintf(stderr, "nvshmemx_collective_launch failed %d \n", status); \
                exit(-1); \
            } \
        } \
    } while (0)

/* Instantiates CALL_RDXN for every supported operation of one thread group and data type. */
#define CALL_RDXN_ALL_OPS(TG_PRE, TG, TYPENAME, TYPE, THREAD_COMP, ELEM_COMP) \
    CALL_RDXN(TG_PRE, TG, TYPENAME, TYPE, sum, THREAD_COMP, ELEM_COMP) \
    CALL_RDXN(TG_PRE, TG, TYPENAME, TYPE, prod, THREAD_COMP, ELEM_COMP) \
    CALL_RDXN(TG_PRE, TG, TYPENAME, TYPE, and, THREAD_COMP, ELEM_COMP) \
    CALL_RDXN(TG_PRE, TG, TYPENAME, TYPE, or, THREAD_COMP, ELEM_COMP) \
    CALL_RDXN(TG_PRE, TG, TYPENAME, TYPE, xor, THREAD_COMP, ELEM_COMP) \
    CALL_RDXN(TG_PRE, TG, TYPENAME, TYPE, min, THREAD_COMP, ELEM_COMP) \
    CALL_RDXN(TG_PRE, TG, TYPENAME, TYPE, max, THREAD_COMP, ELEM_COMP)

/*
 * Instantiates the block, warp and thread variants for one data type:
 *   block  - every thread participates, no element limit (INT_MAX / INT_MAX)
 *   warp   - threads below warpSize participate, fewer than 4096 elements
 *   thread - thread 0 only, fewer than 512 elements
 */
#define CALL_RDXN_OPS_ALL_TG(TYPENAME, TYPE) \
    CALL_RDXN_ALL_OPS(x, _block, TYPENAME, TYPE, INT_MAX, INT_MAX) \
    CALL_RDXN_ALL_OPS(x, _warp, TYPENAME, TYPE, warpSize, 4096) \
    CALL_RDXN_ALL_OPS(, , TYPENAME, TYPE, 1, 512)

CALL_RDXN_OPS_ALL_TG(int32, int32_t)
CALL_RDXN_OPS_ALL_TG(int64, int64_t)

#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
}
#endif

/*
 * Fills size_arr[] with the byte size of every element count from min_elems to max_elems
 * (multiplying by step_factor each step). Counts that are not below ELEM_COMP get a size of 0.
 * Uses j, num_elems, min_elems, max_elems and size_arr from the enclosing scope.
 */
#define SET_SIZE_ARR(TYPE, ELEM_COMP) \
    do { \
        j = 0; \
        for (num_elems = min_elems; num_elems <= max_elems; num_elems *= step_factor) { \
            if (num_elems < ELEM_COMP) { \
                size_arr[j] = num_elems * sizeof(TYPE); \
            } else { \
                size_arr[j] = 0; \
            } \
            j++; \
        } \
    } while (0)

/*
 * Measures one reduction operation over the sweep of element counts and stores the average
 * per-iteration latency (in microseconds) in h_<OP>_lat[] on PE 0.
 *
 * For each element count the kernel is first launched with `skip` iterations (warm-up), then
 * launched again with `iter` iterations between two CUDA events recorded on `stream`.
 *
 * The sweep is bounded by both max_elems and ELEM_COMP and advances by step_factor, so that
 *   - nelems never exceeds what the max_size-byte buffers can hold, and
 *   - entry j of the latency table corresponds to entry j of size_arr[] (see SET_SIZE_ARR).
 *
 * On exit j holds the number of measured sizes. The timing events are destroyed before
 * leaving so repeated invocations do not leak them.
 */
#define RUN_ITERS_OP(TYPENAME, TYPE, GROUP, OP, ELEM_COMP) \
    do { \
        void *skip_arg_list[] = {&team, &dest, &source, &num_elems, &skip}; \
        void *time_arg_list[] = {&team, &dest, &source, &num_elems, &iter}; \
        float milliseconds; \
        cudaEvent_t start, stop; \
        CUDA_CHECK(cudaEventCreate(&start)); \
        CUDA_CHECK(cudaEventCreate(&stop)); \
        SET_SIZE_ARR(TYPE, ELEM_COMP); \
 \
        nvshmem_barrier_all(); \
        j = 0; \
        for (num_elems = min_elems; num_elems <= max_elems && num_elems < ELEM_COMP; \
             num_elems *= step_factor) { \
            CALL_RDXN_KERNEL(TYPENAME, OP, GROUP, num_blocks, nvshm_test_num_tpb, \
                             skip_arg_list, stream); \
            CUDA_CHECK(cudaStreamSynchronize(stream)); \
            nvshmem_barrier_all(); \
 \
            CUDA_CHECK(cudaEventRecord(start, stream)); \
            CALL_RDXN_KERNEL(TYPENAME, OP, GROUP, num_blocks, nvshm_test_num_tpb, \
                             time_arg_list, stream); \
            CUDA_CHECK(cudaEventRecord(stop, stream)); \
            CUDA_CHECK(cudaStreamSynchronize(stream)); \
 \
            if (!mype) { \
                CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop)); \
                h_##OP##_lat[j] = (milliseconds * 1000.0) / (float)iter; \
            } \
            nvshmem_barrier_all(); \
            j++; \
        } \
        CUDA_CHECK(cudaEventDestroy(start)); \
        CUDA_CHECK(cudaEventDestroy(stop)); \
    } while (0)

/* Runs RUN_ITERS_OP for every supported reduction operation. */
#define RUN_ITERS(TYPENAME, TYPE, GROUP, ELEM_COMP) \
    RUN_ITERS_OP(TYPENAME, TYPE, GROUP, sum, ELEM_COMP); \
    RUN_ITERS_OP(TYPENAME, TYPE, GROUP, prod, ELEM_COMP); \
    RUN_ITERS_OP(TYPENAME, TYPE, GROUP, and, ELEM_COMP); \
    RUN_ITERS_OP(TYPENAME, TYPE, GROUP, or, ELEM_COMP); \
    RUN_ITERS_OP(TYPENAME, TYPE, GROUP, xor, ELEM_COMP); \
    RUN_ITERS_OP(TYPENAME, TYPE, GROUP, min, ELEM_COMP); \
    RUN_ITERS_OP(TYPENAME, TYPE, GROUP, max, ELEM_COMP);

/* Prints one latency table of j rows; LABEL is the test-name column. */
#define PRINT_RDXN_TABLE(LABEL, LAT) \
    print_table_v1("device_reduction", LABEL, "size (Bytes)", "latency", "us", '-', size_arr, \
                   LAT, j)

/*
 * Prints the seven per-operation tables on PE 0. Labels are "<TYPE_STR>-<op>-<TG_TAG>",
 * assembled by string-literal concatenation (e.g. "int32" "-sum-" "t").
 */
#define PRINT_RDXN_TABLES(TYPE_STR, TG_TAG) \
    do { \
        if (!mype) { \
            PRINT_RDXN_TABLE(TYPE_STR "-sum-" TG_TAG, h_sum_lat); \
            PRINT_RDXN_TABLE(TYPE_STR "-prod-" TG_TAG, h_prod_lat); \
            PRINT_RDXN_TABLE(TYPE_STR "-and-" TG_TAG, h_and_lat); \
            PRINT_RDXN_TABLE(TYPE_STR "-or-" TG_TAG, h_or_lat); \
            PRINT_RDXN_TABLE(TYPE_STR "-xor-" TG_TAG, h_xor_lat); \
            PRINT_RDXN_TABLE(TYPE_STR "-min-" TG_TAG, h_min_lat); \
            PRINT_RDXN_TABLE(TYPE_STR "-max-" TG_TAG, h_max_lat); \
        } \
    } while (0)

/*
 * Derives the element-count range for TYPE from min_size/max_size (at least one element),
 * benchmarks every operation for the given thread group, then prints the results.
 */
#define RUN_AND_PRINT(TYPENAME, TYPE, GROUP, ELEM_COMP, TG_TAG) \
    do { \
        min_elems = max(static_cast<size_t>(1), min_size / sizeof(TYPE)); \
        max_elems = max(static_cast<size_t>(1), max_size / sizeof(TYPE)); \
        RUN_ITERS(TYPENAME, TYPE, GROUP, ELEM_COMP); \
        PRINT_RDXN_TABLES(#TYPENAME, TG_TAG); \
    } while (0)

/*
 * Benchmarks the thread, warp and block variants of the device reduction API for int32 and
 * int64, as selected by run_options.
 *
 * team        - team the reductions run on
 * dest/source - device buffers handed to the reduction kernels
 * mype        - this PE's index; only PE 0 records timings and prints tables
 * stream      - stream used for kernel launches and timing events
 * run_options - selects which of the thread / warp / block variants to run
 * h_tables    - host tables: [0] sizes in bytes, [1..7] latencies for
 *               sum, prod, and, or, xor, min, max
 *
 * Returns the status of the last collective launch (0 when the cubin path is used). A failed
 * collective launch terminates the process inside CALL_RDXN_KERNEL.
 */
int rdxn_calling_kernel(nvshmem_team_t team, void *dest, const void *source, int mype,
                        cudaStream_t stream, run_opt_t run_options, void **h_tables) {
    int status = 0;
    int nvshm_test_num_tpb = threads_per_block;
    int num_blocks = 1;
    size_t num_elems = 1, min_elems, max_elems;
    int iter = iters;
    int skip = warmup_iters;
    int j;
    uint64_t *size_arr = (uint64_t *)h_tables[0];
    double *h_sum_lat = (double *)h_tables[1];
    double *h_prod_lat = (double *)h_tables[2];
    double *h_and_lat = (double *)h_tables[3];
    double *h_or_lat = (double *)h_tables[4];
    double *h_xor_lat = (double *)h_tables[5];
    double *h_min_lat = (double *)h_tables[6];
    double *h_max_lat = (double *)h_tables[7];

    // if (!mype) printf("Transfer size in bytes and latency of thread/warp/block variants of all
    // operations of reduction API in us\n");

    if (run_options.run_thread) {
        RUN_AND_PRINT(int32, int32_t, , 512, "t");
        RUN_AND_PRINT(int64, int64_t, , 512, "t");
    }

    if (run_options.run_warp) {
        RUN_AND_PRINT(int32, int32_t, _warp, 4096, "w");
        RUN_AND_PRINT(int64, int64_t, _warp, 4096, "w");
    }

    if (run_options.run_block) {
        /* The block variant is limited only by the configured maximum element count. */
        RUN_AND_PRINT(int32, int32_t, _block, max_elems, "b");
        RUN_AND_PRINT(int64, int64_t, _block, max_elems, "b");
    }

    return status;
}

/*
 * Entry point: sizes the symmetric heap for one send and one receive buffer, initializes the
 * runtime, allocates host and device buffers, runs the reduction benchmarks and tears
 * everything down again.
 *
 * Returns 0 on success and a non-zero status when setup or the benchmark reports a failure.
 */
int main(int argc, char **argv) {
    int status = 0;
    int mype, array_size;
    size_t size = 0;

    read_args(argc, argv);
    int *h_buffer = NULL;
    int *d_source, *d_dest;
    int *h_source, *h_dest;
    char size_string[100];
    cudaStream_t cstrm;
    run_opt_t run_options;
    void **h_tables;

    run_options.run_thread = run_options.run_warp = run_options.run_block = 1;

    size = page_size_roundoff(max_size);   // send buf
    size += page_size_roundoff(max_size);  // recv buf

    DEBUG_PRINT("symmetric size requested %zu\n", size);
    snprintf(size_string, sizeof(size_string), "%zu", size);

    status = setenv("NVSHMEM_SYMMETRIC_SIZE", size_string, 1);
    if (status) {
        fprintf(stderr, "setenv failed \n");
        status = -1;
        goto out;
    }

    array_size = max_size_log;

    init_wrapper(&argc, &argv);
    alloc_tables(&h_tables, 8, array_size);

    if (use_cubin) {
        init_cumodule(CUMODULE_NAME);
    }

    mype = nvshmem_my_pe();
    CUDA_CHECK(cudaStreamCreateWithFlags(&cstrm, cudaStreamNonBlocking));

    /* One pinned allocation backs both host buffers: source first, dest right after it. */
    CUDA_CHECK(cudaHostAlloc(&h_buffer, max_size * 2, cudaHostAllocDefault));
    h_source = (int32_t *)h_buffer;
    h_dest = (int32_t *)&h_source[max_size / sizeof(int32_t)];

    d_source = (int32_t *)nvshmem_align(getpagesize(), max_size);
    d_dest = (int32_t *)nvshmem_align(getpagesize(), max_size);

    CUDA_CHECK(cudaMemcpyAsync(d_source, h_source, max_size, cudaMemcpyHostToDevice, cstrm));
    CUDA_CHECK(cudaMemcpyAsync(d_dest, h_dest, max_size, cudaMemcpyHostToDevice, cstrm));

    status = rdxn_calling_kernel(NVSHMEM_TEAM_WORLD, d_dest, d_source, mype, cstrm, run_options,
                                 h_tables);

    DEBUG_PRINT("last error = %s\n", cudaGetErrorString(cudaGetLastError()));

    CUDA_CHECK(cudaMemcpyAsync(h_source, d_source, max_size, cudaMemcpyDeviceToHost, cstrm));
    CUDA_CHECK(cudaMemcpyAsync(h_dest, d_dest, max_size, cudaMemcpyDeviceToHost, cstrm));
    /* The copies above are asynchronous; wait for them before releasing their buffers. */
    CUDA_CHECK(cudaStreamSynchronize(cstrm));

    nvshmem_barrier_all();

    CUDA_CHECK(cudaFreeHost(h_buffer));
    nvshmem_free(d_source);
    nvshmem_free(d_dest);

    CUDA_CHECK(cudaStreamDestroy(cstrm));

    finalize_wrapper();

out:
    return status;
}