/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */

#ifndef _ATOMIC_LAT_COMMON_H_
#define _ATOMIC_LAT_COMMON_H_

/* NOTE(review): the header names were stripped from the original #include
 * directives (six bare "#include" tokens, only "utils.h" survived). The list
 * below is reconstructed from the APIs this file uses (printf/fprintf, assert,
 * uint64_t, CUDA driver API, CUDA runtime API) — confirm against upstream. */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "utils.h"

/*
 * Common scaffolding for NVSHMEM atomic-operation latency microbenchmarks.
 *
 * Each DEFINE_LAT_* macro instantiates one __global__ kernel per
 * (type, atomic-op) pair, named lat_<TYPE_NAME>_<AMO>, together with a host
 * wrapper test_lat_<TYPE_NAME>_<AMO>_cubin that launches the pre-compiled
 * cubin build of the same kernel. Every kernel asserts it runs with exactly
 * one thread in one block; only PE 0 issues atomics, targeting its peer
 * (PE 1), so the reported figure is one-sided latency. Exactly two PEs are
 * required (enforced in MAIN_SETUP).
 */

/* Host-side wrapper that launches the cubin build of lat_<TYPE_NAME>_<AMO>
 * as a 1x1x1 grid / 1x1x1 block cooperative kernel on `stream`. */
#define DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME)                                      \
    void test_lat_##TYPE_NAME##_##AMO##_cubin(cudaStream_t stream, void **arglist) {           \
        CUfunction test_cubin;                                                                 \
        init_test_case_kernel(&test_cubin, NVSHMEMI_TEST_STRINGIFY(lat_##TYPE_NAME##_##AMO));  \
        CU_CHECK(cuLaunchCooperativeKernel(test_cubin, 1, 1, 1, 1, 1, 1, 0, stream, arglist)); \
    }

/* Non-fetching AMO with no operand (e.g. inc): nvshmem_quiet() after every
 * op so each iteration times completion of a single atomic. */
#define DEFINE_LAT_NON_FETCH_TEST_FOR_AMO_NO_ARG(TYPE, TYPE_NAME, AMO)                          \
    DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME)                                           \
    __global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter) {                   \
        int i, tid, peer;                                                                       \
                                                                                                \
        /* single-thread benchmark: exactly one thread in the whole grid */                     \
        assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z);  \
        peer = !pe;                                                                             \
        tid = threadIdx.x;                                                                      \
                                                                                                \
        if ((pe == 0) && !tid) {                                                                \
            for (i = 0; i < iter; i++) {                                                        \
                nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, peer);                               \
                nvshmem_quiet();                                                                \
            }                                                                                   \
        }                                                                                       \
    }

/* Fetching AMO with no operand (e.g. fetch_inc): the fetch itself completes
 * each op, so a single quiet after the loop suffices. */
#define DEFINE_LAT_FETCH_TEST_FOR_AMO_NO_ARG(TYPE, TYPE_NAME, AMO)                              \
    DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME)                                           \
    __global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter) {                   \
        int i, tid, peer;                                                                       \
                                                                                                \
        /* single-thread benchmark: exactly one thread in the whole grid */                     \
        assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z);  \
        peer = !pe;                                                                             \
        tid = threadIdx.x;                                                                      \
                                                                                                \
        if ((pe == 0) && !tid) {                                                                \
            for (i = 0; i < iter; i++) {                                                        \
                (void)nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, peer);                         \
            }                                                                                   \
            nvshmem_quiet();                                                                    \
        }                                                                                       \
    }

/* Non-fetching AMO taking one operand (SET_EXPR, built from `value`).
 * COMPARE_EXPR is unused here but kept so all *_ONE_ARG/*_TWO_ARG macros
 * share one parameter list. */
#define DEFINE_LAT_NON_FETCH_TEST_FOR_AMO_ONE_ARG(TYPE, TYPE_NAME, AMO, COMPARE_EXPR, SET_EXPR) \
    DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME)                                           \
    __global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter, TYPE value,         \
                                            TYPE cmp) {                                         \
        int i, tid, peer;                                                                       \
                                                                                                \
        /* single-thread benchmark: exactly one thread in the whole grid */                     \
        assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z);  \
        peer = !pe;                                                                             \
        tid = threadIdx.x;                                                                      \
                                                                                                \
        if ((pe == 0) && !tid) {                                                                \
            for (i = 0; i < iter; i++) {                                                        \
                nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, SET_EXPR, peer);                     \
                nvshmem_quiet();                                                                \
            }                                                                                   \
        }                                                                                       \
    }

/* Fetching AMO taking one operand: one quiet after the loop (see the
 * no-arg fetch variant for rationale). */
#define DEFINE_LAT_FETCH_TEST_FOR_AMO_ONE_ARG(TYPE, TYPE_NAME, AMO, COMPARE_EXPR, SET_EXPR)     \
    DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME)                                           \
    __global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter, TYPE value,         \
                                            TYPE cmp) {                                         \
        int i, tid, peer;                                                                       \
                                                                                                \
        /* single-thread benchmark: exactly one thread in the whole grid */                     \
        assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z);  \
        peer = !pe;                                                                             \
        tid = threadIdx.x;                                                                      \
                                                                                                \
        if ((pe == 0) && !tid) {                                                                \
            for (i = 0; i < iter; i++) {                                                        \
                (void)nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, SET_EXPR, peer);               \
            }                                                                                   \
            nvshmem_quiet();                                                                    \
        }                                                                                       \
    }

/* Non-fetching AMO taking two operands (compare-and-set style).
 * FIX(review): the original issued the atomics with no nvshmem_quiet() at
 * all, so the timed loop measured only issue rate, inconsistent with the
 * other non-fetch variants; quiet is now issued per iteration to match
 * them. Confirm against upstream intent. */
#define DEFINE_LAT_NON_FETCH_TEST_FOR_AMO_TWO_ARG(TYPE, TYPE_NAME, AMO, COMPARE_EXPR, SET_EXPR) \
    DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME)                                           \
    __global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter, TYPE value,         \
                                            TYPE cmp) {                                         \
        int i, tid, peer;                                                                       \
                                                                                                \
        /* single-thread benchmark: exactly one thread in the whole grid */                     \
        assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z);  \
        peer = !pe;                                                                             \
        tid = threadIdx.x;                                                                      \
                                                                                                \
        if ((pe == 0) && !tid) {                                                                \
            for (i = 0; i < iter; i++) {                                                        \
                nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, COMPARE_EXPR, SET_EXPR, peer);       \
                nvshmem_quiet();                                                                \
            }                                                                                   \
        }                                                                                       \
    }

/* Fetching AMO taking two operands (e.g. compare_swap): one quiet after
 * the loop. */
#define DEFINE_LAT_FETCH_TEST_FOR_AMO_TWO_ARG(TYPE, TYPE_NAME, AMO, COMPARE_EXPR, SET_EXPR)     \
    DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME)                                           \
    __global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter, TYPE value,         \
                                            TYPE cmp) {                                         \
        int i, tid, peer;                                                                       \
                                                                                                \
        /* single-thread benchmark: exactly one thread in the whole grid */                     \
        assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z);  \
        peer = !pe;                                                                             \
        tid = threadIdx.x;                                                                      \
                                                                                                \
        if ((pe == 0) && !tid) {                                                                \
            for (i = 0; i < iter; i++) {                                                        \
                (void)nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, COMPARE_EXPR, SET_EXPR, peer); \
            }                                                                                   \
            nvshmem_quiet();                                                                    \
        }                                                                                       \
    }

/* One-time benchmark setup: NVSHMEM init, optional cubin module load, PE
 * count check (exactly 2 required), result tables, symmetric flag buffer
 * (zero-initialized), and the work stream. Exits the process if npes != 2. */
#define MAIN_SETUP(c, v, mype, npes, flag_d, stream, h_size_arr, h_tables, h_lat)  \
    do {                                                                           \
        init_wrapper(&c, &v);                                                      \
                                                                                   \
        if (use_cubin) {                                                           \
            init_cumodule(CUMODULE_NAME);                                          \
        }                                                                          \
                                                                                   \
        mype = nvshmem_my_pe();                                                    \
        npes = nvshmem_n_pes();                                                    \
                                                                                   \
        if (npes != 2) {                                                           \
            fprintf(stderr, "This test requires exactly two processes \n");        \
            finalize_wrapper();                                                    \
            exit(-1);                                                              \
        }                                                                          \
                                                                                   \
        /* table 0: transfer sizes, table 1: measured latencies */                 \
        alloc_tables(&h_tables, 2, 1);                                             \
        h_size_arr = (uint64_t *)h_tables[0];                                      \
        h_lat = (double *)h_tables[1];                                             \
                                                                                   \
        flag_d = nvshmem_malloc(sizeof(uint64_t));                                 \
        CUDA_CHECK(cudaMemset(flag_d, 0, sizeof(uint64_t)));                       \
                                                                                   \
        CUDA_CHECK(cudaStreamCreate(&stream));                                     \
                                                                                   \
        nvshmem_barrier_all();                                                     \
                                                                                   \
        CUDA_CHECK(cudaDeviceSynchronize());                                       \
                                                                                   \
        if (mype == 0) {                                                           \
            printf("Note: This test measures full round-trip latency\n");          \
        }                                                                          \
    } while (0)

/* Launch lat_<TYPE_NAME>_<AMO> on STREAM — either from the cubin module or
 * as a regular NVSHMEM cooperative launch. Expects `use_cubin` and `status`
 * in the caller's scope; exits the process on launch failure. Wrapped in
 * do/while(0) (the original expanded to a bare if/else — a dangling-else
 * hazard), so every use site must end with a semicolon. */
#define LAUNCH_KERNEL(TYPE_NAME, AMO, ARGLIST, STREAM)                                          \
    do {                                                                                        \
        if (use_cubin) {                                                                        \
            test_lat_##TYPE_NAME##_##AMO##_cubin(STREAM, ARGLIST);                              \
        } else {                                                                                \
            status = nvshmemx_collective_launch((const void *)lat_##TYPE_NAME##_##AMO, 1, 1,    \
                                                ARGLIST, 0, STREAM);                            \
            if (status != NVSHMEMX_SUCCESS) {                                                   \
                fprintf(stderr, "shmemx_collective_launch failed %d \n", status);               \
                exit(-1);                                                                       \
            }                                                                                   \
        }                                                                                       \
    } while (0)

/* Run one no-operand latency test: warm-up (`skip` iters), timed run
 * (`iter` iters) bracketed by CUDA events, then per-op latency in us is
 * stored in h_lat[0] and printed by PE 0. The flag is reset to `flag_init`
 * before each run. Expects `stream` and `use_cubin` in scope. */
#define RUN_TEST_WITHOUT_ARG(TYPE, TYPE_NAME, AMO, flag_d, mype, iter, skip, h_lat, h_size_arr, \
                             flag_init)                                                         \
    do {                                                                                        \
        int size = sizeof(TYPE);                                                                \
                                                                                                \
        int status = 0;                                                                         \
        h_size_arr[0] = size;                                                                   \
        void *args_1[] = {&flag_d, &mype, &skip};                                               \
        void *args_2[] = {&flag_d, &mype, &iter};                                               \
                                                                                                \
        float milliseconds;                                                                     \
        cudaEvent_t start, stop;                                                                \
        CUDA_CHECK(cudaEventCreate(&start));                                                    \
        CUDA_CHECK(cudaEventCreate(&stop));                                                     \
                                                                                                \
        TYPE flag_init_var = flag_init;                                                         \
        CUDA_CHECK(cudaMemcpy(flag_d, &flag_init_var, sizeof(TYPE), cudaMemcpyHostToDevice));   \
        CUDA_CHECK(cudaDeviceSynchronize());                                                    \
        nvshmem_barrier_all();                                                                  \
                                                                                                \
        /* warm-up; LAUNCH_KERNEL exits on failure, no extra check needed */                    \
        LAUNCH_KERNEL(TYPE_NAME, AMO, args_1, stream);                                          \
                                                                                                \
        CUDA_CHECK(cudaStreamSynchronize(stream));                                              \
                                                                                                \
        nvshmem_barrier_all();                                                                  \
        CUDA_CHECK(cudaMemcpy(flag_d, &flag_init_var, sizeof(TYPE), cudaMemcpyHostToDevice));   \
        CUDA_CHECK(cudaEventRecord(start, stream));                                             \
        LAUNCH_KERNEL(TYPE_NAME, AMO, args_2, stream);                                          \
        CUDA_CHECK(cudaEventRecord(stop, stream));                                              \
        CUDA_CHECK(cudaStreamSynchronize(stream));                                              \
        /* give latency in us */                                                                \
        CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));                           \
        h_lat[0] = (milliseconds * 1000) / iter;                                                \
                                                                                                \
        nvshmem_barrier_all();                                                                  \
                                                                                                \
        if (mype == 0) {                                                                        \
            print_table_basic("shmem_at_" #TYPE "_" #AMO "_ping_lat", "None", "size (Bytes)",   \
                              "latency", "us", '-', h_size_arr, h_lat, 1);                      \
        }                                                                                       \
                                                                                                \
        CUDA_CHECK(cudaDeviceSynchronize());                                                    \
        /* events were leaked in the original; release them per run */                          \
        CUDA_CHECK(cudaEventDestroy(start));                                                    \
        CUDA_CHECK(cudaEventDestroy(stop));                                                     \
    } while (0)

/* Same as RUN_TEST_WITHOUT_ARG, but for AMOs taking a value and/or compare
 * operand (`val`, `cmp`), forwarded to the kernel by address via the arg
 * lists. */
#define RUN_TEST_WITH_ARG(TYPE, TYPE_NAME, AMO, flag_d, mype, iter, skip, h_lat, h_size_arr,    \
                          val, cmp, flag_init)                                                  \
    do {                                                                                        \
        int size = sizeof(TYPE);                                                                \
        TYPE compare, value, flag_init_var;                                                     \
                                                                                                \
        int status = 0;                                                                         \
        h_size_arr[0] = size;                                                                   \
        void *args_1[] = {&flag_d, &mype, &skip, &value, &compare};                             \
        void *args_2[] = {&flag_d, &mype, &iter, &value, &compare};                             \
                                                                                                \
        float milliseconds;                                                                     \
        cudaEvent_t start, stop;                                                                \
        CUDA_CHECK(cudaEventCreate(&start));                                                    \
        CUDA_CHECK(cudaEventCreate(&stop));                                                     \
                                                                                                \
        compare = cmp;                                                                          \
        value = val;                                                                            \
        flag_init_var = flag_init;                                                              \
                                                                                                \
        CUDA_CHECK(cudaDeviceSynchronize());                                                    \
        nvshmem_barrier_all();                                                                  \
        CUDA_CHECK(cudaMemcpy(flag_d, &flag_init_var, sizeof(TYPE), cudaMemcpyHostToDevice));   \
                                                                                                \
        /* warm-up */                                                                           \
        LAUNCH_KERNEL(TYPE_NAME, AMO, args_1, stream);                                          \
                                                                                                \
        CUDA_CHECK(cudaStreamSynchronize(stream));                                              \
                                                                                                \
        nvshmem_barrier_all();                                                                  \
        CUDA_CHECK(cudaMemcpy(flag_d, &flag_init_var, sizeof(TYPE), cudaMemcpyHostToDevice));   \
        CUDA_CHECK(cudaEventRecord(start, stream));                                             \
        LAUNCH_KERNEL(TYPE_NAME, AMO, args_2, stream);                                          \
        CUDA_CHECK(cudaEventRecord(stop, stream));                                              \
        CUDA_CHECK(cudaStreamSynchronize(stream));                                              \
        /* give latency in us */                                                                \
        CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));                           \
        h_lat[0] = (milliseconds * 1000) / iter;                                                \
                                                                                                \
        nvshmem_barrier_all();                                                                  \
                                                                                                \
        if (mype == 0) {                                                                        \
            print_table_basic("shmem_at_" #TYPE "_" #AMO "_ping_lat", "None", "size (Bytes)",   \
                              "latency", "us", '-', h_size_arr, h_lat, 1);                      \
        }                                                                                       \
                                                                                                \
        CUDA_CHECK(cudaDeviceSynchronize());                                                    \
        /* events were leaked in the original; release them per run */                          \
        CUDA_CHECK(cudaEventDestroy(start));                                                    \
        CUDA_CHECK(cudaEventDestroy(stop));                                                     \
    } while (0)

/* Tear down everything MAIN_SETUP created. Trailing semicolon removed from
 * the original `} while (0);` so the macro composes like a statement. */
#define MAIN_CLEANUP(flag_d, stream, h_tables)  \
    do {                                        \
        if (flag_d) nvshmem_free(flag_d);       \
        cudaStreamDestroy(stream);              \
        free_tables(h_tables, 2);               \
        finalize_wrapper();                     \
    } while (0)

#endif /* _ATOMIC_LAT_COMMON_H_ */