/*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */

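/*
 * Device-side latency benchmark for nvshmem_barrier and nvshmem_barrier_all at
 * thread, warp, and block scope. Each variant is warmed up and then timed with
 * CUDA events; PE 0 reports the per-iteration latency in microseconds. Helpers
 * such as init_wrapper, alloc_tables, and print_table_basic are assumed to be
 * provided by the shared perftest harness pulled in through coll_test.h.
 */
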
#define CUMODULE_NAME "barrier_latency.cubin"

#include "coll_test.h"

#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
extern "C" {
#endif

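/*
 * Host-side launcher for the pre-compiled cubin path: looks up the kernel by
 * name in the loaded module and launches it as a cooperative kernel.
 * init_test_case_kernel, NVSHMEMI_TEST_STRINGIFY, and CU_CHECK are assumed to
 * come from the test harness (coll_test.h).
 */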
#define BARRIER_KERNEL_WRAPPER(TG_PRE, THREADGROUP, THREAD_COMP, VARIANT, VARIANT_API)             \
    void test_barrier##VARIANT##call_kernel##VARIANT_API##THREADGROUP##_cubin(                     \
        int num_blocks, int num_tpb, cudaStream_t stream, void **arglist) {                        \
        CUfunction test_cubin;                                                                      \
                                                                                                    \
        init_test_case_kernel(&test_cubin,                                                          \
                              NVSHMEMI_TEST_STRINGIFY(                                              \
                                  test_barrier##VARIANT##call_kernel##VARIANT_API##THREADGROUP));   \
        CU_CHECK(cuLaunchCooperativeKernel(test_cubin, num_blocks, 1, 1, num_tpb, 1, 1, 0, stream,  \
                                           arglist));                                               \
    }

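/*
 * Generates a pair of __global__ kernels for one thread-group scope: one that
 * repeats nvshmem##TG_PRE##_barrier##THREADGROUP(team) and one that repeats
 * nvshmem##TG_PRE##_barrier_all##THREADGROUP(). Only block 0 participates, and
 * THREAD_COMP limits how many of its threads enter the loop (1 for thread
 * scope, warpSize for warp scope, INT_MAX i.e. the whole block for block
 * scope). Each kernel performs `iter` back-to-back barriers.
 */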
#define BARRIER_KERNEL(TG_PRE, THREADGROUP, THREAD_COMP)                                    \
    __global__ void test_barrier_call_kernel##THREADGROUP(nvshmem_team_t team, int iter) { \
        int i;                                                                              \
        if (!blockIdx.x && (threadIdx.x < THREAD_COMP)) {                                   \
            for (i = 0; i < iter; i++) {                                                    \
                nvshmem##TG_PRE##_barrier##THREADGROUP(team);                               \
            }                                                                               \
        }                                                                                   \
    }                                                                                       \
                                                                                            \
    __global__ void test_barrier_all_call_kernel##THREADGROUP(int iter) {                   \
        int i;                                                                              \
        if (!blockIdx.x && (threadIdx.x < THREAD_COMP)) {                                   \
            for (i = 0; i < iter; i++) {                                                    \
                nvshmem##TG_PRE##_barrier_all##THREADGROUP();                               \
            }                                                                               \
        }                                                                                   \
    }

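/*
 * Dispatches one benchmark launch: the cubin wrapper when use_cubin is set,
 * otherwise nvshmemx_collective_launch on the statically compiled kernel.
 * Expands in a scope where `status` is declared; `use_cubin` is assumed to be
 * a flag maintained by the test harness.
 */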
#define CALL_BARRIER_KERNEL(THREADGROUP, BLOCKS, THREADS, ARG_LIST, STREAM, VARIANT)         \
    if (use_cubin) {                                                                         \
        test_barrier##VARIANT##call_kernel##THREADGROUP##_cubin(BLOCKS, THREADS, STREAM,     \
                                                                ARG_LIST);                   \
    } else {                                                                                 \
        status = nvshmemx_collective_launch(                                                 \
            (const void *)test_barrier##VARIANT##call_kernel##THREADGROUP, BLOCKS, THREADS,  \
            ARG_LIST, 0, STREAM);                                                            \
        if (status != NVSHMEMX_SUCCESS) {                                                    \
            fprintf(stderr, "nvshmemx_collective_launch failed %d \n", status);              \
            exit(-1);                                                                        \
        }                                                                                    \
    }

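/*
 * Instantiate the kernels for the three scopes. For example,
 * BARRIER_KERNEL(x, _warp, warpSize) expands to test_barrier_call_kernel_warp
 * (calling nvshmemx_barrier_warp(team)) and test_barrier_all_call_kernel_warp
 * (calling nvshmemx_barrier_all_warp()).
 */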
BARRIER_KERNEL(, , 1);
BARRIER_KERNEL(x, _warp, warpSize);
BARRIER_KERNEL(x, _block, INT_MAX);

#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
}
#endif

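/*
 * Cubin launchers for the same six kernels. For example,
 * BARRIER_KERNEL_WRAPPER(x, warp, warpSize, _, _) defines
 * test_barrier_call_kernel_warp_cubin(), which launches the
 * test_barrier_call_kernel_warp symbol from the loaded module.
 */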
BARRIER_KERNEL_WRAPPER(, , 1, _, );
BARRIER_KERNEL_WRAPPER(x, warp, warpSize, _, _);
BARRIER_KERNEL_WRAPPER(x, block, INT_MAX, _, _);
BARRIER_KERNEL_WRAPPER(, , 1, _all_, );
BARRIER_KERNEL_WRAPPER(x, warp, warpSize, _all_, _);
BARRIER_KERNEL_WRAPPER(x, block, INT_MAX, _all_, _);

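/*
 * Runs all six benchmarks on a single block. For each scope the kernel is
 * first launched with `skip` warmup iterations, then re-launched with `iter`
 * timed iterations bracketed by CUDA events; PE 0 converts the elapsed time
 * to microseconds per barrier. threads_per_block, warmup_iters, and iters are
 * assumed to be globals filled in by read_args() in the harness.
 */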
int barrier_calling_kernel(nvshmem_team_t team, cudaStream_t stream, int mype, void **h_tables) {
    int status = 0;
    int nvshm_test_num_tpb = threads_per_block;
    size_t skip = warmup_iters;
    size_t iter = iters;
    int num_blocks = 1;
    double *h_thread_lat = (double *)h_tables[0];
    double *h_warp_lat = (double *)h_tables[1];
    double *h_block_lat = (double *)h_tables[2];
    uint64_t size = 0;
    void *barrier_args_1[] = {&team, &skip};
    void *barrier_args_2[] = {&team, &iter};
    void *barrier_all_args_1[] = {&skip};
    void *barrier_all_args_2[] = {&iter};
    float milliseconds;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

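    /* nvshmem_barrier(team), thread scope: warmup launch, then timed launch. */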
    nvshmem_barrier_all();
    CALL_BARRIER_KERNEL(, num_blocks, nvshm_test_num_tpb, barrier_args_1, stream, _)

    CUDA_CHECK(cudaStreamSynchronize(stream));

    nvshmem_barrier_all();

    cudaEventRecord(start, stream);
    CALL_BARRIER_KERNEL(, num_blocks, nvshm_test_num_tpb, barrier_args_2, stream, _)

    cudaEventRecord(stop, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (!mype) {
        cudaEventElapsedTime(&milliseconds, start, stop);
        h_thread_lat[0] = (milliseconds * 1000.0) / (float)iter;
    }

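    /* Warp-scope nvshmem barrier (nvshmemx_barrier_warp). */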
    nvshmem_barrier_all();
    CALL_BARRIER_KERNEL(_warp, num_blocks, nvshm_test_num_tpb, barrier_args_1, stream, _)

    CUDA_CHECK(cudaStreamSynchronize(stream));

    nvshmem_barrier_all();

    cudaEventRecord(start, stream);
    CALL_BARRIER_KERNEL(_warp, num_blocks, nvshm_test_num_tpb, barrier_args_2, stream, _)

    cudaEventRecord(stop, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (!mype) {
        cudaEventElapsedTime(&milliseconds, start, stop);
        h_warp_lat[0] = (milliseconds * 1000.0) / (float)iter;
    }

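    /* Block-scope nvshmem barrier (nvshmemx_barrier_block). */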
    nvshmem_barrier_all();
    CALL_BARRIER_KERNEL(_block, num_blocks, nvshm_test_num_tpb, barrier_args_1, stream, _)

    CUDA_CHECK(cudaStreamSynchronize(stream));

    nvshmem_barrier_all();

    cudaEventRecord(start, stream);
    CALL_BARRIER_KERNEL(_block, num_blocks, nvshm_test_num_tpb, barrier_args_2, stream, _)

    cudaEventRecord(stop, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (!mype) {
        cudaEventElapsedTime(&milliseconds, start, stop);
        h_block_lat[0] = (milliseconds * 1000.0) / (float)iter;
    }

    if (!mype) {
        print_table_basic("barrier_device", "thread", "threads per block", "latency", "us", '-',
                          &size, h_thread_lat, 1);
        print_table_basic("barrier_device", "warp", "threads per block", "latency", "us", '-',
                          &size, h_warp_lat, 1);
        print_table_basic("barrier_device", "block", "threads per block", "latency", "us", '-',
                          &size, h_block_lat, 1);
    }

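    /* nvshmem_barrier_all, thread scope. */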
    nvshmem_barrier_all();
    CALL_BARRIER_KERNEL(, num_blocks, nvshm_test_num_tpb, barrier_all_args_1, stream, _all_)

    CUDA_CHECK(cudaStreamSynchronize(stream));

    nvshmem_barrier_all();

    cudaEventRecord(start, stream);
    CALL_BARRIER_KERNEL(, num_blocks, nvshm_test_num_tpb, barrier_all_args_2, stream, _all_)

    cudaEventRecord(stop, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (!mype) {
        cudaEventElapsedTime(&milliseconds, start, stop);
        h_thread_lat[0] = (milliseconds * 1000.0) / (float)iter;
    }

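    /* Warp-scope nvshmem_barrier_all (nvshmemx_barrier_all_warp). */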
    nvshmem_barrier_all();
    CALL_BARRIER_KERNEL(_warp, num_blocks, nvshm_test_num_tpb, barrier_all_args_1, stream, _all_)

    CUDA_CHECK(cudaStreamSynchronize(stream));

    nvshmem_barrier_all();

    cudaEventRecord(start, stream);
    CALL_BARRIER_KERNEL(_warp, num_blocks, nvshm_test_num_tpb, barrier_all_args_2, stream, _all_)

    cudaEventRecord(stop, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (!mype) {
        cudaEventElapsedTime(&milliseconds, start, stop);
        h_warp_lat[0] = (milliseconds * 1000.0) / (float)iter;
    }

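    /* Block-scope nvshmem_barrier_all (nvshmemx_barrier_all_block). */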
    nvshmem_barrier_all();
    CALL_BARRIER_KERNEL(_block, num_blocks, nvshm_test_num_tpb, barrier_all_args_1, stream, _all_)

    CUDA_CHECK(cudaStreamSynchronize(stream));

    nvshmem_barrier_all();

    cudaEventRecord(start, stream);
    CALL_BARRIER_KERNEL(_block, num_blocks, nvshm_test_num_tpb, barrier_all_args_2, stream, _all_)

    cudaEventRecord(stop, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    if (!mype) {
        cudaEventElapsedTime(&milliseconds, start, stop);
        h_block_lat[0] = (milliseconds * 1000.0) / (float)iter;
    }

    if (!mype) {
        print_table_basic("barrier_all_device", "thread", "threads per block", "latency", "us", '-',
                          &size, h_thread_lat, 1);
        print_table_basic("barrier_all_device", "warp", "threads per block", "latency", "us", '-',
                          &size, h_warp_lat, 1);
        print_table_basic("barrier_all_device", "block", "threads per block", "latency", "us", '-',
                          &size, h_block_lat, 1);
    }

    return status;
}

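/*
 * Standard perftest entry point: parse arguments, initialize NVSHMEM through
 * the harness, optionally load the cubin module, run the benchmark on
 * NVSHMEM_TEAM_WORLD, and tear everything down.
 */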
int main(int argc, char **argv) {
    int mype;
    cudaStream_t cstrm;
    void **h_tables;

    read_args(argc, argv);
    init_wrapper(&argc, &argv);
    alloc_tables(&h_tables, 3, 1);

    if (use_cubin) {
        init_cumodule(CUMODULE_NAME);
    }

    mype = nvshmem_my_pe();
    CUDA_CHECK(cudaStreamCreateWithFlags(&cstrm, cudaStreamNonBlocking));

    barrier_calling_kernel(NVSHMEM_TEAM_WORLD, cstrm, mype, h_tables);

    nvshmem_barrier_all();

    CUDA_CHECK(cudaStreamDestroy(cstrm));
    free_tables(h_tables, 3);
    finalize_wrapper();

    return 0;
}