/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */

#define CUMODULE_NAME "shmem_atomic_bw.cubin"

#include "atomic_bw_common.h"

#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
extern "C" {
#endif
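
/*
 * The DEFINE_ATOMIC_BW_FN_* macros below (provided via atomic_bw_common.h)
 * expand to the per-operation bandwidth kernels; the extra macro argument(s)
 * supply the value (and compare) operand expressions for each atomic.
 */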
DEFINE_ATOMIC_BW_FN_NO_ARG(inc);
DEFINE_ATOMIC_BW_FN_NO_ARG(fetch_inc);

DEFINE_ATOMIC_BW_FN_ONE_ARG(add, 1);
DEFINE_ATOMIC_BW_FN_ONE_ARG(fetch_add, 1);

DEFINE_ATOMIC_BW_FN_ONE_ARG(and, (*(data_d + idx) << (i + 1)));
DEFINE_ATOMIC_BW_FN_ONE_ARG(fetch_and, (*(data_d + idx) << (i + 1)));

DEFINE_ATOMIC_BW_FN_ONE_ARG(or, (*(data_d + idx) << i));
DEFINE_ATOMIC_BW_FN_ONE_ARG(fetch_or, (*(data_d + idx) << i));

DEFINE_ATOMIC_BW_FN_ONE_ARG(xor, 1);
DEFINE_ATOMIC_BW_FN_ONE_ARG(fetch_xor, 1);

DEFINE_ATOMIC_BW_FN_ONE_ARG(swap, i + 1);
DEFINE_ATOMIC_BW_FN_ONE_ARG(set, i + 1);

DEFINE_ATOMIC_BW_FN_TWO_ARG(compare_swap, i, i + 1);

#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
}
#endif
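
/*
 * Benchmark flow: PE 0 sweeps message sizes and, for each size, launches a
 * warmup round, resets the data and counters, then times an iteration round
 * with CUDA events and records the bandwidth. PE 1 only participates in the
 * matching barriers.
 */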
int main(int argc, char *argv[]) {
    int mype, npes;
    int size;
    int nelems;
    uint64_t *data_d = NULL;
    uint64_t set_value;
    unsigned int *counter_d;
    read_args(argc, argv);

    int max_blocks = num_blocks, max_threads = threads_per_block;
    int array_size, i;
    void **h_tables;
    uint64_t *h_size_arr;
    double *h_bw;
    char perf_table_name[30];

    int iter = iters;
    int skip = warmup_iters;

    float milliseconds;
    cudaEvent_t start, stop;
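
    /*
     * Packed kernel-argument lists for CALL_ATOMIC_BW_KERNEL, presumably used
     * by the driver-API launch path when the prebuilt cubin is loaded; the
     * last entry selects the warmup (skip) or timed (iter) count.
     */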
    void *args_skip[] = {(void *)&data_d, (void *)&counter_d, (void *)&(nelems), (void *)&mype,
                         (void *)&skip};
    void *args_iter[] = {(void *)&data_d, (void *)&counter_d, (void *)&(nelems), (void *)&mype,
                         (void *)&iter};

    init_wrapper(&argc, &argv);

    if (use_cubin) {
        init_cumodule(CUMODULE_NAME);
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();

    if (npes != 2) {
        fprintf(stderr, "This test requires exactly two processes.\n");
        goto finalize;
    }
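
    /* Host-side result tables: one row per message size (size in bytes, bandwidth in GB/s). */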
    array_size = max_size_log;
    alloc_tables(&h_tables, 2, array_size);
    h_size_arr = (uint64_t *)h_tables[0];
    h_bw = (double *)h_tables[1];
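
    /*
     * data_d is the symmetric buffer targeted by the atomics; counter_d holds
     * two device counters, presumably used by the kernels for work
     * distribution (see atomic_bw_common.h).
     */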
    data_d = (uint64_t *)nvshmem_malloc(max_size);
    CUDA_CHECK(cudaMemset(data_d, 0, max_size));

    CUDA_CHECK(cudaMalloc((void **)&counter_d, sizeof(unsigned int) * 2));
    CUDA_CHECK(cudaMemset(counter_d, 0, sizeof(unsigned int) * 2));

    CUDA_CHECK(cudaDeviceSynchronize());
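
    /* Per-operation table name, e.g. "shmem_atomic_add". */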
    strncpy(perf_table_name, ("shmem_atomic_" + test_amo.name).c_str(), 30);
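
    /* PE 0 sweeps message sizes from min_size to max_size by step_factor. */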
    i = 0;
    if (mype == 0) {
        for (size = min_size; size <= max_size; size *= step_factor) {
            int blocks = max_blocks, threads = max_threads;
            nelems = size / sizeof(uint64_t);
            h_size_arr[i] = size;
            CUDA_CHECK(cudaMemset(counter_d, 0, sizeof(unsigned int) * 2));

            /* Do warmup round for NIC cache. */
            switch (test_amo.type) {
                case AMO_INC: {
                    CALL_ATOMIC_BW_KERNEL(inc, blocks, threads, data_d, counter_d, nelems, mype,
                                          skip, args_skip)
                    break;
                }
                case AMO_SET: {
                    CALL_ATOMIC_BW_KERNEL(set, blocks, threads, data_d, counter_d, nelems, mype,
                                          skip, args_skip)
                    break;
                }
                case AMO_ADD: {
                    CALL_ATOMIC_BW_KERNEL(add, blocks, threads, data_d, counter_d, nelems, mype,
                                          skip, args_skip)
                    break;
                }
                case AMO_AND: {
                    CUDA_CHECK(cudaMemset(data_d, 0xFF, size));
                    CALL_ATOMIC_BW_KERNEL(and, blocks, threads, data_d, counter_d, nelems, mype,
                                          skip, args_skip)
                    break;
                }
                case AMO_OR: {
                    CUDA_CHECK(cudaMemset(data_d, 0xFF, size));
                    CALL_ATOMIC_BW_KERNEL(or, blocks, threads, data_d, counter_d, nelems, mype,
                                          skip, args_skip)
                    break;
                }
                case AMO_XOR: {
                    set_value = 1;
                    for (size_t j = 0; j < size / sizeof(uint64_t); j++) {
                        cudaMemcpy((data_d + j), &set_value, sizeof(uint64_t),
                                   cudaMemcpyHostToDevice);
                    }
                    CALL_ATOMIC_BW_KERNEL(xor, blocks, threads, data_d, counter_d, nelems, mype,
                                          skip, args_skip)
                    break;
                }
                case AMO_FETCH_INC: {
                    CALL_ATOMIC_BW_KERNEL(fetch_inc, blocks, threads, data_d, counter_d, nelems,
                                          mype, skip, args_skip)
                    break;
                }
                case AMO_FETCH_ADD: {
                    CALL_ATOMIC_BW_KERNEL(fetch_add, blocks, threads, data_d, counter_d, nelems,
                                          mype, skip, args_skip)
                    break;
                }
                case AMO_FETCH_AND: {
                    CUDA_CHECK(cudaMemset(data_d, 0xFF, size));
                    CALL_ATOMIC_BW_KERNEL(fetch_and, blocks, threads, data_d, counter_d, nelems,
                                          mype, skip, args_skip)
                    break;
                }
                case AMO_FETCH_OR: {
                    CUDA_CHECK(cudaMemset(data_d, 0xFF, size));
                    CALL_ATOMIC_BW_KERNEL(fetch_or, blocks, threads, data_d, counter_d, nelems,
                                          mype, skip, args_skip)
                    break;
                }
                case AMO_FETCH_XOR: {
                    set_value = 1;
                    for (size_t j = 0; j < nelems; j++) {
                        cudaMemcpy((data_d + j), &set_value, sizeof(uint64_t),
                                   cudaMemcpyHostToDevice);
                    }
                    CALL_ATOMIC_BW_KERNEL(fetch_xor, blocks, threads, data_d, counter_d, nelems,
                                          mype, skip, args_skip)
                    break;
                }
                case AMO_SWAP: {
                    CALL_ATOMIC_BW_KERNEL(swap, blocks, threads, data_d, counter_d, nelems, mype,
                                          skip, args_skip)
                    break;
                }
                case AMO_COMPARE_SWAP: {
                    CALL_ATOMIC_BW_KERNEL(compare_swap, blocks, threads, data_d, counter_d, nelems,
                                          mype, skip, args_skip)
                    break;
                }
                default: {
                    /* Should be unreachable */
                    fprintf(stderr, "Error, unsupported Atomic op %d.\n", test_amo.type);
                    goto finalize;
                }
            }
            CUDA_CHECK(cudaGetLastError());
            CUDA_CHECK(cudaDeviceSynchronize());
            nvshmem_barrier_all();

            /* Reset counters and data buffers before the timed round. */
            CUDA_CHECK(cudaMemset(counter_d, 0, sizeof(unsigned int) * 2));
            switch (test_amo.type) {
                case AMO_AND: {
                    CUDA_CHECK(cudaMemset(data_d, 0xFF, size));
                    break;
                }
                case AMO_OR: {
                    CUDA_CHECK(cudaMemset(data_d, 0xFF, size));
                    break;
                }
                case AMO_XOR: {
                    set_value = 1;
                    for (size_t j = 0; j < size / sizeof(uint64_t); j++) {
                        cudaMemcpy((data_d + j), &set_value, sizeof(uint64_t),
                                   cudaMemcpyHostToDevice);
                    }
                    break;
                }
                case AMO_FETCH_AND: {
                    CUDA_CHECK(cudaMemset(data_d, 0xFF, size));
                    break;
                }
                case AMO_FETCH_OR: {
                    CUDA_CHECK(cudaMemset(data_d, 0xFF, size));
                    break;
                }
                case AMO_FETCH_XOR: {
                    set_value = 1;
                    for (size_t j = 0; j < size / sizeof(uint64_t); j++) {
                        cudaMemcpy((data_d + j), &set_value, sizeof(uint64_t),
                                   cudaMemcpyHostToDevice);
                    }
                    break;
                }
                default: { break; }
            }
            CUDA_CHECK(cudaGetLastError());
            CUDA_CHECK(cudaDeviceSynchronize());
            nvshmem_barrier_all();
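
            /* Timed round: CUDA events bracket the kernel launch for this op and size. */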
            cudaEventRecord(start);
            switch (test_amo.type) {
                case AMO_INC: {
                    CALL_ATOMIC_BW_KERNEL(inc, blocks, threads, data_d, counter_d, nelems, mype,
                                          iter, args_iter)
                    break;
                }
                case AMO_SET: {
                    CALL_ATOMIC_BW_KERNEL(set, blocks, threads, data_d, counter_d, nelems, mype,
                                          iter, args_iter)
                    break;
                }
                case AMO_ADD: {
                    CALL_ATOMIC_BW_KERNEL(add, blocks, threads, data_d, counter_d, nelems, mype,
                                          iter, args_iter)
                    break;
                }
                case AMO_AND: {
                    CALL_ATOMIC_BW_KERNEL(and, blocks, threads, data_d, counter_d, nelems, mype,
                                          iter, args_iter)
                    break;
                }
                case AMO_OR: {
                    CALL_ATOMIC_BW_KERNEL(or, blocks, threads, data_d, counter_d, nelems, mype,
                                          iter, args_iter)
                    break;
                }
                case AMO_XOR: {
                    CALL_ATOMIC_BW_KERNEL(xor, blocks, threads, data_d, counter_d, nelems, mype,
                                          iter, args_iter)
                    break;
                }
                case AMO_FETCH_INC: {
                    CALL_ATOMIC_BW_KERNEL(fetch_inc, blocks, threads, data_d, counter_d, nelems,
                                          mype, iter, args_iter)
                    break;
                }
                case AMO_FETCH_ADD: {
                    CALL_ATOMIC_BW_KERNEL(fetch_add, blocks, threads, data_d, counter_d, nelems,
                                          mype, iter, args_iter)
                    break;
                }
                case AMO_FETCH_AND: {
                    CALL_ATOMIC_BW_KERNEL(fetch_and, blocks, threads, data_d, counter_d, nelems,
                                          mype, iter, args_iter)
                    break;
                }
                case AMO_FETCH_OR: {
                    CALL_ATOMIC_BW_KERNEL(fetch_or, blocks, threads, data_d, counter_d, nelems,
                                          mype, iter, args_iter)
                    break;
                }
                case AMO_FETCH_XOR: {
                    CALL_ATOMIC_BW_KERNEL(fetch_xor, blocks, threads, data_d, counter_d, nelems,
                                          mype, iter, args_iter)
                    break;
                }
                case AMO_SWAP: {
                    CALL_ATOMIC_BW_KERNEL(swap, blocks, threads, data_d, counter_d, nelems, mype,
                                          iter, args_iter)
                    break;
                }
                case AMO_COMPARE_SWAP: {
                    CALL_ATOMIC_BW_KERNEL(compare_swap, blocks, threads, data_d, counter_d, nelems,
                                          mype, iter, args_iter)
                    break;
                }
                default: {
                    /* Should be unreachable */
                    fprintf(stderr, "Error, unsupported Atomic op %d.\n", test_amo.type);
                    goto finalize;
                }
            }
            cudaEventRecord(stop);
            CUDA_CHECK(cudaGetLastError());
            CUDA_CHECK(cudaEventSynchronize(stop));
            cudaEventElapsedTime(&milliseconds, start, stop);
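
            /*
             * Bandwidth in GB/s: (size * iter) bytes moved over the measured time,
             * with B_TO_GB and MS_TO_S converting bytes to GB and ms to s.
             */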
            h_bw[i] = size / (milliseconds * (B_TO_GB / (iter * MS_TO_S)));
            nvshmem_barrier_all();
            i++;
        }
    } else {
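        /*
         * PE 1 matches the three barriers PE 0 issues per size: after the
         * warmup, after the reset, and after the timed round.
         */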
        for (size = min_size; size <= max_size; size *= step_factor) {
            nvshmem_barrier_all();
            nvshmem_barrier_all();
            nvshmem_barrier_all();
        }
    }

    if (mype == 0) {
        print_table_basic(perf_table_name, "None", "size (Bytes)", "BW", "GB/sec", '+', h_size_arr,
                          h_bw, i);
    }

finalize:
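
    /* Common cleanup path (also reached on early exit from errors). */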
    if (data_d) nvshmem_free(data_d);
    free_tables(h_tables, 2);
    finalize_wrapper();

    return 0;
}