/*
 * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */
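
/*
 * Distributed vector sum example: every PE adds two NTHREADS-element vectors
 * on its GPU, and the per-PE partial sums are gathered into a single symmetric
 * array on all PEs with an NVSHMEM fcollect collective issued from device code.
 */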

#include <stdio.h>

#include "bootstrap_helper.h"
#include "nvshmem.h"
#include "nvshmemx.h"

#define NTHREADS 512

#undef CUDA_CHECK
#define CUDA_CHECK(stmt)                                                          \
    do {                                                                          \
        cudaError_t result = (stmt);                                              \
        if (cudaSuccess != result) {                                              \
            fprintf(stderr, "[%s:%d] cuda failed with %s \n", __FILE__, __LINE__, \
                    cudaGetErrorString(result));                                  \
            exit(-1);                                                             \
        }                                                                         \
    } while (0)
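
/*
 * Each of the blockDim.x threads adds one element of x and y into partial_sum.
 * The fcollect collective then concatenates every PE's nelems-element
 * contribution into sum (ordered by PE index), so sum holds nelems * npes ints
 * on every PE afterwards. use_threadgroup selects between the block-scoped
 * variant, in which all threads of the block cooperate, and a plain call
 * issued by thread 0 alone.
 */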
__global__ void distributed_vector_sum(int *x, int *y, int *partial_sum, int *sum,
                                        int use_threadgroup, int mype, int npes) {
    int index = threadIdx.x;
    int nelems = blockDim.x;
    partial_sum[index] = x[index] + y[index];

    if (use_threadgroup) {
        /* all threads realize the entire fcollect operation */
        nvshmemx_int_fcollect_block(NVSHMEM_TEAM_WORLD, sum, partial_sum, nelems);
    } else {
        /* thread 0 realizes the entire fcollect operation */
        if (0 == index) {
            nvshmem_int_fcollect(NVSHMEM_TEAM_WORLD, sum, partial_sum, nelems);
        }
    }
}

int main(int c, char *v[]) {
    int mype, npes, mype_node;
    int *x;
    int *y;
    int *partial_sum;
    int *sum;
    int use_threadgroup = 1; /* 1: use the block-scoped fcollect variant, 0: call it from a single thread */
    int nthreads = NTHREADS;

#ifdef NVSHMEMTEST_MPI_SUPPORT
    /* when built with MPI support, the NVSHMEMTEST_USE_MPI_LAUNCHER environment
     * variable selects MPI-based initialization instead of the default
     * nvshmem_init() bootstrap */
    bool use_mpi = false;
    char *value = getenv("NVSHMEMTEST_USE_MPI_LAUNCHER");
    if (value) use_mpi = atoi(value);
#endif

#ifdef NVSHMEMTEST_MPI_SUPPORT
    if (use_mpi) {
        nvshmemi_init_mpi(&c, &v);
    } else
        nvshmem_init();
#else
    nvshmem_init();
#endif
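
    /* query this PE's rank in the world team and in the node team; the
     * node-local rank picks which GPU this PE uses */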
    npes = nvshmem_n_pes();
    mype = nvshmem_my_pe();
    mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);

    CUDA_CHECK(cudaSetDevice(mype_node));
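
    /* allocate the operands and results from the NVSHMEM symmetric heap so the
     * fcollect collective can access them; sum needs room for every PE's
     * nthreads-element contribution */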
    x = (int *)nvshmem_malloc(sizeof(int) * nthreads);
    y = (int *)nvshmem_malloc(sizeof(int) * nthreads);
    partial_sum = (int *)nvshmem_malloc(sizeof(int) * nthreads);
    sum = (int *)nvshmem_malloc(sizeof(int) * nthreads * npes);
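
    /* kernels that call NVSHMEM collective or synchronizing APIs must be
     * launched with nvshmemx_collective_launch so that all PEs enter the
     * kernel together */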
    void *args[] = {&x, &y, &partial_sum, &sum, &use_threadgroup, &mype, &npes};
    dim3 dimBlock(nthreads);
    dim3 dimGrid(1);
    nvshmemx_collective_launch((const void *)distributed_vector_sum, dimGrid, dimBlock, args, 0, 0);
    CUDA_CHECK(cudaDeviceSynchronize());

    printf("[%d of %d] run complete \n", mype, npes);

    nvshmem_free(x);
    nvshmem_free(y);
    nvshmem_free(partial_sum);
    nvshmem_free(sum);

    nvshmem_finalize();
#ifdef NVSHMEMTEST_MPI_SUPPORT
    if (use_mpi) nvshmemi_finalize_mpi();
#endif

    return 0;
}