sglang.0.4.8.post1/nvshmem_src/examples/put-block.cu

/*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */
#include <stdio.h>
#include <stdlib.h> /* getenv, atoi, exit */
#include <assert.h>

#include "bootstrap_helper.h"
#include "nvshmem.h"
#include "nvshmemx.h"
#undef CUDA_CHECK
#define CUDA_CHECK(stmt)                                                          \
    do {                                                                          \
        cudaError_t result = (stmt);                                              \
        if (cudaSuccess != result) {                                              \
            fprintf(stderr, "[%s:%d] cuda failed with %s \n", __FILE__, __LINE__, \
                    cudaGetErrorString(result));                                  \
            exit(-1);                                                             \
        }                                                                         \
    } while (0)

#define THREADS_PER_BLOCK 1024
__global__ void set_and_shift_kernel(float *send_data, float *recv_data, int num_elems, int mype,
                                     int npes) {
    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;

    /* set the corresponding element of send_data */
    if (thread_idx < num_elems) send_data[thread_idx] = mype;

    int peer = (mype + 1) % npes;

    /* Every thread in the block calls nvshmemx_float_put_block. Alternatively, every thread
       could call nvshmem_float_p, but that has the disadvantage that when the destination GPU
       is connected via InfiniBand, one RMA message is issued for every single element, which
       can be detrimental to performance. The disadvantage of a single thread calling
       nvshmem_float_put is that when the destination GPU is P2P-connected, it cannot leverage
       multiple threads to copy the data to the destination GPU. */
    int block_offset = blockIdx.x * blockDim.x;
    /* All threads in a block call the API with the same arguments */
    nvshmemx_float_put_block(recv_data + block_offset, send_data + block_offset,
                             min(blockDim.x, num_elems - block_offset), peer);
}
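
/* For comparison only (not used below): a minimal sketch of the element-wise alternative
   mentioned in the comment above, assuming the standard nvshmem_float_p device API. Each
   thread issues its own put of a single float, which over InfiniBand becomes one RMA message
   per element. The kernel name is illustrative and not part of the original example. */
__global__ void set_and_shift_elementwise_kernel(float *send_data, float *recv_data,
                                                 int num_elems, int mype, int npes) {
    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
    int peer = (mype + 1) % npes;

    if (thread_idx < num_elems) {
        send_data[thread_idx] = mype;
        /* one put per element: simple, but generates per-element traffic on IB paths */
        nvshmem_float_p(recv_data + thread_idx, send_data[thread_idx], peer);
    }
}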
int main(int c, char *v[]) {
    int mype, npes, mype_node;
    float *send_data, *recv_data;
    int num_elems = 8192;
    int num_blocks;

#ifdef NVSHMEMTEST_MPI_SUPPORT
    bool use_mpi = false;
    char *value = getenv("NVSHMEMTEST_USE_MPI_LAUNCHER");
    if (value) use_mpi = atoi(value);

    if (use_mpi) {
        nvshmemi_init_mpi(&c, &v);
    } else {
        nvshmem_init();
    }
#else
    nvshmem_init();
#endif
    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();
    mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);

    /* application picks the device each PE will use: one GPU per node-local PE */
    CUDA_CHECK(cudaSetDevice(mype_node));

    /* allocate from the symmetric heap; the same buffers exist on every PE */
    send_data = (float *)nvshmem_malloc(sizeof(float) * num_elems);
    recv_data = (float *)nvshmem_malloc(sizeof(float) * num_elems);
    assert(send_data != NULL && recv_data != NULL);

    assert(num_elems % THREADS_PER_BLOCK == 0); /* for simplicity */
    num_blocks = num_elems / THREADS_PER_BLOCK;

    set_and_shift_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(send_data, recv_data, num_elems, mype,
                                                            npes);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    /* Validate: PE mype should have received the rank of its left neighbor,
       (mype - 1 + npes) % npes. Barrier first so that the put issued by that
       neighbor is complete before recv_data is read. */
    nvshmem_barrier_all();
    float *host = new float[num_elems];
    CUDA_CHECK(cudaMemcpy(host, recv_data, num_elems * sizeof(float), cudaMemcpyDefault));
    int ref = (mype - 1 + npes) % npes;
    bool success = true;
    for (int i = 0; i < num_elems; ++i) {
        if (host[i] != ref) {
            printf("Error at %d of rank %d: %f\n", i, mype, host[i]);
            success = false;
            break;
        }
    }

    if (success) {
        printf("[%d of %d] run complete\n", mype, npes);
    } else {
        printf("[%d of %d] run failure\n", mype, npes);
    }
    delete[] host;
    nvshmem_free(send_data);
    nvshmem_free(recv_data);

    nvshmem_finalize();
#ifdef NVSHMEMTEST_MPI_SUPPORT
    if (use_mpi) nvshmemi_finalize_mpi();
#endif
    return 0;
}
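
/*
 * One possible way to build and run this example; exact include/library paths, library
 * names, and launcher flags depend on the NVSHMEM version and installation (NVSHMEM_HOME
 * below is an assumed install prefix, and bootstrap_helper.h must be on the include path):
 *
 *   nvcc -rdc=true -I${NVSHMEM_HOME}/include put-block.cu -o put-block \
 *        -L${NVSHMEM_HOME}/lib -lnvshmem -lcuda
 *
 *   nvshmrun -n 2 ./put-block
 *
 * If built with NVSHMEMTEST_MPI_SUPPORT, the example can instead be launched with an MPI
 * launcher, e.g. NVSHMEMTEST_USE_MPI_LAUNCHER=1 mpirun -n 2 ./put-block
 */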