sglang.0.4.8.post1/nvshmem_src/src/host/team/team_internal.cpp

/*
* Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
*
* See COPYRIGHT for license information
*/
#include "team_internal.h"
#include <assert.h> // for assert
#include <cuda_runtime.h> // for cudaMemcpy
#include <driver_types.h> // for cudaMemcpyHos...
#include <limits.h> // for CHAR_BIT
#include <stdint.h> // for uint64_t
#include <stdio.h> // for snprintf, printf
#include <stdlib.h> // for free, malloc
#include <string.h> // for memset, memcmp
#include <cmath> // for ceil
#include <sstream> // for std::ostringstream
#include "../coll/rdxn/rdxn.h" // for nvshmemi_call...
#include "device_host/nvshmem_types.h" // for nvshmemi_team_t
#include "cpu_coll.h" // for nccl_ftable
#include "device_host/nvshmem_common.cuh" // for nvshmemi_pe_i...
#include "bootstrap_host_transport/env_defs_internal.h" // for nvshmemi_opti...
#include "host/nvshmem_api.h" // for nvshmem_quiet
#include "host/nvshmem_coll_api.h" // for nvshmem_team_...
#include "host/nvshmemx_api.h" // for nvshmemx_char...
#include "non_abi/nvshmemx_error.h" // for NVSHMEMI_NULL...
#include "internal/host/debug.h" // for INFO, NVSHMEM...
#include "internal/host/nvshmem_internal.h" // for nvshmemi_free
#include "internal/host/nvshmemi_coll.h" // for nvshmemi_barrier
#include "internal/host/nvshmemi_symmetric_heap.hpp" // for nvshmemi_symm...
#include "internal/host/nvshmemi_team.h" // for N_PSYNC_BYTES
#include "internal/host/nvshmemi_types.h" // for nvshmemi_state
#include "internal/host/util.h" // for CUDA_RUNTIME_...
#include "internal/bootstrap_host_transport/nvshmemi_bootstrap_defines.h" // for nvshmemi_boot...
#include "internal/host_transport/transport.h" // for nvshmem_trans...
#include "non_abi/nvshmem_build_options.h" // for NVSHMEM_USE_NCCL
#include "internal/host/nvshmemi_nvls_rsc.hpp" // for nvshmemi_nvls...
#ifdef NVSHMEM_USE_NCCL
#include "nccl.h" // for ncclUniqueId
#endif
#include "internal/host_transport/nvshmemi_transport_defines.h" // for NVSHMEM_MEM_H...
using namespace nvls;
#define NVSHMEMI_DIAG_STRLEN 1024
#define NVSHMEMI_SYNC_VALUE 0
#define NVSHMEMI_REDUCE_MAX_CTA_COUNT 64
/* The 0th entry in the team duplicate resources is the same team as the encapsulating team. This
 * allows reuse of the same business logic for nCTA == 1 and nCTA > 1 and minimizes if/else
 * branching.
 */
#define NVSHMEMI_TEAM_DUP_INITIALIZER(teami, team_idx) \
(teami).team_dups[0] = (team_idx); \
for (int i = 1; i < 128; i++) { \
(teami).team_dups[i] = NVSHMEM_TEAM_INVALID; \
}
long nvshmemi_max_teams;
nvshmemi_team_t nvshmemi_team_world;
nvshmemi_team_t nvshmemi_team_shared;
nvshmemi_team_t nvshmemi_team_node;
nvshmemi_team_t nvshmemi_team_same_mype_node;
nvshmemi_team_t nvshmemi_team_same_gpu;
nvshmemi_team_t nvshmemi_team_gpu_leaders;
nvshmemi_team_t *nvshmemi_device_team_world, *nvshmemi_device_team_shared,
*nvshmemi_device_team_node, *nvshmemi_device_team_same_mype_node,
*nvshmemi_device_team_same_gpu, *nvshmemi_device_team_gpu_leaders;
nvshmemi_team_t **nvshmemi_team_pool;
long *nvshmemi_psync_pool;
long *nvshmemi_sync_counter;
nvshmemi_team_t **nvshmemi_device_team_pool;
static unsigned char *psync_pool_avail;
static unsigned char *psync_pool_avail_reduced;
static unsigned char *device_psync_pool_avail;
static unsigned char *device_psync_pool_avail_reduced;
static int *team_ret_val;
static int *team_ret_val_reduced;
static int *device_team_ret_val;
static int *device_team_ret_val_reduced;
bool nvshmemi_team_support_nvls(nvshmemi_team_t *team) {
return ((team->are_gpus_p2p_connected) && (team->nvls_rsc != nullptr));
}
bool nvshmemi_team_is_owner_nvls(nvshmemi_team_t *team) {
nvshmemi_nvls_rsc *nvls = reinterpret_cast<nvshmemi_nvls_rsc *>(team->nvls_rsc);
INFO(NVSHMEM_TEAM, "Team ID: %d NVLS Resource Owner ID: %d\n", team->team_idx,
nvls->get_owner());
return (nvls->is_owner(team));
}
static bool nvshmemi_team_is_nvls_capable(nvshmemi_team_t *team) {
return (team->are_gpus_p2p_connected && team->size >= 2 && nvshmemi_state->is_platform_nvls);
}
static bool nvshmemi_team_is_identical(nvshmemi_team_t *t1, nvshmemi_team_t *t2) {
return (t1->start == t2->start && t1->stride == t2->stride && t1->size == t2->size);
}
static void nvshmemi_team_alloc_device(void) {
CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_world, sizeof(nvshmemi_team_t)));
CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_shared, sizeof(nvshmemi_team_t)));
CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_node, sizeof(nvshmemi_team_t)));
CUDA_RUNTIME_CHECK(
cudaMalloc((void **)&nvshmemi_device_team_same_mype_node, sizeof(nvshmemi_team_t)));
CUDA_RUNTIME_CHECK(
cudaMalloc((void **)&nvshmemi_device_team_same_gpu, sizeof(nvshmemi_team_t)));
CUDA_RUNTIME_CHECK(
cudaMalloc((void **)&nvshmemi_device_team_gpu_leaders, sizeof(nvshmemi_team_t)));
}
static void nvshmemi_team_update_device(void) {
CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_world, &nvshmemi_team_world,
sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_WORLD_INDEX],
&nvshmemi_device_team_world, sizeof(nvshmemi_team_t *),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_shared, &nvshmemi_team_shared,
sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_SHARED_INDEX],
&nvshmemi_device_team_shared, sizeof(nvshmemi_team_t *),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_node, &nvshmemi_team_node,
sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_NODE_INDEX],
&nvshmemi_device_team_node, sizeof(nvshmemi_team_t *),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_same_mype_node,
&nvshmemi_team_same_mype_node, sizeof(nvshmemi_team_t),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX],
&nvshmemi_device_team_same_mype_node, sizeof(nvshmemi_team_t *),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_same_gpu, &nvshmemi_team_same_gpu,
sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_SAME_GPU_INDEX],
&nvshmemi_device_team_same_gpu, sizeof(nvshmemi_team_t *),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_gpu_leaders, &nvshmemi_team_gpu_leaders,
sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_GPU_LEADERS_INDEX],
&nvshmemi_device_team_gpu_leaders, sizeof(nvshmemi_team_t *),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
}
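/* Compute the recursive-exchange (k-nomial) neighbor lists used by the reduction algorithm.
 * Step 1 folds the PEs that do not fit into the largest power-of-k subset onto participating PEs;
 * Step 2 computes, for each of the log_k phases, the k-1 neighbors of this PE. The resulting
 * neighbor lists are translated to global PE numbers, copied to device memory, and the device
 * pointers are stored in teami->reduce_recexch. */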
static void nvshmemi_recexchalgo_get_neighbors(nvshmemi_team_t *teami) {
int i, j, k;
int p_of_k = 1, log_p_of_k = 0, rem, T, newpe;
int step1_sendto, step1_nrecvs, step2_nphases;
int *step1_recvfrom, **step2_nbrs;
int *step1_recvfrom_device, **step2_nbrs_device;
int my_pe = teami->my_pe;
int num_pes = teami->size;
INFO(NVSHMEM_COLL, "step 1 nbr calculation started, num_pes = %d", num_pes);
k = nvshmemi_options.REDUCE_RECEXCH_KVAL;
assert(k > 1);
if (num_pes < k) /* If size of the active set is less than k, reduce the value of k */
k = (num_pes > 2) ? num_pes : 2;
/* Calculate p_of_k, the largest power of k that is not larger than num_pes */
while (p_of_k <= num_pes) {
p_of_k *= k;
log_p_of_k++;
}
p_of_k /= k;
/* protect against underflow warnings when asserts are disabled. */
if (log_p_of_k > 0) {
log_p_of_k--;
}
step2_nphases = log_p_of_k;
step1_recvfrom = (int *)malloc(sizeof(int) * (k - 1));
assert(step1_recvfrom);
step2_nbrs = (int **)malloc(sizeof(int *) * step2_nphases);
assert(step2_nbrs);
for (int i = 0; i < step2_nphases; i++) {
step2_nbrs[i] = (int *)malloc(sizeof(int) * (k - 1));
assert(step2_nbrs[i]);
}
rem = num_pes - p_of_k;
/* rem is the number of PEs that do not participate in Step 2
* We need to identify these non-participating PEs. This is done in the following way.
* The first T PEs are divided into sets of k consecutive PEs each.
* In each of these sets, the first k-1 PEs are the non-participating
* PEs while the last PE is the participating PE.
* The non-participating PEs send their data to the participating PE
* in their corresponding set.
*/
T = (rem * k) / (k - 1);
INFO(NVSHMEM_COLL, "step 1 nbr calculation started. T is %d", T);
step1_nrecvs = 0;
step1_sendto = -1;
/* Step 1 */
if (my_pe < T) {
if (my_pe % k != (k - 1)) { /* I am a non-participating PE */
step1_sendto = my_pe + (k - 1 - my_pe % k); /* participating PE to send the data to */
/* if the corresponding participating PE is not in T,
* then send to the Tth PE to preserve non-commutativity */
if (step1_sendto > T - 1) step1_sendto = T;
newpe = -1; /* tag this PE as non-participating */
} else { /* participating PE */
for (i = 0; i < k - 1; i++) {
step1_recvfrom[i] = my_pe - i - 1;
}
step1_nrecvs = k - 1;
newpe = my_pe / k; /* this is the new PE amongst the set of participating PEs */
}
} else { /* PE >= T */
newpe = my_pe - rem;
if (my_pe == T && (T - 1) % k != k - 1 && T >= 1) {
int nsenders = (T - 1) % k + 1; /* number of PEs sending their data to me in Step 1 */
for (j = nsenders - 1; j >= 0; j--) {
step1_recvfrom[nsenders - 1 - j] = T - nsenders + j;
}
step1_nrecvs = nsenders;
}
}
INFO(NVSHMEM_COLL, "step 1 nbr computation completed");
/* Step 2 */
if (step1_sendto == -1) { /* calculate step2_nbrs only for participating PEs */
int *digit = (int *)malloc(sizeof(int) * step2_nphases);
assert(digit != NULL);
int temppe = newpe;
int mask = 0x1;
int phase = 0, cbit, cnt, nbr, power;
/* calculate the digits in base k representation of newpe */
for (i = 0; i < log_p_of_k; i++) digit[i] = 0;
int remainder, i_digit = 0;
while (temppe != 0) {
remainder = temppe % k;
temppe = temppe / k;
digit[i_digit] = remainder;
i_digit++;
}
while (mask < p_of_k) {
cbit =
digit[phase]; /* phase_th digit changes in this phase, obtain its original value */
cnt = 0;
for (i = 0; i < k; i++) { /* there are k-1 neighbors */
if (i != cbit) { /* do not generate yourself as your neighbor */
digit[phase] = i; /* this gets us the base k representation of the neighbor */
/* calculate the base 10 value of the neighbor PE */
nbr = 0;
power = 1;
for (j = 0; j < log_p_of_k; j++) {
nbr += digit[j] * power;
power *= k;
}
/* calculate its real PE and store it */
step2_nbrs[phase][cnt] =
(nbr < rem / (k - 1)) ? (nbr * k) + (k - 1) : nbr + rem;
cnt++;
}
}
INFO(NVSHMEM_COLL, "step 2, phase %d nbr calculation completed", phase);
digit[phase] = cbit; /* reset the digit to original value */
phase++;
mask *= k;
}
free(digit);
}
// Update with global PE numbers
if (step1_sendto != -1) step1_sendto = teami->start + step1_sendto * teami->stride;
for (int i = 0; i < step1_nrecvs; i++)
step1_recvfrom[i] = teami->start + step1_recvfrom[i] * teami->stride;
for (int i = 0; i < step2_nphases; i++)
for (int j = 0; j < k - 1; j++)
step2_nbrs[i][j] = teami->start + step2_nbrs[i][j] * teami->stride;
// Copy the data to device memory
CUDA_RUNTIME_CHECK(cudaMalloc(&step1_recvfrom_device, sizeof(int) * (k - 1)));
CUDA_RUNTIME_CHECK(cudaMalloc(
&step2_nbrs_device,
sizeof(int *) * (step2_nphases + 1))); /* + 1 to make it non-zero otherwise cuMemAlloc
returns error when step2_nphases is 0 */
for (int i = 0; i < step2_nphases; i++) {
void *dev_ptr;
CUDA_RUNTIME_CHECK(cudaMalloc(&dev_ptr, sizeof(int) * (k - 1)));
CUDA_RUNTIME_CHECK(cudaMemcpy((int **)step2_nbrs_device + i, &dev_ptr, sizeof(int *),
cudaMemcpyHostToDevice));
}
CUDA_RUNTIME_CHECK(cudaMemcpy(step1_recvfrom_device, step1_recvfrom, sizeof(int) * step1_nrecvs,
cudaMemcpyHostToDevice));
void *dev_ptr, *dev_ptr_2;
dev_ptr = step2_nbrs_device;
for (int i = 0; i < step2_nphases; i++) {
CUDA_RUNTIME_CHECK(
cudaMemcpy(&dev_ptr_2, (int **)dev_ptr + i, sizeof(int *), cudaMemcpyDeviceToHost));
CUDA_RUNTIME_CHECK(
cudaMemcpy(dev_ptr_2, step2_nbrs[i], sizeof(int) * (k - 1), cudaMemcpyHostToDevice));
}
teami->reduce_recexch.step1_sendto = step1_sendto;
teami->reduce_recexch.step1_nrecvs = step1_nrecvs;
teami->reduce_recexch.step2_nphases = step2_nphases;
teami->reduce_recexch.step1_recvfrom = step1_recvfrom_device;
teami->reduce_recexch.step2_nbrs = step2_nbrs_device;
free(step1_recvfrom);
for (int i = 0; i < step2_nphases; i++) {
if (step2_nbrs[i]) {
free(step2_nbrs[i]);
}
}
free(step2_nbrs);
}
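/* Free the device-side neighbor lists allocated by nvshmemi_recexchalgo_get_neighbors(). */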
static void nvshmemi_recexchalgo_free_mem(nvshmemi_team_t *teami) {
CUDA_RUNTIME_CHECK(cudaFree(teami->reduce_recexch.step1_recvfrom));
for (int i = 0; i < teami->reduce_recexch.step2_nphases; i++) {
void *dev_ptr;
CUDA_RUNTIME_CHECK(cudaMemcpy(&dev_ptr, teami->reduce_recexch.step2_nbrs + i, sizeof(int *),
cudaMemcpyDeviceToHost));
CUDA_RUNTIME_CHECK(cudaFree(dev_ptr));
}
CUDA_RUNTIME_CHECK(cudaFree(teami->reduce_recexch.step2_nbrs));
}
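/* Bitmap helpers operating on a byte array of 'size' bytes; used to track pSync slot
 * availability across teams. */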
static inline void nvshmemi_bit_set(unsigned char *ptr, size_t size, size_t index) {
assert(size > 0 && (index < size * CHAR_BIT));
size_t which_byte = index / CHAR_BIT;
ptr[which_byte] |= (1 << (index % CHAR_BIT));
return;
}
static inline void nvshmemi_bit_clear(unsigned char *ptr, size_t size, size_t index) {
assert(size > 0 && (index < size * CHAR_BIT));
size_t which_byte = index / CHAR_BIT;
ptr[which_byte] &= ~(1 << (index % CHAR_BIT));
return;
}
static inline unsigned char nvshmemi_bit_fetch(unsigned char *ptr, size_t index) {
return (ptr[index / CHAR_BIT] >> (index % CHAR_BIT)) & 1;
}
static inline size_t nvshmemi_bit_1st_nonzero(const unsigned char *ptr, const size_t size) {
/* The following ignores endianness: */
for (size_t i = 0; i < size; i++) {
unsigned char bit_val = ptr[i];
for (size_t j = 0; bit_val && j < CHAR_BIT; j++) {
if (bit_val & 1) return i * CHAR_BIT + j;
bit_val >>= 1;
}
}
return (size_t)-1;
}
/* Create a bit string of the format AAAAAAAA.BBBBBBBB into str for the byte
* array passed via ptr. */
static inline void nvshmemi_bit_to_string(char *str, size_t str_size, unsigned char *ptr,
size_t ptr_size) {
size_t off = 0;
for (size_t i = 0; i < ptr_size; i++) {
for (size_t j = 0; j < CHAR_BIT; j++) {
off += snprintf(str + off, str_size - off, "%s",
(ptr[i] & (1 << (CHAR_BIT - 1 - j))) ? "1" : "0");
if (off >= str_size) return;
}
if (i < ptr_size - 1) {
off += snprintf(str + off, str_size - off, ".");
if (off >= str_size) return;
}
}
}
/* Checks whether a PE has a consistent stride given (start, stride, size).
* This function is useful within a loop across PE IDs, and sets 'start',
* 'stride' and 'size' accordingly upon exiting the loop. It also assumes
* 'start' and 'stride' are initialized to a negative number and 'size' to 0.
* If an inconsistent stride is found, returns -1. */
static inline int check_for_linear_stride(int pe, int *start, int *stride, int *size) {
if (*start < 0) {
*start = pe;
(*size)++;
} else if (*stride < 0) {
*stride = pe - *start;
(*size)++;
} else if ((pe - *start) % *stride != 0) {
NVSHMEMI_WARN_PRINT("Detected non-uniform stride inserting PE %d into <%d, %d, %d>\n", pe,
*start, *stride, *size);
return -1;
} else {
(*size)++;
}
return 0;
}
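/* Map a global PE number to its index within the active set described by
 * (PE_start, PE_stride, PE_size); returns -1 if the PE is not a member. */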
static inline int nvshmemi_pe_in_active_set(int global_pe, int PE_start, int PE_stride,
int PE_size) {
int n = (global_pe - PE_start) / PE_stride;
if (global_pe < PE_start || (global_pe - PE_start) % PE_stride || n >= PE_size)
return -1;
else {
return n;
}
}
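/* Translate a PE index in src_team to the corresponding PE index in dest_team,
 * returning -1 if that PE is not a member of dest_team. */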
int nvshmemi_team_translate_pe(nvshmemi_team_t *src_team, int src_pe, nvshmemi_team_t *dest_team) {
int src_pe_world, dest_pe = -1;
if (src_pe > src_team->size) return -1;
src_pe_world = src_team->start + src_pe * src_team->stride;
assert(src_pe_world >= src_team->start && src_pe_world < nvshmemi_state->npes);
dest_pe = nvshmemi_pe_in_active_set(src_pe_world, dest_team->start, dest_team->stride,
dest_team->size);
return dest_pe;
}
static inline size_t get_fcollect_psync_len_per_team() {
size_t fcollect_ll_threshold =
nvshmemi_device_state.gpu_coll_env_params_var.fcollect_ll_threshold;
size_t fcollect_sync_size =
(2 * 2 * nvshmemi_state->npes * fcollect_ll_threshold) / sizeof(long);
assert(fcollect_ll_threshold % sizeof(long) == 0);
return fcollect_sync_size;
}
static inline size_t get_fcollect_ll128_psync_len_per_team() {
size_t fcollect_ll128_threshold =
nvshmemi_device_state.gpu_coll_env_params_var.fcollect_ll128_threshold;
size_t fcollect_ll128_sync_size =
NVSHMEMI_FCOLLECT_LL128_CALC_PSYNC_SIZE(fcollect_ll128_threshold, char);
/* scale for npes and two separate psyncs */
fcollect_ll128_sync_size = fcollect_ll128_sync_size * 2 * nvshmemi_state->npes / sizeof(long);
return fcollect_ll128_sync_size;
}
static inline size_t get_psync_len_per_team() {
size_t fcollect_sync_size = get_fcollect_psync_len_per_team();
size_t fcollect_ll128_sync_size = get_fcollect_ll128_psync_len_per_team();
/* sync: Two buffers are used - one for sync/barrier collective ops, the second one during the
 *       team split operation.
 * reduce: Two pWrk's are used alternately across consecutive reduce calls, to avoid having to
 *       put a barrier in between.
 * bcast: The buffer is split to do multiple consecutive broadcasts; when all buffers are used,
 *       a barrier is called and we begin again from the start of the buffer.
 * fcollect: Two sets of buffers are used to alternate between, the same way as in reduce. The
 *       other factor of 2 is because when using LL, double the space is needed to fuse the flag
 *       with the data. */
size_t ans = (2 * NVSHMEMI_SYNC_SIZE +
nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / sizeof(long) +
NVSHMEMI_BCAST_SYNC_SIZE + fcollect_sync_size + 2 * NVSHMEMI_ALLTOALL_SYNC_SIZE +
fcollect_ll128_sync_size);
return ans;
}
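/* Total symmetric heap bytes needed for team support: per-team pSync arrays, the pSync
 * availability bitmaps, the team-creation return values, per-team sync counters, and
 * (when NCCL is enabled) space to exchange the ncclUniqueId. */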
size_t nvshmemi_get_teams_mem_requirement() {
size_t psync_size = get_psync_len_per_team();
size_t teams_mem_req = sizeof(long) * nvshmemi_max_teams * psync_size + /* psync's */
2 * N_PSYNC_BYTES + /* psync_pool_avail */
2 * sizeof(int) + /* team_ret_val */
2 * sizeof(long) * nvshmemi_max_teams /* storing counters */
#ifdef NVSHMEM_USE_NCCL
+ sizeof(ncclUniqueId)
#endif
;
INFO(NVSHMEM_INIT, "team psync len %zu longs per team, team mem total req %zu bytes, max teams %ld\n",
psync_size, teams_mem_req, nvshmemi_max_teams);
return teams_mem_req;
}
#ifdef NVSHMEM_USE_NCCL
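/* Create an NCCL communicator for the team: PE 0 of the team generates the ncclUniqueId and
 * distributes it to the other team members through the team's REDUCE pSync scratch space
 * (see the note below), after which every member calls ncclCommInitRank. */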
void nvshmemi_team_init_nccl_comm(nvshmemi_team_t *teami) {
ncclUniqueId Id;
int start = teami->start;
int stride = teami->stride;
int size = teami->size;
/* This is technical debt: we are using the REDUCE op psync as a scratchpad for the src/dst of the
 * broadcast. The broadcast's own psync is used for LL8 and other algorithms, making it non-trivial
 * to share when issued from the host as a src or dest buffer.
 *
 * When the reduce collective supports the LL8 algorithm, we need to clean this up as an
 * independent scratch space.
 */
long *pWrk = nvshmemi_team_get_psync(teami, REDUCE);
if (teami->my_pe == 0) {
NCCL_CHECK(nccl_ftable.GetUniqueId(&Id));
CUDA_RUNTIME_CHECK(cudaMemcpy(pWrk, &Id, sizeof(ncclUniqueId), cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
for (int i = 0; i < size; i++) {
nvshmemx_char_put_nbi_on_stream((char *)pWrk, (const char *)pWrk, sizeof(ncclUniqueId),
start + i * stride, (cudaStream_t)0);
}
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
nvshmemi_barrier(teami->team_idx);
} else {
nvshmemi_barrier(teami->team_idx);
CUDA_RUNTIME_CHECK(cudaMemcpy(&Id, pWrk, sizeof(ncclUniqueId), cudaMemcpyDeviceToHost));
}
INFO(NVSHMEM_TEAM, "Calling ncclCommInitRank, teami->size = %d, teami->my_pe = %d", teami->size,
teami->my_pe);
NCCL_CHECK(
nccl_ftable.CommInitRank((ncclComm_t *)&teami->nccl_comm, teami->size, Id, teami->my_pe));
}
#endif /* NVSHMEM_USE_NCCL */
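/* A team is considered P2P connected only if every member PE's heap is directly mapped
 * into this process (i.e., get_local_pe_base() is non-NULL for every member). */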
void nvshmemi_team_set_p2p_connectivity(nvshmemi_team_t *teami) {
teami->are_gpus_p2p_connected = 1;
for (int pe = teami->start; pe < teami->start + teami->stride * teami->size;
pe += teami->stride) {
if (nvshmemi_state->heap_obj->get_local_pe_base()[pe] == NULL) {
teami->are_gpus_p2p_connected = 0;
break;
}
}
}
/* NVLS Resource management for teams */
static void nvshmemi_team_destroy_nvls(nvshmemi_team_t *team) {
if (team->nvls_rsc == nullptr) return; /* NOOP */
nvshmemi_nvls_rsc *nvls_obj = nullptr;
nvls_obj = reinterpret_cast<nvshmemi_nvls_rsc *>(team->nvls_rsc);
if (nvls_obj->get_refcount() == 0) { /* Last reference */
nvshmemi_state->heap_obj->nvls_unmap_heap_memory_by_team(team);
nvshmemi_state->heap_obj->nvls_unbind_heap_memory_by_team(team);
nvls_obj->free_group_mem();
nvls_obj->release_owner();
delete nvls_obj;
cudaFree(team->nvls_rsc_base_ptr);
team->nvls_rsc = nullptr;
INFO(NVSHMEM_TEAM, "NVLS Resource Destroyed for Team ID %d\n", team->team_idx);
} else {
nvls_obj->del_refcount(); /* Shared nvls resource */
/* Ownership of NVLS resource is necessary to allow for newly allocated UC memory to be
* bound and mapped to MC heap */
if (nvls_obj->is_owner(team)) {
// Transfer ownership to one of the dup teams
NVSHMEMU_FOR_EACH_IF(
i, nvshmemi_max_teams,
nvshmemi_team_pool[i] != NULL && nvshmemi_team_support_nvls(nvshmemi_team_pool[i]),
{
// Find first duplicate team that shares the nvls rsc and make it the owner
if (nvshmemi_team_pool[i]->nvls_rsc == team->nvls_rsc) {
nvls_obj->release_owner();
nvls_obj->assign_owner(nvshmemi_team_pool[i]);
break;
}
})
}
}
}
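/* Create an NVLS (NVLINK SHARP) resource for the team: reserve the multicast group address
 * range, make this team the resource owner, and publish the MC base address to the device.
 * Teams that are not NVLS capable get nvls_rsc == nullptr and the function returns 0. */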
static int nvshmemi_team_create_nvls(nvshmemi_team_t *team) {
int status = -1;
uint64_t mc_heap_base;
nvshmemi_nvls_rsc *nvls_obj = nullptr;
if (!nvshmemi_team_is_nvls_capable(team)) {
team->nvls_rsc = nullptr;
WARN("Skipping NVLINK SHARP resource initialized for team ID: %d\n", team->team_idx);
return 0;
}
try {
team->nvls_rsc = reinterpret_cast<void *>(new nvshmemi_nvls_rsc(team, nvshmemi_state));
} catch (nvshmemi_nvls_exception &exp) {
WARN("NVLINK SHARP resource initialization failed for team ID: %d\n", team->team_idx);
team->nvls_rsc = nullptr;
return 0;
}
nvls_obj = reinterpret_cast<nvshmemi_nvls_rsc *>(team->nvls_rsc);
status = nvls_obj->reserve_group_mem();
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"Reserve multicast group mapping failed for pe %d\n", team->my_pe);
nvls_obj->assign_owner(team);
CUDA_RUNTIME_CHECK(cudaMalloc((void **)&team->nvls_rsc_base_ptr, sizeof(void *)));
mc_heap_base = (uint64_t)(nvls_obj->get_mc_base());
CUDA_RUNTIME_CHECK(
cudaMemcpy(team->nvls_rsc_base_ptr, &mc_heap_base, sizeof(void *), cudaMemcpyHostToDevice));
INFO(NVSHMEM_TEAM, "NVLS Resource Created for Team ID %d MC VA Base: %llx\n", team->team_idx,
mc_heap_base);
return (status);
cleanup:
(void)nvls_obj->free_group_mem();
delete nvls_obj;
team->nvls_rsc = nullptr;
return (status);
}
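/* Create multicast groups covering the existing UC heap and bind/map the heap to them.
 * Only the NVLS resource owner performs this; non-owners return 0 immediately. */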
static int nvshmemi_team_bind_nvls(nvshmemi_team_t *team) {
int status = -1;
/* Make a MC handle as large as physical heap size */
nvshmemi_nvls_rsc *nvls_obj = nullptr;
nvls_obj = reinterpret_cast<nvshmemi_nvls_rsc *>(team->nvls_rsc);
if (!nvls_obj->is_owner(team)) return 0;
status = nvshmemi_state->heap_obj->nvls_create_heap_memory_by_team(team);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"Create multicast groups for UC heap failed for pe %d team ID %d\n",
team->my_pe, team->team_idx);
status = nvshmemi_state->heap_obj->nvls_bind_heap_memory_by_team(team);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"Binding multicast groups to UC heap failed for pe %d team ID %d\n",
team->my_pe, team->team_idx);
status = nvshmemi_state->heap_obj->nvls_map_heap_memory_by_team(team);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"Mapping multicast groups for UC heap failed for pe %d team ID %d\n",
team->my_pe, team->team_idx);
cleanup:
return (status);
}
static int nvshmemi_team_nvls_setup(nvshmemi_team_t *team) {
int status = 0;
/* Initialize NVLS resources for team supporting P2P connected GPUs */
status = nvshmemi_team_create_nvls(team);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"NVLS resource initialization failed for team ID: %d\n", team->team_idx);
/* Any prior UC allocations need to be bound to this team's MC groups */
if (team->nvls_rsc != nullptr) {
status = nvshmemi_team_bind_nvls(team);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"NVLS resource bind and mapping existing UC mappings to MC heap "
"failed for team ID: %d\n",
team->team_idx);
}
cleanup:
return (status);
}
/* Team Management Routines */
int nvshmemi_set_max_teams(void) {
nvshmemi_max_teams = nvshmemi_options.MAX_TEAMS;
if (nvshmemi_max_teams < NVSHMEM_TEAMS_MIN) nvshmemi_max_teams = NVSHMEM_TEAMS_MIN;
/* On NVLS sharp enabled platforms increase default max teams to 64 */
if (nvshmemi_state->is_platform_nvls) {
nvshmemi_max_teams = (nvshmemi_max_teams > NVSHMEMI_REDUCE_MAX_CTA_COUNT)
? nvshmemi_max_teams
: NVSHMEMI_REDUCE_MAX_CTA_COUNT;
}
if (nvshmemi_max_teams > N_PSYNC_BYTES * CHAR_BIT) {
NVSHMEMI_ERROR_EXIT("Requested %ld teams, but only %d are supported\n", nvshmemi_max_teams,
N_PSYNC_BYTES * CHAR_BIT);
return 1;
}
return 0;
}
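/* Initialize the predefined teams (WORLD, SHARED, NODE, SAME_MYPE_NODE, SAME_GPU, GPU_LEADERS),
 * allocate the team pool, pSync pool, sync counters and availability bitmaps, and set up the
 * optional NCCL communicators and NVLS resources. Called once during NVSHMEM initialization. */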
int nvshmemi_team_init(void) {
long psync_len;
int start, stride, size;
int *scratch = NULL;
int status = 0;
uint64_t *hostHash = NULL;
uint64_t myHostHash = 0;
nvshmem_transport_pe_info_t *pe_info;
int i;
nvshmemi_team_world = NVSHMEMI_TEAM_INITIALIZER;
nvshmemi_team_shared = NVSHMEMI_TEAM_INITIALIZER;
nvshmemi_team_node = NVSHMEMI_TEAM_INITIALIZER;
nvshmemi_team_same_mype_node = NVSHMEMI_TEAM_INITIALIZER;
nvshmemi_team_same_gpu = NVSHMEMI_TEAM_INITIALIZER;
nvshmemi_team_gpu_leaders = NVSHMEMI_TEAM_INITIALIZER;
/* Initialize NVSHMEM_TEAM_WORLD */
nvshmemi_team_world.team_idx = NVSHMEM_TEAM_WORLD_INDEX;
NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_world, NVSHMEM_TEAM_WORLD_INDEX);
nvshmemi_team_world.start = 0;
nvshmemi_team_world.stride = 1;
nvshmemi_team_world.size = nvshmemi_state->npes;
nvshmemi_team_world.my_pe = nvshmemi_state->mype;
nvshmemi_team_world.rdxn_count = 0;
nvshmemi_team_world.config_mask = 0;
nvshmemi_team_world.ll_flag = 1;
nvshmemi_team_world.alltoall_count = 0;
nvshmemi_team_world.bcast_count = 0;
nvshmemi_team_world.bcast_sync_offset = 0;
nvshmemi_team_world.fcollect_count = 0;
nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_world);
nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_world);
nvshmemi_team_world.is_team_node = false;
nvshmemi_team_world.is_team_same_mype_node = false;
/* Initialize NVSHMEM_TEAM_SHARED */
nvshmemi_team_shared.team_idx = NVSHMEM_TEAM_SHARED_INDEX;
NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_shared, NVSHMEM_TEAM_SHARED_INDEX);
/* Collect list of p2p connected PEs */
int *p2p_pe_list = (int *)malloc(nvshmemi_team_world.size * sizeof(int));
int n_p2p_pes = 0;
int my_idx_in_p2p_list = 0;
for (int i = 0; i < nvshmemi_team_world.size; i++) {
if (nvshmemi_state->heap_obj->get_local_pe_base()[i]) {
if (i == nvshmemi_team_world.my_pe) my_idx_in_p2p_list = n_p2p_pes;
p2p_pe_list[n_p2p_pes++] = i;
}
}
std::ostringstream ss;
for (int i = 0; i < n_p2p_pes; i++) {
ss << p2p_pe_list[i] << " ";
}
INFO(NVSHMEM_INIT, "P2P list: %s", ss.str().c_str());
/* Make sure that n_p2p_pes is same for all PEs to form TEAM_SHARED */
int *n_p2p_pes_all = (int *)malloc(nvshmemi_team_world.size * sizeof(int));
int *p2p_pe_list_all = (int *)malloc(sizeof(int) * n_p2p_pes * nvshmemi_team_world.size);
nvshmemi_boot_handle.allgather((void *)&n_p2p_pes, (void *)n_p2p_pes_all, sizeof(int),
&nvshmemi_boot_handle);
for (i = 0; i < nvshmemi_team_world.size; i++) {
if (n_p2p_pes_all[i] != n_p2p_pes) {
INFO(NVSHMEM_INIT,
"n_p2p_pes is not equal across PEs, setting NVSHMEM_TEAM_SHARED to self");
goto team_shared_single_pe;
}
}
/* Gather p2p lists of all PEs and ensure they are the same */
nvshmemi_boot_handle.allgather((void *)p2p_pe_list, (void *)p2p_pe_list_all,
sizeof(int) * n_p2p_pes, &nvshmemi_boot_handle);
for (i = 0; i < n_p2p_pes; i++) {
if (memcmp((void *)p2p_pe_list, (void *)&p2p_pe_list_all[p2p_pe_list[i] * n_p2p_pes],
sizeof(int) * n_p2p_pes) != 0) {
INFO(NVSHMEM_INIT, "P2P lists are not symmetric, setting NVSHMEM_TEAM_SHARED to self");
goto team_shared_single_pe;
}
}
for (int i = 2; i < n_p2p_pes; i++) {
if (p2p_pe_list[i] - p2p_pe_list[i - 1] != p2p_pe_list[i - 1] - p2p_pe_list[i - 2]) {
INFO(NVSHMEM_INIT,
"P2P list is not of the form (start, stride, size). Cannot form "
"NVSHMEM_TEAM_SHARED.");
goto team_shared_single_pe;
}
}
nvshmemi_team_shared.my_pe = my_idx_in_p2p_list;
nvshmemi_team_shared.start = p2p_pe_list[0];
nvshmemi_team_shared.stride = n_p2p_pes > 1 ? (p2p_pe_list[1] - p2p_pe_list[0]) : 1;
nvshmemi_team_shared.size = n_p2p_pes;
goto team_shared_setup;
team_shared_single_pe:
nvshmemi_team_shared.my_pe = 0;
nvshmemi_team_shared.start = nvshmemi_state->mype;
nvshmemi_team_shared.stride = 1;
nvshmemi_team_shared.size = 1;
nvshmemi_team_shared.is_team_node = true;
nvshmemi_team_shared.is_team_same_mype_node = true;
team_shared_setup:
free(n_p2p_pes_all);
free(p2p_pe_list_all);
free(p2p_pe_list);
nvshmemi_team_shared.rdxn_count = 0;
nvshmemi_team_shared.config_mask = 0;
nvshmemi_team_shared.ll_flag = 1;
nvshmemi_team_shared.alltoall_count = 0;
nvshmemi_team_shared.bcast_count = 0;
nvshmemi_team_shared.bcast_sync_offset = 0;
nvshmemi_team_shared.fcollect_count = 0;
nvshmemi_team_shared.are_gpus_p2p_connected = 0;
nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_shared);
nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_shared);
INFO(NVSHMEM_INIT, "NVSHMEM_TEAM_SHARED: start=%d, stride=%d, size=%d",
nvshmemi_team_shared.start, nvshmemi_team_shared.stride, nvshmemi_team_shared.size);
nvshmemi_team_shared.is_team_same_mype_node = false;
/* Initialize NVSHMEMX_TEAM_NODE */
nvshmemi_team_node.team_idx = NVSHMEM_TEAM_NODE_INDEX;
NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_node, NVSHMEM_TEAM_NODE_INDEX);
nvshmemi_team_world.team_node = nvshmemi_team_node.team_idx;
nvshmemi_team_node.my_pe = nvshmemi_state->mype_node;
nvshmemi_team_node.rdxn_count = 0;
nvshmemi_team_node.config_mask = 0;
nvshmemi_team_node.ll_flag = 1;
nvshmemi_team_node.alltoall_count = 0;
nvshmemi_team_node.bcast_count = 0;
nvshmemi_team_node.bcast_sync_offset = 0;
nvshmemi_team_node.fcollect_count = 0;
myHostHash = getHostHash();
hostHash = (uint64_t *)malloc(sizeof(uint64_t) * nvshmemi_state->npes);
NVSHMEMI_NULL_ERROR_JMP(hostHash, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
"hostHash allocation failed \n");
status = nvshmemi_boot_handle.allgather((void *)&myHostHash, (void *)hostHash, sizeof(uint64_t),
&nvshmemi_boot_handle);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"allgather of host hashes failed\n");
/* Search for on-node peer PEs while checking for a consistent stride */
start = -1;
stride = -1;
size = 0;
for (int pe = 0; pe < nvshmemi_state->npes; pe++) {
if (hostHash[pe] != myHostHash) continue;
int ret = check_for_linear_stride(pe, &start, &stride, &size);
if (ret < 0) {
start = nvshmemi_state->mype;
stride = 1;
size = 1;
break;
}
}
assert(start >= 0 && size > 0);
nvshmemi_team_node.start = start;
nvshmemi_team_node.stride = (stride == -1) ? 1 : stride;
nvshmemi_team_node.size = size;
if (nvshmemi_team_is_identical(&nvshmemi_team_world, &nvshmemi_team_node)) {
nvshmemi_team_world.is_team_node = true;
}
if (nvshmemi_team_is_identical(&nvshmemi_team_shared, &nvshmemi_team_node)) {
nvshmemi_team_shared.is_team_node = true;
}
nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_node);
nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_node);
nvshmemi_team_node.is_team_node = true;
nvshmemi_team_node.is_team_same_mype_node = false;
INFO(NVSHMEM_INIT, "NVSHMEMX_TEAM_NODE: start=%d, stride=%d, size=%d", nvshmemi_team_node.start,
nvshmemi_team_node.stride, nvshmemi_team_node.size);
/* Initialize NVSHMEMX_TEAM_SAME_MYPE_NODE */
nvshmemi_team_same_mype_node.team_idx = NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX;
NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_same_mype_node, NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX);
nvshmemi_team_world.team_same_mype_node = nvshmemi_team_same_mype_node.team_idx;
nvshmemi_team_same_mype_node.my_pe = nvshmemi_state->mype / nvshmemi_state->npes_node;
nvshmemi_team_same_mype_node.rdxn_count = 0;
nvshmemi_team_same_mype_node.config_mask = 0;
nvshmemi_team_same_mype_node.start = nvshmemi_state->mype_node;
nvshmemi_team_same_mype_node.stride = nvshmemi_state->npes_node;
nvshmemi_team_same_mype_node.size = nvshmemi_state->npes / nvshmemi_state->npes_node;
assert(nvshmemi_state->npes % nvshmemi_state->npes_node == 0);
nvshmemi_team_same_mype_node.ll_flag = 1;
nvshmemi_team_same_mype_node.alltoall_count = 0;
nvshmemi_team_same_mype_node.bcast_count = 0;
nvshmemi_team_same_mype_node.bcast_sync_offset = 0;
nvshmemi_team_same_mype_node.fcollect_count = 0;
nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_same_mype_node);
nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_same_mype_node);
nvshmemi_team_same_mype_node.is_team_node = false;
nvshmemi_team_same_mype_node.is_team_same_mype_node = true;
INFO(NVSHMEM_INIT, "NVSHMEMX_TEAM_SAME_MYPE_NODE: start=%d, stride=%d, size=%d",
nvshmemi_team_same_mype_node.start, nvshmemi_team_same_mype_node.stride,
nvshmemi_team_same_mype_node.size);
/* Initialize team NVSHMEMI_TEAM_SAME_GPU */
nvshmemi_team_same_gpu.team_idx = NVSHMEM_TEAM_SAME_GPU_INDEX;
NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_same_gpu, NVSHMEM_TEAM_SAME_GPU_INDEX);
nvshmemi_team_same_gpu.rdxn_count = 0;
nvshmemi_team_same_gpu.ll_flag = 1;
nvshmemi_team_same_gpu.alltoall_count = 0;
nvshmemi_team_same_gpu.bcast_count = 0;
nvshmemi_team_same_gpu.bcast_sync_offset = 0;
nvshmemi_team_same_gpu.fcollect_count = 0;
nvshmemi_team_same_gpu.config_mask = 0;
pe_info = nvshmemi_state->pe_info;
start = -1;
stride = -1;
size = 0;
for (int pe = 0; pe < nvshmemi_state->npes; pe++) {
if (pe_info[pe].hostHash != pe_info[nvshmemi_state->mype].hostHash ||
memcmp(&pe_info[pe].gpu_uuid, &pe_info[nvshmemi_state->mype].gpu_uuid,
sizeof(cudaUUID_t)) != 0)
continue;
int ret = check_for_linear_stride(pe, &start, &stride, &size);
if (ret < 0) {
NVSHMEMI_ERROR_EXIT("Could not form NVSHMEMI_TEAM_SAME_GPU\n");
break;
}
}
assert(start >= 0 && size > 0);
nvshmemi_team_same_gpu.my_pe = (nvshmemi_state->mype - start) / stride;
nvshmemi_team_same_gpu.start = start;
nvshmemi_team_same_gpu.stride = (stride == -1) ? 1 : stride;
nvshmemi_team_same_gpu.size = size;
nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_same_gpu);
nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_same_gpu);
nvshmemi_team_same_gpu.is_team_node = true;
nvshmemi_team_same_gpu.is_team_same_mype_node = false;
INFO(NVSHMEM_INIT, "NVSHMEMI_TEAM_SAME_GPU: start=%d, stride=%d, size=%d",
nvshmemi_team_same_gpu.start, nvshmemi_team_same_gpu.stride, nvshmemi_team_same_gpu.size);
/* All GPUs must have the same number of processes (required for us to form teams) */
/* Initialize team NVSHMEMI_TEAM_GPU_LEADERS */
scratch = (int *)malloc(sizeof(int) * nvshmemi_state->npes);
NVSHMEMI_NULL_ERROR_JMP(scratch, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
"Unable to allocate host memory for team creation.\n");
if (nvshmemi_team_same_gpu.start ==
nvshmemi_state->mype) { /* Only GPU leaders are part of this team */
nvshmemi_team_gpu_leaders.team_idx = NVSHMEM_TEAM_GPU_LEADERS_INDEX;
NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_gpu_leaders, NVSHMEM_TEAM_GPU_LEADERS_INDEX);
nvshmemi_team_gpu_leaders.config_mask = 0;
nvshmemi_team_gpu_leaders.start = 0;
nvshmemi_team_gpu_leaders.stride =
(nvshmemi_team_same_gpu.stride == 1) ? nvshmemi_team_same_gpu.size : 1;
nvshmemi_team_gpu_leaders.size = nvshmemi_state->npes / nvshmemi_team_same_gpu.size;
nvshmemi_team_gpu_leaders.my_pe = (nvshmemi_state->mype - nvshmemi_team_gpu_leaders.start) /
nvshmemi_team_gpu_leaders.stride;
nvshmemi_team_gpu_leaders.rdxn_count = 0;
nvshmemi_team_gpu_leaders.ll_flag = 1;
nvshmemi_team_gpu_leaders.alltoall_count = 0;
nvshmemi_team_gpu_leaders.bcast_count = 0;
nvshmemi_team_gpu_leaders.bcast_sync_offset = 0;
nvshmemi_team_gpu_leaders.fcollect_count = 0;
nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_gpu_leaders);
nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_gpu_leaders);
status =
nvshmemi_boot_handle.allgather((void *)&nvshmemi_team_gpu_leaders.my_pe,
(void *)scratch, sizeof(int), &nvshmemi_boot_handle);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"allgather of gpu leaders failed\n");
/* Check whether a valid TEAM_GPU_LEADERS was formed */
int last_mype = -1;
for (int i = 0; i < nvshmemi_state->npes; i++) {
if (scratch[i] != -1) {
if (scratch[i] != last_mype + 1) {
WARN(
"NVSHMEMI_TEAM_GPU_LEADERS could not be formed, Limited MPG support will "
"not be available\n");
break;
} else {
last_mype++;
}
}
}
/* XXX: Note that we are not setting team_node and team_same_mype_node for
* nvshmemi_team_gpu_leaders */
nvshmemi_team_gpu_leaders.is_team_node = false;
nvshmemi_team_gpu_leaders.is_team_same_mype_node = false;
INFO(NVSHMEM_INIT, "NVSHMEMI_TEAM_GPU_LEADERS: start=%d, stride=%d, size=%d",
nvshmemi_team_gpu_leaders.start, nvshmemi_team_gpu_leaders.stride,
nvshmemi_team_gpu_leaders.size);
} else {
int my_pe = -1;
status = nvshmemi_boot_handle.allgather((void *)&my_pe, (void *)scratch, sizeof(int),
&nvshmemi_boot_handle);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"allgather of gpu leaders failed\n");
}
if (nvshmemi_max_teams < NVSHMEM_TEAMS_MIN) nvshmemi_max_teams = NVSHMEM_TEAMS_MIN;
if (nvshmemi_max_teams > N_PSYNC_BYTES * CHAR_BIT) {
NVSHMEMI_ERROR_EXIT("Requested %ld teams, but only %d are supported\n", nvshmemi_max_teams,
N_PSYNC_BYTES * CHAR_BIT);
goto cleanup;
}
nvshmemi_team_pool = (nvshmemi_team_t **)calloc(nvshmemi_max_teams, sizeof(nvshmemi_team_t *));
NVSHMEMI_NULL_ERROR_JMP(nvshmemi_team_pool, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
"nvshmemi_team_pool allocation failed \n");
CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_pool,
nvshmemi_max_teams * sizeof(nvshmemi_team_t *)));
nvshmemi_device_state.team_pool = nvshmemi_device_team_pool;
for (long i = 0; i < nvshmemi_max_teams; i++) {
nvshmemi_team_pool[i] = NULL;
}
nvshmemi_call_init_array_kernel<nvshmemi_team_t *>(nvshmemi_device_team_pool,
nvshmemi_max_teams, NULL);
nvshmemi_team_pool[NVSHMEM_TEAM_WORLD_INDEX] = &nvshmemi_team_world;
nvshmemi_team_pool[NVSHMEM_TEAM_SHARED_INDEX] = &nvshmemi_team_shared;
nvshmemi_team_pool[NVSHMEM_TEAM_NODE_INDEX] = &nvshmemi_team_node;
nvshmemi_team_pool[NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX] = &nvshmemi_team_same_mype_node;
nvshmemi_team_pool[NVSHMEM_TEAM_SAME_GPU_INDEX] = &nvshmemi_team_same_gpu;
if (nvshmemi_team_same_gpu.start == nvshmemi_state->mype)
nvshmemi_team_pool[NVSHMEM_TEAM_GPU_LEADERS_INDEX] = &nvshmemi_team_gpu_leaders;
/* Allocate pSync pool, each with the maximum possible size requirement */
/* Create two pSyncs per team for back-to-back collectives and one for barriers.
* Array organization:
*
* [ (world) (shared) (team 1) (team 2) ... (world) (shared) (team 1) (team 2) ... ]
* <----------- groups 1 & 2-------------->|<------------- group 3 ---------------->
* <--- (bcast, collect, reduce, etc.) --->|<------ (barriers and syncs) ---------->
* */
psync_len = nvshmemi_max_teams * get_psync_len_per_team();
nvshmemi_psync_pool = (long *)nvshmemi_malloc(sizeof(long) * psync_len);
NVSHMEMI_NULL_ERROR_JMP(nvshmemi_psync_pool, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
"nvshmemi_psync_pool allocation failed \n");
nvshmemi_device_state.psync_pool = nvshmemi_psync_pool;
nvshmemi_call_init_array_kernel<long>(nvshmemi_psync_pool, psync_len, NVSHMEMI_SYNC_VALUE);
nvshmemi_sync_counter = (long *)nvshmemi_malloc(2 * nvshmemi_max_teams * sizeof(long));
NVSHMEMI_NULL_ERROR_JMP(nvshmemi_sync_counter, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
"nvshmemi_sync_counter allocation failed \n");
nvshmemi_device_state.sync_counter = nvshmemi_sync_counter;
nvshmemi_update_device_state();
nvshmemi_call_init_array_kernel<long>(nvshmemi_sync_counter, 2 * nvshmemi_max_teams, 1);
/* Bitmap tracking which pSync slots (team indices) are available: */
psync_pool_avail = (unsigned char *)malloc(2 * N_PSYNC_BYTES);
NVSHMEMI_NULL_ERROR_JMP(psync_pool_avail, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
"psync_pool_avail allocation failed \n");
psync_pool_avail_reduced = &psync_pool_avail[N_PSYNC_BYTES];
device_psync_pool_avail = (unsigned char *)nvshmemi_malloc(2 * N_PSYNC_BYTES);
NVSHMEMI_NULL_ERROR_JMP(device_psync_pool_avail, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
"device_psync_pool_avail allocation failed \n");
device_psync_pool_avail_reduced = &device_psync_pool_avail[N_PSYNC_BYTES];
/* Initialize the psync bits to 1, making all slots available: */
memset(psync_pool_avail, 0, 2 * N_PSYNC_BYTES);
for (size_t i = 0; i < (size_t)nvshmemi_max_teams; i++) {
nvshmemi_bit_set(psync_pool_avail, N_PSYNC_BYTES, i);
}
/* Set the bits for NVSHMEM_TEAM_WORLD, NVSHMEM_TEAM_SHARED, NVSHMEMX_TEAM_NODE to 0: */
nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_WORLD_INDEX);
nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_SHARED_INDEX);
nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_NODE_INDEX);
nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX);
nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_SAME_GPU_INDEX);
nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_GPU_LEADERS_INDEX);
/* Initialize an integer used to agree on an equal return value across PEs in team creation: */
team_ret_val = (int *)malloc(sizeof(int) * 2);
NVSHMEMI_NULL_ERROR_JMP(team_ret_val, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
"team_ret_val allocation failed \n");
team_ret_val_reduced = &team_ret_val[1];
device_team_ret_val = (int *)nvshmemi_malloc(sizeof(int) * 2);
NVSHMEMI_NULL_ERROR_JMP(device_team_ret_val, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
"device_team_ret_val allocation failed \n");
device_team_ret_val_reduced = &device_team_ret_val[1];
nvshmemi_boot_handle.barrier(
&nvshmemi_boot_handle); /* To ensure neccessary setup has been done all PEs */
nvshmemi_team_alloc_device();
nvshmemi_team_update_device();
nvshmemi_boot_handle.barrier(
&nvshmemi_boot_handle); /* To ensure neccessary setup has been done all PEs */
#ifdef NVSHMEM_USE_NCCL
if (nvshmemi_use_nccl) {
/* Setup NCCL usage */
nvshmemi_team_init_nccl_comm(&nvshmemi_team_world);
nvshmemi_team_init_nccl_comm(&nvshmemi_team_shared);
nvshmemi_team_init_nccl_comm(&nvshmemi_team_node);
nvshmemi_team_init_nccl_comm(&nvshmemi_team_same_mype_node);
nvshmemi_team_init_nccl_comm(&nvshmemi_team_same_gpu);
if (nvshmemi_pe_in_active_set(nvshmemi_state->mype, nvshmemi_team_gpu_leaders.start,
nvshmemi_team_gpu_leaders.stride,
nvshmemi_team_gpu_leaders.size) >= 0) {
nvshmemi_team_init_nccl_comm(&nvshmemi_team_gpu_leaders);
}
}
#endif /* NVSHMEM_USE_NCCL */
/* Setup NVLS resources for all internal p2p connected teams */
NVSHMEMU_FOR_EACH_IF(
i, nvshmemi_max_teams,
nvshmemi_team_pool[i] != NULL && nvshmemi_team_pool[i]->are_gpus_p2p_connected, {
status = nvshmemi_team_nvls_setup(nvshmemi_team_pool[i]);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
"NVLS resource setup failed for team ID: %d\n",
nvshmemi_team_pool[i]->team_idx);
if (nvshmemi_team_pool[i]->nvls_rsc) {
INFO(NVSHMEM_TEAM, "Successful NVLS resource setup for team ID: %d\n",
nvshmemi_team_pool[i]->team_idx);
}
})
nvshmemi_boot_handle.barrier(
&nvshmemi_boot_handle); /* To ensure neccessary setup has been done all PEs */
nvshmemi_team_update_device();
nvshmemi_boot_handle.barrier(
&nvshmemi_boot_handle); /* To ensure neccessary setup has been done all PEs */
#if defined(NVSHMEM_PPC64LE)
if (nvshmemi_use_nccl) {
/* Set GPU thread stack size to be max stack size of any kernel invoked by NCCL.
The value 1256 has been obtained by profiling all NCCL kernels in NCCL 2.8.3-1.
This value is being set to prevent any memory reconfiguration during the application run,
as that can lead to a potential deadlock */
if (nvshmemi_options.CUDA_LIMIT_STACK_SIZE_provided) {
CUDA_RUNTIME_CHECK(
cudaDeviceSetLimit(cudaLimitStackSize, nvshmemi_options.CUDA_LIMIT_STACK_SIZE));
if (nvshmemi_options.CUDA_LIMIT_STACK_SIZE < 1256)
NVSHMEMI_WARN_PRINT(
"CUDA stack size limit has been set to less than 1256.\n"
"This can lead to hangs because a NCCL kernel can need up\n"
"to 1256 bytes");
} else
CUDA_RUNTIME_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, 1256));
} else if (nvshmemi_options.CUDA_LIMIT_STACK_SIZE_provided) {
CUDA_RUNTIME_CHECK(
cudaDeviceSetLimit(cudaLimitStackSize, nvshmemi_options.CUDA_LIMIT_STACK_SIZE));
}
#endif
cleanup:
if (scratch) {
free(scratch);
}
if (hostHash) {
free(hostHash);
}
if (status != NVSHMEMX_SUCCESS) {
if (nvshmemi_team_pool) {
free(nvshmemi_team_pool);
nvshmemi_team_pool = NULL;
cudaFree(nvshmemi_device_team_pool);
nvshmemi_device_team_pool = NULL;
}
if (nvshmemi_psync_pool) {
nvshmemi_free(nvshmemi_psync_pool);
nvshmemi_psync_pool = NULL;
}
if (psync_pool_avail) {
free(psync_pool_avail);
psync_pool_avail = NULL;
}
if (device_psync_pool_avail) {
nvshmemi_free(device_psync_pool_avail);
device_psync_pool_avail = NULL;
}
if (team_ret_val) {
free(team_ret_val);
team_ret_val = NULL;
}
if (device_team_ret_val) {
nvshmemi_free(device_team_ret_val);
device_team_ret_val = NULL;
}
}
return status;
}
int nvshmemi_team_finalize(void) {
/* Destroy all undestroyed teams */
for (long i = 0; i < nvshmemi_max_teams; i++) {
if (nvshmemi_team_pool[i] != NULL) nvshmemi_team_destroy(nvshmemi_team_pool[i]);
}
free(nvshmemi_team_pool);
nvshmemi_team_pool = NULL;
CUDA_RUNTIME_CHECK(cudaFree(nvshmemi_device_team_pool));
nvshmemi_free(nvshmemi_psync_pool);
nvshmemi_free(nvshmemi_sync_counter);
free(psync_pool_avail);
nvshmemi_free(device_psync_pool_avail);
free(team_ret_val);
nvshmemi_free(device_team_ret_val);
cudaFree(nvshmemi_device_team_world);
cudaFree(nvshmemi_device_team_shared);
cudaFree(nvshmemi_device_team_node);
cudaFree(nvshmemi_device_team_same_mype_node);
cudaFree(nvshmemi_device_team_same_gpu);
cudaFree(nvshmemi_device_team_gpu_leaders);
return 0;
}
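/* Collective over the parent team. Creates a new team from the (PE_start, PE_stride, PE_size)
 * triplet relative to the parent, reserving a free pSync slot via an AND-reduction of the
 * per-PE availability bitmaps. Returns nonzero on all PEs if any PE fails to create the team. */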
int nvshmemi_team_split_strided(nvshmemi_team_t *parent_team, int PE_start, int PE_stride,
int PE_size, const nvshmem_team_config_t *config, long config_mask,
nvshmem_team_t *new_team) {
*new_team = NVSHMEM_TEAM_INVALID;
nvshmem_barrier(parent_team->team_idx);
int global_PE_start = nvshmemi_team_pe(parent_team, PE_start);
int global_PE_stride = parent_team->stride * PE_stride;
int global_PE_end = global_PE_start + global_PE_stride * (PE_size - 1);
if (PE_start < 0 || PE_start >= parent_team->size || PE_size <= 0 ||
PE_size > parent_team->size || PE_stride < 1) {
NVSHMEMI_WARN_PRINT(
"Invalid <start, stride, size>: child <%d, %d, %d>, parent <%d, %d, %d>\n", PE_start,
PE_stride, PE_size, parent_team->start, parent_team->stride, parent_team->size);
return -1;
}
if (global_PE_start >= nvshmemi_state->npes || global_PE_end >= nvshmemi_state->npes) {
NVSHMEMI_WARN_PRINT("Starting PE (%d) or ending PE (%d) is invalid\n", global_PE_start,
global_PE_end);
return -1;
}
/* idx in new team: */
int my_pe =
nvshmemi_pe_in_active_set(nvshmemi_state->mype, global_PE_start, global_PE_stride, PE_size);
long *psync_reduce = nvshmemi_team_get_psync(parent_team, REDUCE);
long *psync = &nvshmemi_team_get_psync(parent_team, SYNC)[NVSHMEMI_SYNC_SIZE];
long *sync_counter = &nvshmemi_team_get_sync_counter(parent_team)[1];
nvshmemi_team_t *myteam = NULL;
*team_ret_val = 0;
*team_ret_val_reduced = 0;
if (my_pe >= 0) {
char bit_str[NVSHMEMI_DIAG_STRLEN];
myteam = (nvshmemi_team_t *)calloc(1, sizeof(nvshmemi_team_t));
(*myteam) = NVSHMEMI_TEAM_INITIALIZER;
myteam->my_pe = my_pe;
myteam->start = global_PE_start;
myteam->stride = global_PE_stride;
myteam->size = PE_size;
myteam->rdxn_count = 0;
myteam->ll_flag = 1;
myteam->alltoall_count = 0;
myteam->bcast_count = 0;
myteam->bcast_sync_offset = 0;
myteam->fcollect_count = 0;
if (config) {
myteam->config = *config;
myteam->config_mask = config_mask;
}
myteam->team_idx = -1;
nvshmemi_bit_to_string(bit_str, NVSHMEMI_DIAG_STRLEN, psync_pool_avail, N_PSYNC_BYTES);
CUDA_RUNTIME_CHECK(cudaMemcpy(device_psync_pool_avail, psync_pool_avail, N_PSYNC_BYTES,
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
nvshmemi_call_reduce_kernel<unsigned char, RDXN_OPS_AND>(
myteam->start, myteam->stride, myteam->size,
(unsigned char *)device_psync_pool_avail_reduced,
(const unsigned char *)device_psync_pool_avail, N_PSYNC_BYTES,
(unsigned char *)psync_reduce, (long *)(psync), sync_counter);
CUDA_RUNTIME_CHECK(cudaMemcpy(psync_pool_avail_reduced, device_psync_pool_avail_reduced,
N_PSYNC_BYTES, cudaMemcpyDeviceToHost));
/* We cannot release the psync here, because this reduction may not
* have been performed on the entire parent team. */
nvshmemi_bit_to_string(bit_str, NVSHMEMI_DIAG_STRLEN, psync_pool_avail_reduced,
N_PSYNC_BYTES);
/* Select the least significant nonzero bit, which corresponds to an available pSync. */
myteam->team_idx = nvshmemi_bit_1st_nonzero(psync_pool_avail_reduced, N_PSYNC_BYTES);
NVSHMEMI_TEAM_DUP_INITIALIZER(*myteam, myteam->team_idx);
nvshmemi_bit_to_string(bit_str, NVSHMEMI_DIAG_STRLEN, psync_pool_avail_reduced,
N_PSYNC_BYTES);
if (myteam->team_idx == -1 || myteam->team_idx >= (int)nvshmemi_max_teams) {
NVSHMEMI_WARN_PRINT(
"No more teams available (max = %ld), try setting NVSHMEM_MAX_TEAMS environment "
"variable\n",
nvshmemi_max_teams);
/* No psync was available, but we must call a barrier across the parent team before returning. */
myteam->team_idx = -1;
*team_ret_val = 1;
} else {
/* Set the selected psync bit to 0, reserving that slot */
nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, myteam->team_idx);
*new_team = myteam->team_idx;
nvshmemi_team_pool[myteam->team_idx] = myteam;
nvshmemi_team_t *device_team_addr;
CUDA_RUNTIME_CHECK(cudaMalloc((void **)&device_team_addr, sizeof(nvshmemi_team_t)));
nvshmemi_team_set_p2p_connectivity(myteam);
nvshmemi_recexchalgo_get_neighbors(myteam);
CUDA_RUNTIME_CHECK(cudaMemcpy(device_team_addr, myteam, sizeof(nvshmemi_team_t),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[myteam->team_idx],
&device_team_addr, sizeof(nvshmemi_team_t *),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
#ifdef NVSHMEM_USE_NCCL
if (nvshmemi_use_nccl) nvshmemi_team_init_nccl_comm(myteam);
#endif
/*
 * Reuse NVLS resources if the teams are identical,
 * else create new NVLS resources for p2p connected teams
 */
if (nvshmemi_team_is_identical(myteam, parent_team) &&
nvshmemi_team_is_nvls_capable(myteam) && !nvshmemi_options.DISABLE_NVLS_SHARING) {
myteam->nvls_rsc = parent_team->nvls_rsc; /* Inherit the parent team's nvls_rsc */
if (myteam->nvls_rsc) {
nvshmemi_nvls_rsc *nvls =
reinterpret_cast<nvshmemi_nvls_rsc *>(myteam->nvls_rsc);
nvls->add_refcount();
INFO(NVSHMEM_TEAM,
"Successful NVLS resource sharing for new team ID: %d (parent ID: %d)\n",
myteam->team_idx, parent_team->team_idx);
}
} else if (nvshmemi_team_is_nvls_capable(myteam)) {
if (nvshmemi_team_nvls_setup(myteam) != 0) {
NVSHMEMI_WARN_PRINT("NVLS resource setup failed for team ID: %d\n",
myteam->team_idx);
return -1;
}
INFO(NVSHMEM_TEAM, "Successful NVLS resource setup for team ID: %d\n",
myteam->team_idx);
} else {
myteam->nvls_rsc = nullptr; /* NVLS not supported, so no resource created/bound */
}
/* Build team_node */
myteam->is_team_node = false;
int i;
for (i = 1; i < myteam->size; i++) {
if (nvshmemi_host_hashes[myteam->start] !=
nvshmemi_host_hashes[myteam->start + i * myteam->stride]) {
break;
}
}
if (i == myteam->size) myteam->is_team_node = true;
myteam->is_team_same_mype_node = true;
for (int i = 0; i < myteam->size; i++) {
for (int j = i + 1; j < myteam->size; j++) {
if (nvshmemi_host_hashes[myteam->start + i * myteam->stride] ==
nvshmemi_host_hashes[myteam->start + j * myteam->stride]) {
myteam->is_team_same_mype_node = false;
}
}
}
/* count PEs on the same node */
int team_npes_node = 0;
for (int i = 0; i < myteam->size; i++) {
if (nvshmemi_team_translate_pe(myteam, i, &nvshmemi_team_node) != -1) {
team_npes_node++;
}
}
if (!myteam->is_team_node && !myteam->is_team_same_mype_node) {
/* Now I am just going to repurpose device_psync_pool_avail symm memory for the
purpose of finding max of team_npes_node */
assert(sizeof(int) <= N_PSYNC_BYTES);
CUDA_RUNTIME_CHECK(cudaMemcpy(device_psync_pool_avail, &team_npes_node, sizeof(int),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
nvshmemi_call_reduce_kernel<int, RDXN_OPS_MAX>(
myteam->start, myteam->stride, myteam->size,
(int *)device_psync_pool_avail_reduced, (const int *)device_psync_pool_avail, 1,
(int *)psync_reduce, (long *)(psync), sync_counter);
CUDA_RUNTIME_CHECK(cudaMemcpy(&team_npes_node, device_psync_pool_avail_reduced,
sizeof(int), cudaMemcpyDeviceToHost));
nvshmemi_team_split_2d(myteam, team_npes_node, NULL, 0, &myteam->team_node, NULL, 0,
&myteam->team_same_mype_node);
}
CUDA_RUNTIME_CHECK(cudaMemcpy(device_team_addr, myteam, sizeof(nvshmemi_team_t),
cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
}
nvshmemi_call_init_array_kernel<long>(sync_counter, 1, 1);
nvshmemi_call_init_array_kernel<long>(psync, NVSHMEMI_SYNC_SIZE, NVSHMEMI_SYNC_VALUE);
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
}
/* This barrier on the parent team eliminates problematic race conditions
* during psync allocation between back-to-back team creations. */
nvshmem_quiet();
// nvshmem_barrier(parent_team->start, parent_team->stride, parent_team->size, psync);
nvshmem_team_sync(parent_team->team_idx);
/* This OR reduction assures all PEs return the same value. */
CUDA_RUNTIME_CHECK(
cudaMemcpy(device_team_ret_val, team_ret_val, sizeof(int), cudaMemcpyHostToDevice));
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
nvshmemi_call_rdxn_on_stream_kernel<int, RDXN_OPS_MAX>(
parent_team->team_idx, device_team_ret_val_reduced, device_team_ret_val, 1,
nvshmemi_state->my_stream);
CUDA_RUNTIME_CHECK(cudaStreamSynchronize(nvshmemi_state->my_stream));
CUDA_RUNTIME_CHECK(cudaMemcpy(team_ret_val_reduced, device_team_ret_val_reduced, sizeof(int),
cudaMemcpyDeviceToHost));
/* If no team was available, print some team triplet info and return nonzero. */
if (myteam != NULL && myteam->team_idx == -1) {
NVSHMEMI_WARN_PRINT("Team split strided failed: child <%d, %d, %d>, parent <%d, %d, %d>\n",
global_PE_start, global_PE_stride, PE_size, parent_team->start,
parent_team->stride, parent_team->size);
/* TODO: In the event one of the PEs fails to create the team, do we need to revert the team
* on all of the other ones? */
free(myteam);
}
return *team_ret_val_reduced;
}
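/* Split the parent team into a 2D grid: row (x-axis) teams of up to xrange PEs with stride 1,
 * and column (y-axis) teams with stride xrange. Each PE receives the handles of the x and y
 * teams it belongs to. */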
int nvshmemi_team_split_2d(nvshmemi_team_t *parent_team, int xrange,
const nvshmem_team_config_t *xaxis_config, long xaxis_mask,
nvshmem_team_t *xaxis_team, const nvshmem_team_config_t *yaxis_config,
long yaxis_mask, nvshmem_team_t *yaxis_team) {
*xaxis_team = NVSHMEM_TEAM_INVALID;
*yaxis_team = NVSHMEM_TEAM_INVALID;
if (xrange > parent_team->size) {
xrange = parent_team->size;
}
const int parent_size = parent_team->size;
const int num_xteams = ceil(parent_size / (float)xrange);
const int num_yteams = xrange;
int start = 0;
int ret = 0;
for (int i = 0; i < num_xteams; i++) {
nvshmem_team_t my_xteam;
int xsize = (i == num_xteams - 1 && parent_size % xrange) ? parent_size % xrange : xrange;
ret = nvshmemi_team_split_strided(parent_team, start, 1, xsize, xaxis_config, xaxis_mask,
&my_xteam);
if (ret) {
NVSHMEMI_ERROR_PRINT("Creation of x-axis team %d of %d failed\n", i + 1, num_xteams);
}
start += xrange;
if (my_xteam != NVSHMEM_TEAM_INVALID) {
assert(*xaxis_team == NVSHMEM_TEAM_INVALID);
*xaxis_team = my_xteam;
}
}
start = 0;
for (int i = 0; i < num_yteams; i++) {
nvshmem_team_t my_yteam;
int remainder = parent_size % xrange;
int yrange = parent_size / xrange;
int ysize = (remainder && i < remainder) ? yrange + 1 : yrange;
ret = nvshmemi_team_split_strided(parent_team, start, xrange, ysize, yaxis_config,
yaxis_mask, &my_yteam);
if (ret) {
NVSHMEMI_ERROR_PRINT("Creation of y-axis team %d of %d failed\n", i + 1, num_yteams);
}
start += 1;
if (my_yteam != NVSHMEM_TEAM_INVALID) {
assert(*yaxis_team == NVSHMEM_TEAM_INVALID);
*yaxis_team = my_yteam;
}
}
nvshmem_quiet();
nvshmem_team_sync(parent_team->team_idx);
return 0;
}
static bool inline nvshmemi_is_rsvd_teams(nvshmem_team_t team_idx) {
/* These team resources shouldn't be deleted as they are used by collective APIs during
 * init/finalize */
return ((team_idx == NVSHMEM_TEAM_INVALID) ||
(team_idx >= NVSHMEM_TEAM_WORLD_INDEX && team_idx < NVSHMEM_TEAMS_MIN));
}
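/* Destroy a team: recursively destroy its derived node / same-mype-node sub-teams, return the
 * team's pSync slot to the availability bitmap, reset its pSync region and sync counters, release
 * any NVLS resources and the NCCL communicator, and free the host and device team structures. */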
void nvshmemi_team_destroy(nvshmemi_team_t *team) {
int idx = team->team_idx;
if (nvshmemi_bit_fetch(psync_pool_avail, idx)) {
NVSHMEMI_ERROR_PRINT("Destroying a team without an active pSync\n");
}
/* Since it is a collective routine, perform a barrier */
// nvshmem_barrier(idx);
if (!team->is_team_node) {
if (!nvshmemi_is_rsvd_teams(team->team_node) &&
nvshmemi_team_pool[team->team_node] != NULL) {
INFO(NVSHMEM_COLL, "Destroy sub-team 1 [%p] at index[%d] for parent-team [%p]",
nvshmemi_team_pool[team->team_node], team->team_node, team);
nvshmemi_team_destroy(nvshmemi_team_pool[team->team_node]);
}
}
if (!team->is_team_same_mype_node) {
if (!nvshmemi_is_rsvd_teams(team->team_same_mype_node) &&
nvshmemi_team_pool[team->team_same_mype_node] != NULL) {
INFO(NVSHMEM_COLL, "Destroy sub-team 2 [%p] at index[%d] for parent-team [%p]",
nvshmemi_team_pool[team->team_same_mype_node], team->team_same_mype_node, team);
nvshmemi_team_destroy(nvshmemi_team_pool[team->team_same_mype_node]);
}
}
nvshmemi_bit_set(psync_pool_avail, N_PSYNC_BYTES, idx);
nvshmemi_team_pool[idx] = NULL;
CUDA_RUNTIME_CHECK(cudaMemset(&nvshmemi_device_team_pool[idx], 0, sizeof(nvshmemi_team_t *)));
nvshmemi_call_init_array_kernel<long>(&nvshmemi_sync_counter[2 * idx], 2, 1);
nvshmemi_call_init_array_kernel<long>(&nvshmemi_psync_pool[idx * get_psync_len_per_team()],
get_psync_len_per_team(), NVSHMEMI_SYNC_VALUE);
CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
nvshmemi_team_destroy_nvls(team);
nvshmemi_recexchalgo_free_mem(team);
#ifdef NVSHMEM_USE_NCCL
if (nvshmemi_use_nccl) NCCL_CHECK(nccl_ftable.CommDestroy((ncclComm_t)team->nccl_comm));
#endif
if (team != &nvshmemi_team_world && team != &nvshmemi_team_shared &&
team != &nvshmemi_team_node && team != &nvshmemi_team_same_mype_node &&
team != &nvshmemi_team_same_gpu && team != &nvshmemi_team_gpu_leaders) {
free(team);
}
nvshmemi_team_t *device_team_addr;
CUDA_RUNTIME_CHECK(cudaMemcpy((void **)&device_team_addr, &nvshmemi_device_team_pool[idx],
sizeof(nvshmemi_team_t *), cudaMemcpyDeviceToHost));
CUDA_RUNTIME_CHECK(cudaFree(device_team_addr));
}
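/* Return a pointer into the team's pSync region for the requested collective operation, following
 * the layout described in get_psync_len_per_team():
 * [sync (2x) | reduce scratch | bcast | fcollect | alltoall (2x) | fcollect_ll128]. */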
long *nvshmemi_team_get_psync(nvshmemi_team_t *team, nvshmemi_team_op_t op) {
long *team_psync;
size_t psync_fcollect_len;
psync_fcollect_len = get_fcollect_psync_len_per_team();
team_psync = &nvshmemi_psync_pool[team->team_idx * get_psync_len_per_team()];
switch (op) {
case SYNC:
return team_psync;
case REDUCE:
return &team_psync
[2 * NVSHMEMI_SYNC_SIZE +
(((nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / 2) /
sizeof(long)) *
(team->rdxn_count % 2))];
case BCAST:
return &team_psync[2 * NVSHMEMI_SYNC_SIZE +
nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size /
sizeof(long)];
case FCOLLECT:
return &team_psync[2 * NVSHMEMI_SYNC_SIZE +
nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size /
sizeof(long) +
NVSHMEMI_BCAST_SYNC_SIZE];
case ALLTOALL:
return &team_psync[2 * NVSHMEMI_SYNC_SIZE +
nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size /
sizeof(long) +
NVSHMEMI_BCAST_SYNC_SIZE + psync_fcollect_len +
(NVSHMEMI_ALLTOALL_SYNC_SIZE * (team->alltoall_count % 2))];
case FCOLLECT_128:
return &team_psync[2 * NVSHMEMI_SYNC_SIZE +
nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size /
sizeof(long) +
NVSHMEMI_BCAST_SYNC_SIZE + psync_fcollect_len +
2 * NVSHMEMI_ALLTOALL_SYNC_SIZE];
default:
WARN("Incorrect argument to nvshmemi_team_get_psync\n");
return NULL;
}
}
long *nvshmemi_team_get_sync_counter(nvshmemi_team_t *team) {
return &nvshmemi_sync_counter[2 * team->team_idx];
}