/*
 * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * See COPYRIGHT for license information
 */

#include "team_internal.h"
#include <assert.h>        // for assert
#include <cuda_runtime.h>  // for cudaMemcpy
#include <driver_types.h>  // for cudaMemcpyHos...
#include <limits.h>        // for CHAR_BIT
#include <stdint.h>        // for uint64_t
#include <stdio.h>         // for snprintf, printf
#include <stdlib.h>        // for free, malloc
#include <string.h>        // for memset, memcmp
#include <cmath>           // for ceil
#include <sstream>         // for std::ostringstream (used in nvshmemi_team_init)
#include "../coll/rdxn/rdxn.h"             // for nvshmemi_call...
#include "device_host/nvshmem_types.h"     // for nvshmemi_team_t
#include "cpu_coll.h"                      // for nccl_ftable
#include "device_host/nvshmem_common.cuh"  // for nvshmemi_pe_i...
#include "bootstrap_host_transport/env_defs_internal.h"  // for nvshmemi_opti...
#include "host/nvshmem_api.h"        // for nvshmem_quiet
#include "host/nvshmem_coll_api.h"   // for nvshmem_team_...
#include "host/nvshmemx_api.h"       // for nvshmemx_char...
#include "non_abi/nvshmemx_error.h"  // for NVSHMEMI_NULL...
#include "internal/host/debug.h"     // for INFO, NVSHMEM...
#include "internal/host/nvshmem_internal.h"           // for nvshmemi_free
#include "internal/host/nvshmemi_coll.h"              // for nvshmemi_barrier
#include "internal/host/nvshmemi_symmetric_heap.hpp"  // for nvshmemi_symm...
#include "internal/host/nvshmemi_team.h"   // for N_PSYNC_BYTES
#include "internal/host/nvshmemi_types.h"  // for nvshmemi_state
#include "internal/host/util.h"            // for CUDA_RUNTIME_...
#include "internal/bootstrap_host_transport/nvshmemi_bootstrap_defines.h"  // for nvshmemi_boot...
#include "internal/host_transport/transport.h"  // for nvshmem_trans...
#include "non_abi/nvshmem_build_options.h"      // for NVSHMEM_USE_NCCL
#include "internal/host/nvshmemi_nvls_rsc.hpp"  // for nvshmemi_nvls...
#ifdef NVSHMEM_USE_NCCL
#include "nccl.h"  // for ncclUniqueId
#endif
#include "internal/host_transport/nvshmemi_transport_defines.h"  // for NVSHMEM_MEM_H...

using namespace nvls;

#define NVSHMEMI_DIAG_STRLEN 1024
#define NVSHMEMI_SYNC_VALUE 0
#define NVSHMEMI_REDUCE_MAX_CTA_COUNT 64

/* The 0th entry in the team duplicate resources is the same team as the encapsulating team.
 * This allows reuse of the same business logic for nCTA == 1 and nCTA > 1 and minimizes
 * if/else branching.
 */
#define NVSHMEMI_TEAM_DUP_INITIALIZER(teami, team_idx) \
    (teami).team_dups[0] = (team_idx);                 \
    for (int i = 1; i < 128; i++) {                    \
        (teami).team_dups[i] = NVSHMEM_TEAM_INVALID;   \
    }
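/* Note: the literal 128 above is assumed to match the capacity of the team_dups array in
 * nvshmemi_team_t; if that capacity changes, this loop bound must change with it.
 * Illustrative expansion (hypothetical team index 7):
 *     teami.team_dups[0] = 7;                         // slot 0 aliases the team itself
 *     teami.team_dups[1..127] = NVSHMEM_TEAM_INVALID; // remaining duplicate slots unused
 */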

long nvshmemi_max_teams;

nvshmemi_team_t nvshmemi_team_world;
nvshmemi_team_t nvshmemi_team_shared;
nvshmemi_team_t nvshmemi_team_node;
nvshmemi_team_t nvshmemi_team_same_mype_node;
nvshmemi_team_t nvshmemi_team_same_gpu;
nvshmemi_team_t nvshmemi_team_gpu_leaders;

nvshmemi_team_t *nvshmemi_device_team_world, *nvshmemi_device_team_shared,
    *nvshmemi_device_team_node, *nvshmemi_device_team_same_mype_node,
    *nvshmemi_device_team_same_gpu, *nvshmemi_device_team_gpu_leaders;

nvshmemi_team_t **nvshmemi_team_pool;
long *nvshmemi_psync_pool;
long *nvshmemi_sync_counter;

nvshmemi_team_t **nvshmemi_device_team_pool;

static unsigned char *psync_pool_avail;
static unsigned char *psync_pool_avail_reduced;
static unsigned char *device_psync_pool_avail;
static unsigned char *device_psync_pool_avail_reduced;

static int *team_ret_val;
static int *team_ret_val_reduced;
static int *device_team_ret_val;
static int *device_team_ret_val_reduced;

bool nvshmemi_team_support_nvls(nvshmemi_team_t *team) {
    return ((team->are_gpus_p2p_connected) && (team->nvls_rsc != nullptr));
}

bool nvshmemi_team_is_owner_nvls(nvshmemi_team_t *team) {
    nvshmemi_nvls_rsc *nvls = reinterpret_cast<nvshmemi_nvls_rsc *>(team->nvls_rsc);
    INFO(NVSHMEM_TEAM, "Team ID: %d NVLS Resource Owner ID: %d\n", team->team_idx,
         nvls->get_owner());
    return (nvls->is_owner(team));
}

static bool nvshmemi_team_is_nvls_capable(nvshmemi_team_t *team) {
    return (team->are_gpus_p2p_connected && team->size >= 2 && nvshmemi_state->is_platform_nvls);
}

static bool nvshmemi_team_is_identical(nvshmemi_team_t *t1, nvshmemi_team_t *t2) {
    return (t1->start == t2->start && t1->stride == t2->stride && t1->size == t2->size);
}

static void nvshmemi_team_alloc_device(void) {
    CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_world, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_shared, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_node, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(
        cudaMalloc((void **)&nvshmemi_device_team_same_mype_node, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(
        cudaMalloc((void **)&nvshmemi_device_team_same_gpu, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(
        cudaMalloc((void **)&nvshmemi_device_team_gpu_leaders, sizeof(nvshmemi_team_t)));
}

static void nvshmemi_team_update_device(void) {
    CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_world, &nvshmemi_team_world,
                                  sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_WORLD_INDEX],
                                  &nvshmemi_device_team_world, sizeof(nvshmemi_team_t *),
                                  cudaMemcpyHostToDevice));

    CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_shared, &nvshmemi_team_shared,
                                  sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_SHARED_INDEX],
                                  &nvshmemi_device_team_shared, sizeof(nvshmemi_team_t *),
                                  cudaMemcpyHostToDevice));

    CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_node, &nvshmemi_team_node,
                                  sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_NODE_INDEX],
                                  &nvshmemi_device_team_node, sizeof(nvshmemi_team_t *),
                                  cudaMemcpyHostToDevice));

    CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_same_mype_node,
                                  &nvshmemi_team_same_mype_node, sizeof(nvshmemi_team_t),
                                  cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX],
                                  &nvshmemi_device_team_same_mype_node, sizeof(nvshmemi_team_t *),
                                  cudaMemcpyHostToDevice));

    CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_same_gpu, &nvshmemi_team_same_gpu,
                                  sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_SAME_GPU_INDEX],
                                  &nvshmemi_device_team_same_gpu, sizeof(nvshmemi_team_t *),
                                  cudaMemcpyHostToDevice));

    CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_gpu_leaders, &nvshmemi_team_gpu_leaders,
                                  sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_GPU_LEADERS_INDEX],
                                  &nvshmemi_device_team_gpu_leaders, sizeof(nvshmemi_team_t *),
                                  cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
}

static void nvshmemi_recexchalgo_get_neighbors(nvshmemi_team_t *teami) {
    int i, j, k;
    int p_of_k = 1, log_p_of_k = 0, rem, T, newpe;
    int step1_sendto, step1_nrecvs, step2_nphases;
    int *step1_recvfrom, **step2_nbrs;
    int *step1_recvfrom_device, **step2_nbrs_device;

    int my_pe = teami->my_pe;
    int num_pes = teami->size;
    INFO(NVSHMEM_COLL, "step 1 nbr calculation started, num_pes = %d", num_pes);

    k = nvshmemi_options.REDUCE_RECEXCH_KVAL;
    assert(k > 1);

    if (num_pes < k) /* If the size of the active set is less than k, reduce the value of k */
        k = (num_pes > 2) ? num_pes : 2;

    /* Calculate p_of_k, the largest power of k that is not larger than num_pes */
    while (p_of_k <= num_pes) {
        p_of_k *= k;
        log_p_of_k++;
    }
    p_of_k /= k;
    /* protect against underflow warnings when asserts are disabled. */
    if (log_p_of_k > 0) {
        log_p_of_k--;
    }

    step2_nphases = log_p_of_k;
    step1_recvfrom = (int *)malloc(sizeof(int) * (k - 1));
    assert(step1_recvfrom);
    step2_nbrs = (int **)malloc(sizeof(int *) * step2_nphases);
    assert(step2_nbrs);

    for (int i = 0; i < step2_nphases; i++) {
        step2_nbrs[i] = (int *)malloc(sizeof(int) * (k - 1));
        assert(step2_nbrs[i]);
    }

    rem = num_pes - p_of_k;
    /* rem is the number of PEs that do not participate in Step 2.
     * We need to identify these non-participating PEs. This is done in the following way.
     * The first T PEs are divided into sets of k consecutive PEs each.
     * In each of these sets, the first k-1 PEs are the non-participating
     * PEs while the last PE is the participating PE.
     * The non-participating PEs send their data to the participating PE
     * in their corresponding set.
     */
    T = (rem * k) / (k - 1);
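    /* Worked example (illustrative only): with num_pes = 7 and k = 2, the loop above gives
     * p_of_k = 4 and log_p_of_k = 2, so rem = 3 and T = 6. PEs 0, 2 and 4 are the
     * non-participating PEs and send to PEs 1, 3 and 5 respectively in Step 1, while PEs
     * 1, 3, 5 and 6 continue as "new" PEs 0..3 through the two Step-2 phases. */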

    INFO(NVSHMEM_COLL, "step 1 nbr calculation started. T is %d", T);
    step1_nrecvs = 0;
    step1_sendto = -1;

    /* Step 1 */
    if (my_pe < T) {
        if (my_pe % k != (k - 1)) {                     /* I am a non-participating PE */
            step1_sendto = my_pe + (k - 1 - my_pe % k); /* participating PE to send the data to */
            /* if the corresponding participating PE is not in T,
             * then send to the Tth PE to preserve non-commutativity */
            if (step1_sendto > T - 1) step1_sendto = T;
            newpe = -1; /* tag this PE as non-participating */
        } else {        /* participating PE */
            for (i = 0; i < k - 1; i++) {
                step1_recvfrom[i] = my_pe - i - 1;
            }
            step1_nrecvs = k - 1;
            newpe = my_pe / k; /* this is the new PE amongst the set of participating PEs */
        }
    } else { /* PE >= T */
        newpe = my_pe - rem;

        if (my_pe == T && (T - 1) % k != k - 1 && T >= 1) {
            int nsenders = (T - 1) % k + 1; /* number of PEs sending their data to me in Step 1 */

            for (j = nsenders - 1; j >= 0; j--) {
                step1_recvfrom[nsenders - 1 - j] = T - nsenders + j;
            }
            step1_nrecvs = nsenders;
        }
    }

    INFO(NVSHMEM_COLL, "step 1 nbr computation completed");

    /* Step 2 */
    if (step1_sendto == -1) { /* calculate step2_nbrs only for participating PEs */
        int *digit = (int *)malloc(sizeof(int) * step2_nphases);
        assert(digit != NULL);
        int temppe = newpe;
        int mask = 0x1;
        int phase = 0, cbit, cnt, nbr, power;

        /* calculate the digits in the base k representation of newpe */
        for (i = 0; i < log_p_of_k; i++) digit[i] = 0;

        int remainder, i_digit = 0;
        while (temppe != 0) {
            remainder = temppe % k;
            temppe = temppe / k;
            digit[i_digit] = remainder;
            i_digit++;
        }

        while (mask < p_of_k) {
            cbit = digit[phase]; /* the phase-th digit changes in this phase; save its original value */
            cnt = 0;
            for (i = 0; i < k; i++) {  /* there are k-1 neighbors */
                if (i != cbit) {       /* do not generate yourself as your neighbor */
                    digit[phase] = i;  /* this gets us the base k representation of the neighbor */

                    /* calculate the base 10 value of the neighbor PE */
                    nbr = 0;
                    power = 1;
                    for (j = 0; j < log_p_of_k; j++) {
                        nbr += digit[j] * power;
                        power *= k;
                    }

                    /* calculate its real PE and store it */
                    step2_nbrs[phase][cnt] =
                        (nbr < rem / (k - 1)) ? (nbr * k) + (k - 1) : nbr + rem;
                    cnt++;
                }
            }
            INFO(NVSHMEM_COLL, "step 2, phase %d nbr calculation completed", phase);
            digit[phase] = cbit; /* reset the digit to its original value */
            phase++;
            mask *= k;
        }
        free(digit);
    }
    // Update with global PE numbers
    if (step1_sendto != -1) step1_sendto = teami->start + step1_sendto * teami->stride;
    for (int i = 0; i < step1_nrecvs; i++)
        step1_recvfrom[i] = teami->start + step1_recvfrom[i] * teami->stride;
    for (int i = 0; i < step2_nphases; i++)
        for (int j = 0; j < k - 1; j++)
            step2_nbrs[i][j] = teami->start + step2_nbrs[i][j] * teami->stride;

    // Copy the data to device memory
    CUDA_RUNTIME_CHECK(cudaMalloc(&step1_recvfrom_device, sizeof(int) * (k - 1)));
    CUDA_RUNTIME_CHECK(cudaMalloc(
        &step2_nbrs_device,
        sizeof(int *) * (step2_nphases + 1))); /* + 1 to make it non-zero, otherwise cuMemAlloc
                                                  returns an error when step2_nphases is 0 */

    for (int i = 0; i < step2_nphases; i++) {
        void *dev_ptr;
        CUDA_RUNTIME_CHECK(cudaMalloc(&dev_ptr, sizeof(int) * (k - 1)));
        CUDA_RUNTIME_CHECK(cudaMemcpy((int **)step2_nbrs_device + i, &dev_ptr, sizeof(int *),
                                      cudaMemcpyHostToDevice));
    }
    CUDA_RUNTIME_CHECK(cudaMemcpy(step1_recvfrom_device, step1_recvfrom, sizeof(int) * step1_nrecvs,
                                  cudaMemcpyHostToDevice));
    void *dev_ptr, *dev_ptr_2;
    dev_ptr = step2_nbrs_device;
    for (int i = 0; i < step2_nphases; i++) {
        CUDA_RUNTIME_CHECK(
            cudaMemcpy(&dev_ptr_2, (int **)dev_ptr + i, sizeof(int *), cudaMemcpyDeviceToHost));
        CUDA_RUNTIME_CHECK(
            cudaMemcpy(dev_ptr_2, step2_nbrs[i], sizeof(int) * (k - 1), cudaMemcpyHostToDevice));
    }
    teami->reduce_recexch.step1_sendto = step1_sendto;
    teami->reduce_recexch.step1_nrecvs = step1_nrecvs;
    teami->reduce_recexch.step2_nphases = step2_nphases;
    teami->reduce_recexch.step1_recvfrom = step1_recvfrom_device;
    teami->reduce_recexch.step2_nbrs = step2_nbrs_device;

    free(step1_recvfrom);
    for (int i = 0; i < step2_nphases; i++) {
        if (step2_nbrs[i]) {
            free(step2_nbrs[i]);
        }
    }
    free(step2_nbrs);
}

static void nvshmemi_recexchalgo_free_mem(nvshmemi_team_t *teami) {
    CUDA_RUNTIME_CHECK(cudaFree(teami->reduce_recexch.step1_recvfrom));
    for (int i = 0; i < teami->reduce_recexch.step2_nphases; i++) {
        void *dev_ptr;
        CUDA_RUNTIME_CHECK(cudaMemcpy(&dev_ptr, teami->reduce_recexch.step2_nbrs + i, sizeof(int *),
                                      cudaMemcpyDeviceToHost));
        CUDA_RUNTIME_CHECK(cudaFree(dev_ptr));
    }
    CUDA_RUNTIME_CHECK(cudaFree(teami->reduce_recexch.step2_nbrs));
}

static inline void nvshmemi_bit_set(unsigned char *ptr, size_t size, size_t index) {
    assert(size > 0 && (index < size * CHAR_BIT));

    size_t which_byte = index / CHAR_BIT;
    ptr[which_byte] |= (1 << (index % CHAR_BIT));

    return;
}

static inline void nvshmemi_bit_clear(unsigned char *ptr, size_t size, size_t index) {
    assert(size > 0 && (index < size * CHAR_BIT));

    size_t which_byte = index / CHAR_BIT;
    ptr[which_byte] &= ~(1 << (index % CHAR_BIT));

    return;
}

static inline unsigned char nvshmemi_bit_fetch(unsigned char *ptr, size_t index) {
    return (ptr[index / CHAR_BIT] >> (index % CHAR_BIT)) & 1;
}
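
/* Illustrative example of the bit helpers above (not functional code): with CHAR_BIT == 8,
 * index 10 lands in byte 1, bit 2, so nvshmemi_bit_set(ptr, 2, 10) ORs 0x04 into ptr[1] and
 * nvshmemi_bit_fetch(ptr, 10) then returns 1. */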

static inline size_t nvshmemi_bit_1st_nonzero(const unsigned char *ptr, const size_t size) {
    /* The following ignores endianness: */
    for (size_t i = 0; i < size; i++) {
        unsigned char bit_val = ptr[i];
        for (size_t j = 0; bit_val && j < CHAR_BIT; j++) {
            if (bit_val & 1) return i * CHAR_BIT + j;
            bit_val >>= 1;
        }
    }

    return (size_t)-1;
}
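
/* For example (illustrative only): for ptr = {0x00, 0x0C} this returns 10, the index of the
 * least significant set bit of byte 1; it returns (size_t)-1 when no bit is set. */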

/* Create a bit string of the format AAAAAAAA.BBBBBBBB into str for the byte
 * array passed via ptr. */
static inline void nvshmemi_bit_to_string(char *str, size_t str_size, unsigned char *ptr,
                                          size_t ptr_size) {
    size_t off = 0;

    for (size_t i = 0; i < ptr_size; i++) {
        for (size_t j = 0; j < CHAR_BIT; j++) {
            off += snprintf(str + off, str_size - off, "%s",
                            (ptr[i] & (1 << (CHAR_BIT - 1 - j))) ? "1" : "0");
            if (off >= str_size) return;
        }
        if (i < ptr_size - 1) {
            off += snprintf(str + off, str_size - off, ".");
            if (off >= str_size) return;
        }
    }
}
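
/* For example (illustrative only): a two-byte array {0xA5, 0x01} is rendered as
 * "10100101.00000001" (most significant bit of each byte first). */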

/* Checks whether a PE has a consistent stride given (start, stride, size).
 * This function is useful within a loop across PE IDs, and sets 'start',
 * 'stride' and 'size' accordingly upon exiting the loop. It also assumes
 * 'start' and 'stride' are initialized to a negative number and 'size' to 0.
 * If an inconsistent stride is found, returns -1. */
static inline int check_for_linear_stride(int pe, int *start, int *stride, int *size) {
    if (*start < 0) {
        *start = pe;
        (*size)++;
    } else if (*stride < 0) {
        *stride = pe - *start;
        (*size)++;
    } else if ((pe - *start) % *stride != 0) {
        NVSHMEMI_WARN_PRINT("Detected non-uniform stride inserting PE %d into <%d, %d, %d>\n", pe,
                            *start, *stride, *size);
        return -1;
    } else {
        (*size)++;
    }
    return 0;
}
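
/* For example (illustrative only): feeding PEs 0, 2, 4 through consecutive calls yields
 * <start=0, stride=2, size=3>; a subsequent call with PE 5 warns and returns -1 because
 * (5 - 0) % 2 != 0. */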

static inline int nvshmemi_pe_in_active_set(int global_pe, int PE_start, int PE_stride,
                                            int PE_size) {
    int n = (global_pe - PE_start) / PE_stride;
    if (global_pe < PE_start || (global_pe - PE_start) % PE_stride || n >= PE_size)
        return -1;
    else {
        return n;
    }
}
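
/* For example (illustrative only): for the active set <start=2, stride=4, size=3>, i.e.
 * world PEs {2, 6, 10}, global PE 6 maps to team PE 1, while global PE 7 returns -1. */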

int nvshmemi_team_translate_pe(nvshmemi_team_t *src_team, int src_pe, nvshmemi_team_t *dest_team) {
    int src_pe_world, dest_pe = -1;

    if (src_pe > src_team->size) return -1;

    src_pe_world = src_team->start + src_pe * src_team->stride;
    assert(src_pe_world >= src_team->start && src_pe_world < nvshmemi_state->npes);

    dest_pe = nvshmemi_pe_in_active_set(src_pe_world, dest_team->start, dest_team->stride,
                                        dest_team->size);

    return dest_pe;
}
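
/* For example (illustrative only): translating PE 3 of a source team <start=0, stride=2,
 * size=4> gives world PE 6; if the destination team is <start=6, stride=1, size=2>, the
 * returned destination PE is 0. A return value of -1 means the PE is not a member of the
 * destination team. */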

static inline size_t get_fcollect_psync_len_per_team() {
    size_t fcollect_ll_threshold =
        nvshmemi_device_state.gpu_coll_env_params_var.fcollect_ll_threshold;
    size_t fcollect_sync_size =
        (2 * 2 * nvshmemi_state->npes * fcollect_ll_threshold) / sizeof(long);
    assert(fcollect_ll_threshold % sizeof(long) == 0);

    return fcollect_sync_size;
}
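
/* For example (illustrative only): with npes = 8 and an LL threshold of 2048 bytes, this is
 * (2 * 2 * 8 * 2048) / sizeof(long) = 8192 longs per team on an LP64 platform. One factor of
 * 2 is for the two alternating buffer sets; the other doubles the space so LL flags can be
 * fused with data (see the comment in get_psync_len_per_team below). */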

static inline size_t get_fcollect_ll128_psync_len_per_team() {
    size_t fcollect_ll128_threshold =
        nvshmemi_device_state.gpu_coll_env_params_var.fcollect_ll128_threshold;
    size_t fcollect_ll128_sync_size =
        NVSHMEMI_FCOLLECT_LL128_CALC_PSYNC_SIZE(fcollect_ll128_threshold, char);

    /* scale for npes and two separate psyncs */
    fcollect_ll128_sync_size = fcollect_ll128_sync_size * 2 * nvshmemi_state->npes / sizeof(long);

    return fcollect_ll128_sync_size;
}

static inline size_t get_psync_len_per_team() {
    size_t fcollect_sync_size = get_fcollect_psync_len_per_team();
    size_t fcollect_ll128_sync_size = get_fcollect_ll128_psync_len_per_team();
    /* sync: Two buffers are used - one for sync/barrier collective ops, the second one during
     *       the team split operation.
     * reduce: Two pWrk's are used alternately across consecutive reduce calls, to avoid having
     *         to put a barrier in between.
     * bcast: The buffer is split to do multiple consecutive broadcasts; when all buffers are
     *        used, a barrier is called and then we begin again from the start of the buffer.
     * fcollect: Two sets of buffers are used to alternate between, the same way as in reduce.
     *           The other factor of 2 is because when using LL, double the space is needed to
     *           fuse flags with data. */

    size_t ans = (2 * NVSHMEMI_SYNC_SIZE +
                  nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / sizeof(long) +
                  NVSHMEMI_BCAST_SYNC_SIZE + fcollect_sync_size + 2 * NVSHMEMI_ALLTOALL_SYNC_SIZE +
                  fcollect_ll128_sync_size);
    return ans;
}

size_t nvshmemi_get_teams_mem_requirement() {
    size_t psync_size = get_psync_len_per_team();
    size_t teams_mem_req = sizeof(long) * nvshmemi_max_teams * psync_size + /* psync's */
                           2 * N_PSYNC_BYTES +                              /* psync_pool_avail */
                           2 * sizeof(int) +                                /* team_ret_val */
                           2 * sizeof(long) * nvshmemi_max_teams            /* storing counters */
#ifdef NVSHMEM_USE_NCCL
                           + sizeof(ncclUniqueId)
#endif
        ;
    INFO(NVSHMEM_INIT,
         "team psync mem req %zu bytes, team mem total req %zu bytes, max teams %ld\n", psync_size,
         teams_mem_req, nvshmemi_max_teams);
    return teams_mem_req;
}

#ifdef NVSHMEM_USE_NCCL
void nvshmemi_team_init_nccl_comm(nvshmemi_team_t *teami) {
    ncclUniqueId Id;
    int start = teami->start;
    int stride = teami->stride;
    int size = teami->size;
    /* This is technical debt: we are using the REDUCE op psync as a scratchpad for the src/dst
     * of the broadcast. Broadcast's psync is used for LL8 and other algorithms, making it
     * non-trivial to share when issued from the host as a src or dest buffer.
     *
     * When the reduce coll supports the LL8 algorithm, we need to clean this up as an
     * independent scratch space.
     */
    long *pWrk = nvshmemi_team_get_psync(teami, REDUCE);
    if (teami->my_pe == 0) {
        NCCL_CHECK(nccl_ftable.GetUniqueId(&Id));
        CUDA_RUNTIME_CHECK(cudaMemcpy(pWrk, &Id, sizeof(ncclUniqueId), cudaMemcpyHostToDevice));
        CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
        for (int i = 0; i < size; i++) {
            nvshmemx_char_put_nbi_on_stream((char *)pWrk, (const char *)pWrk, sizeof(ncclUniqueId),
                                            start + i * stride, (cudaStream_t)0);
        }
        CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
        nvshmemi_barrier(teami->team_idx);
    } else {
        nvshmemi_barrier(teami->team_idx);
        CUDA_RUNTIME_CHECK(cudaMemcpy(&Id, pWrk, sizeof(ncclUniqueId), cudaMemcpyDeviceToHost));
    }
    INFO(NVSHMEM_TEAM, "Calling ncclCommInitRank, teami->size = %d, teami->my_pe = %d", teami->size,
         teami->my_pe);
    NCCL_CHECK(
        nccl_ftable.CommInitRank((ncclComm_t *)&teami->nccl_comm, teami->size, Id, teami->my_pe));
}
#endif /* NVSHMEM_USE_NCCL */
void nvshmemi_team_set_p2p_connectivity(nvshmemi_team_t *teami) {
    teami->are_gpus_p2p_connected = 1;
    for (int pe = teami->start; pe < teami->start + teami->stride * teami->size;
         pe += teami->stride) {
        if (nvshmemi_state->heap_obj->get_local_pe_base()[pe] == NULL) {
            teami->are_gpus_p2p_connected = 0;
            break;
        }
    }
}

/* NVLS Resource management for teams */
static void nvshmemi_team_destroy_nvls(nvshmemi_team_t *team) {
    if (team->nvls_rsc == nullptr) return; /* NOOP */

    nvshmemi_nvls_rsc *nvls_obj = nullptr;
    nvls_obj = reinterpret_cast<nvshmemi_nvls_rsc *>(team->nvls_rsc);
    if (nvls_obj->get_refcount() == 0) { /* Last reference */
        nvshmemi_state->heap_obj->nvls_unmap_heap_memory_by_team(team);
        nvshmemi_state->heap_obj->nvls_unbind_heap_memory_by_team(team);
        nvls_obj->free_group_mem();
        nvls_obj->release_owner();
        delete nvls_obj;
        cudaFree(team->nvls_rsc_base_ptr);
        team->nvls_rsc = nullptr;
        INFO(NVSHMEM_TEAM, "NVLS Resource Destroyed for Team ID %d\n", team->team_idx);
    } else {
        nvls_obj->del_refcount(); /* Shared nvls resource */
        /* Ownership of the NVLS resource is necessary to allow newly allocated UC memory to be
         * bound and mapped to the MC heap */
        if (nvls_obj->is_owner(team)) {
            // Transfer ownership to one of the dup teams
            NVSHMEMU_FOR_EACH_IF(
                i, nvshmemi_max_teams,
                nvshmemi_team_pool[i] != NULL && nvshmemi_team_support_nvls(nvshmemi_team_pool[i]),
                {
                    // Find the first duplicate team that shares the nvls rsc and make it the owner
                    if (nvshmemi_team_pool[i]->nvls_rsc == team->nvls_rsc) {
                        nvls_obj->release_owner();
                        nvls_obj->assign_owner(nvshmemi_team_pool[i]);
                        break;
                    }
                })
        }
    }
}

static int nvshmemi_team_create_nvls(nvshmemi_team_t *team) {
    int status = -1;
    uint64_t mc_heap_base;
    nvshmemi_nvls_rsc *nvls_obj = nullptr;

    if (!nvshmemi_team_is_nvls_capable(team)) {
        team->nvls_rsc = nullptr;
        WARN("Skipping NVLINK SHARP resource initialization for team ID: %d\n", team->team_idx);
        return 0;
    }

    try {
        team->nvls_rsc = reinterpret_cast<void *>(new nvshmemi_nvls_rsc(team, nvshmemi_state));
    } catch (nvshmemi_nvls_exception &exp) {
        WARN("NVLINK SHARP resource initialization failed for team ID: %d\n", team->team_idx);
        team->nvls_rsc = nullptr;
        return 0;
    }

    nvls_obj = reinterpret_cast<nvshmemi_nvls_rsc *>(team->nvls_rsc);
    status = nvls_obj->reserve_group_mem();
    NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                          "Reserve multicast group mapping failed for pe %d\n", team->my_pe);

    nvls_obj->assign_owner(team);
    CUDA_RUNTIME_CHECK(cudaMalloc((void **)&team->nvls_rsc_base_ptr, sizeof(void *)));
    mc_heap_base = (uint64_t)(nvls_obj->get_mc_base());
    CUDA_RUNTIME_CHECK(
        cudaMemcpy(team->nvls_rsc_base_ptr, &mc_heap_base, sizeof(void *), cudaMemcpyHostToDevice));

    INFO(NVSHMEM_TEAM, "NVLS Resource Created for Team ID %d MC VA Base: %llx\n", team->team_idx,
         mc_heap_base);
    return (status);

cleanup:
    (void)nvls_obj->free_group_mem();
    delete nvls_obj;
    team->nvls_rsc = nullptr;
    return (status);
}

static int nvshmemi_team_bind_nvls(nvshmemi_team_t *team) {
    int status = -1;
    /* Make a MC handle as large as the physical heap size */
    nvshmemi_nvls_rsc *nvls_obj = nullptr;
    nvls_obj = reinterpret_cast<nvshmemi_nvls_rsc *>(team->nvls_rsc);
    if (!nvls_obj->is_owner(team)) return 0;

    status = nvshmemi_state->heap_obj->nvls_create_heap_memory_by_team(team);
    NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                          "Create multicast groups for UC heap failed for pe %d team ID %d\n",
                          team->my_pe, team->team_idx);

    status = nvshmemi_state->heap_obj->nvls_bind_heap_memory_by_team(team);
    NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                          "Binding multicast groups to UC heap failed for pe %d team ID %d\n",
                          team->my_pe, team->team_idx);
    status = nvshmemi_state->heap_obj->nvls_map_heap_memory_by_team(team);
    NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                          "Mapping multicast groups for UC heap failed for pe %d team ID %d\n",
                          team->my_pe, team->team_idx);
cleanup:
    return (status);
}

static int nvshmemi_team_nvls_setup(nvshmemi_team_t *team) {
    int status = 0;

    /* Initialize NVLS resources for teams of P2P-connected GPUs */
    status = nvshmemi_team_create_nvls(team);
    NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                          "NVLS resource initialization failed for team ID: %d\n", team->team_idx);

    /* Any prior UC allocations need to be bound to this team's MC groups */
    if (team->nvls_rsc != nullptr) {
        status = nvshmemi_team_bind_nvls(team);
        NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                              "NVLS resource bind and mapping existing UC mappings to MC heap "
                              "failed for team ID: %d\n",
                              team->team_idx);
    }

cleanup:
    return (status);
}

/* Team Management Routines */
int nvshmemi_set_max_teams(void) {
    nvshmemi_max_teams = nvshmemi_options.MAX_TEAMS;
    if (nvshmemi_max_teams < NVSHMEM_TEAMS_MIN) nvshmemi_max_teams = NVSHMEM_TEAMS_MIN;
    /* On NVLS SHARP-enabled platforms, increase the default max teams to 64 */
    if (nvshmemi_state->is_platform_nvls) {
        nvshmemi_max_teams = (nvshmemi_max_teams > NVSHMEMI_REDUCE_MAX_CTA_COUNT)
                                 ? nvshmemi_max_teams
                                 : NVSHMEMI_REDUCE_MAX_CTA_COUNT;
    }

    if (nvshmemi_max_teams > N_PSYNC_BYTES * CHAR_BIT) {
        NVSHMEMI_ERROR_EXIT("Requested %ld teams, but only %d are supported\n", nvshmemi_max_teams,
                            N_PSYNC_BYTES * CHAR_BIT);
        return 1;
    }
    return 0;
}

int nvshmemi_team_init(void) {
    long psync_len;
    int start, stride, size;
    int *scratch = NULL;
    int status = 0;
    uint64_t *hostHash = NULL;
    uint64_t myHostHash = 0;
    nvshmem_transport_pe_info_t *pe_info;
    int i;

    nvshmemi_team_world = NVSHMEMI_TEAM_INITIALIZER;
    nvshmemi_team_shared = NVSHMEMI_TEAM_INITIALIZER;
    nvshmemi_team_node = NVSHMEMI_TEAM_INITIALIZER;
    nvshmemi_team_same_mype_node = NVSHMEMI_TEAM_INITIALIZER;
    nvshmemi_team_same_gpu = NVSHMEMI_TEAM_INITIALIZER;
    nvshmemi_team_gpu_leaders = NVSHMEMI_TEAM_INITIALIZER;

    /* Initialize NVSHMEM_TEAM_WORLD */
    nvshmemi_team_world.team_idx = NVSHMEM_TEAM_WORLD_INDEX;
    NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_world, NVSHMEM_TEAM_WORLD_INDEX);
    nvshmemi_team_world.start = 0;
    nvshmemi_team_world.stride = 1;
    nvshmemi_team_world.size = nvshmemi_state->npes;
    nvshmemi_team_world.my_pe = nvshmemi_state->mype;
    nvshmemi_team_world.rdxn_count = 0;
    nvshmemi_team_world.config_mask = 0;
    nvshmemi_team_world.ll_flag = 1;
    nvshmemi_team_world.alltoall_count = 0;
    nvshmemi_team_world.bcast_count = 0;
    nvshmemi_team_world.bcast_sync_offset = 0;
    nvshmemi_team_world.fcollect_count = 0;
    nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_world);
    nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_world);
    nvshmemi_team_world.is_team_node = false;
    nvshmemi_team_world.is_team_same_mype_node = false;

    /* Initialize NVSHMEM_TEAM_SHARED */
    nvshmemi_team_shared.team_idx = NVSHMEM_TEAM_SHARED_INDEX;
    NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_shared, NVSHMEM_TEAM_SHARED_INDEX);
    /* Collect the list of p2p-connected PEs */
    int *p2p_pe_list = (int *)malloc(nvshmemi_team_world.size * sizeof(int));
    int n_p2p_pes = 0;
    int my_idx_in_p2p_list = 0;
    for (int i = 0; i < nvshmemi_team_world.size; i++) {
        if (nvshmemi_state->heap_obj->get_local_pe_base()[i]) {
            if (i == nvshmemi_team_world.my_pe) my_idx_in_p2p_list = n_p2p_pes;
            p2p_pe_list[n_p2p_pes++] = i;
        }
    }

    std::ostringstream ss;
    for (int i = 0; i < n_p2p_pes; i++) {
        ss << p2p_pe_list[i] << " ";
    }
    INFO(NVSHMEM_INIT, "P2P list: %s", ss.str().c_str());

    /* Make sure that n_p2p_pes is the same on all PEs before forming TEAM_SHARED */
    int *n_p2p_pes_all = (int *)malloc(nvshmemi_team_world.size * sizeof(int));
    int *p2p_pe_list_all = (int *)malloc(sizeof(int) * n_p2p_pes * nvshmemi_team_world.size);

    nvshmemi_boot_handle.allgather((void *)&n_p2p_pes, (void *)n_p2p_pes_all, sizeof(int),
                                   &nvshmemi_boot_handle);

    for (i = 0; i < nvshmemi_team_world.size; i++) {
        if (n_p2p_pes_all[i] != n_p2p_pes) {
            INFO(NVSHMEM_INIT,
                 "n_p2p_pes is not equal across PEs, setting NVSHMEM_TEAM_SHARED to self");
            goto team_shared_single_pe;
        }
    }

    /* Gather the p2p lists of all PEs and ensure they are the same */
    nvshmemi_boot_handle.allgather((void *)p2p_pe_list, (void *)p2p_pe_list_all,
                                   sizeof(int) * n_p2p_pes, &nvshmemi_boot_handle);
    for (i = 0; i < n_p2p_pes; i++) {
        if (memcmp((void *)p2p_pe_list, (void *)&p2p_pe_list_all[p2p_pe_list[i] * n_p2p_pes],
                   sizeof(int) * n_p2p_pes) != 0) {
            INFO(NVSHMEM_INIT, "P2P lists are not symmetric, setting NVSHMEM_TEAM_SHARED to self");
            goto team_shared_single_pe;
        }
    }

    for (int i = 2; i < n_p2p_pes; i++) {
        if (p2p_pe_list[i] - p2p_pe_list[i - 1] != p2p_pe_list[i - 1] - p2p_pe_list[i - 2]) {
            INFO(NVSHMEM_INIT,
                 "P2P list is not of the form (start, stride, size). Cannot form "
                 "NVSHMEM_TEAM_SHARED.");
            goto team_shared_single_pe;
        }
    }

    nvshmemi_team_shared.my_pe = my_idx_in_p2p_list;
    nvshmemi_team_shared.start = p2p_pe_list[0];
    nvshmemi_team_shared.stride = n_p2p_pes > 1 ? (p2p_pe_list[1] - p2p_pe_list[0]) : 1;
    nvshmemi_team_shared.size = n_p2p_pes;

    goto team_shared_setup;

team_shared_single_pe:
    nvshmemi_team_shared.my_pe = 0;
    nvshmemi_team_shared.start = nvshmemi_state->mype;
    nvshmemi_team_shared.stride = 1;
    nvshmemi_team_shared.size = 1;
    nvshmemi_team_shared.is_team_node = true;
    nvshmemi_team_shared.is_team_same_mype_node = true;

team_shared_setup:
    free(n_p2p_pes_all);
    free(p2p_pe_list_all);
    free(p2p_pe_list);

    nvshmemi_team_shared.rdxn_count = 0;
    nvshmemi_team_shared.config_mask = 0;

    nvshmemi_team_shared.ll_flag = 1;
    nvshmemi_team_shared.alltoall_count = 0;
    nvshmemi_team_shared.bcast_count = 0;
    nvshmemi_team_shared.bcast_sync_offset = 0;
    nvshmemi_team_shared.fcollect_count = 0;
    nvshmemi_team_shared.are_gpus_p2p_connected = 0;
    nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_shared);
    nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_shared);
    INFO(NVSHMEM_INIT, "NVSHMEM_TEAM_SHARED: start=%d, stride=%d, size=%d",
         nvshmemi_team_shared.start, nvshmemi_team_shared.stride, nvshmemi_team_shared.size);
    nvshmemi_team_shared.is_team_same_mype_node = false;

    /* Initialize NVSHMEMX_TEAM_NODE */
    nvshmemi_team_node.team_idx = NVSHMEM_TEAM_NODE_INDEX;
    NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_node, NVSHMEM_TEAM_NODE_INDEX);
    nvshmemi_team_world.team_node = nvshmemi_team_node.team_idx;
    nvshmemi_team_node.my_pe = nvshmemi_state->mype_node;
    nvshmemi_team_node.rdxn_count = 0;
    nvshmemi_team_node.config_mask = 0;
    nvshmemi_team_node.ll_flag = 1;
    nvshmemi_team_node.alltoall_count = 0;
    nvshmemi_team_node.bcast_count = 0;
    nvshmemi_team_node.bcast_sync_offset = 0;
    nvshmemi_team_node.fcollect_count = 0;
    myHostHash = getHostHash();
    hostHash = (uint64_t *)malloc(sizeof(uint64_t) * nvshmemi_state->npes);
    NVSHMEMI_NULL_ERROR_JMP(hostHash, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
                            "hostHash allocation failed \n");
    status = nvshmemi_boot_handle.allgather((void *)&myHostHash, (void *)hostHash, sizeof(uint64_t),
                                            &nvshmemi_boot_handle);
    NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                          "allgather of host hashes failed\n");

    /* Search for on-node peer PEs while checking for a consistent stride */
    start = -1;
    stride = -1;
    size = 0;

    for (int pe = 0; pe < nvshmemi_state->npes; pe++) {
        if (hostHash[pe] != myHostHash) continue;

        int ret = check_for_linear_stride(pe, &start, &stride, &size);
        if (ret < 0) {
            start = nvshmemi_state->mype;
            stride = 1;
            size = 1;
            break;
        }
    }
    assert(start >= 0 && size > 0);
    nvshmemi_team_node.start = start;
    nvshmemi_team_node.stride = (stride == -1) ? 1 : stride;
    nvshmemi_team_node.size = size;
    if (nvshmemi_team_is_identical(&nvshmemi_team_world, &nvshmemi_team_node)) {
        nvshmemi_team_world.is_team_node = true;
    }
    if (nvshmemi_team_is_identical(&nvshmemi_team_shared, &nvshmemi_team_node)) {
        nvshmemi_team_shared.is_team_node = true;
    }

    nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_node);
    nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_node);
    nvshmemi_team_node.is_team_node = true;
    nvshmemi_team_node.is_team_same_mype_node = false;
    INFO(NVSHMEM_INIT, "NVSHMEMX_TEAM_NODE: start=%d, stride=%d, size=%d", nvshmemi_team_node.start,
         nvshmemi_team_node.stride, nvshmemi_team_node.size);

    /* Initialize NVSHMEMX_TEAM_SAME_MYPE_NODE */
    nvshmemi_team_same_mype_node.team_idx = NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX;
    NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_same_mype_node, NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX);
    nvshmemi_team_world.team_same_mype_node = nvshmemi_team_same_mype_node.team_idx;
    nvshmemi_team_same_mype_node.my_pe = nvshmemi_state->mype / nvshmemi_state->npes_node;
    nvshmemi_team_same_mype_node.rdxn_count = 0;
    nvshmemi_team_same_mype_node.config_mask = 0;

    nvshmemi_team_same_mype_node.start = nvshmemi_state->mype_node;
    nvshmemi_team_same_mype_node.stride = nvshmemi_state->npes_node;
    nvshmemi_team_same_mype_node.size = nvshmemi_state->npes / nvshmemi_state->npes_node;
    assert(nvshmemi_state->npes % nvshmemi_state->npes_node == 0);
    nvshmemi_team_same_mype_node.ll_flag = 1;
    nvshmemi_team_same_mype_node.alltoall_count = 0;
    nvshmemi_team_same_mype_node.bcast_count = 0;
    nvshmemi_team_same_mype_node.bcast_sync_offset = 0;
    nvshmemi_team_same_mype_node.fcollect_count = 0;
    nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_same_mype_node);
    nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_same_mype_node);
    nvshmemi_team_same_mype_node.is_team_node = false;
    nvshmemi_team_same_mype_node.is_team_same_mype_node = true;
    INFO(NVSHMEM_INIT, "NVSHMEMX_TEAM_SAME_MYPE_NODE: start=%d, stride=%d, size=%d",
         nvshmemi_team_same_mype_node.start, nvshmemi_team_same_mype_node.stride,
         nvshmemi_team_same_mype_node.size);

    /* Initialize team NVSHMEMI_TEAM_SAME_GPU */
    nvshmemi_team_same_gpu.team_idx = NVSHMEM_TEAM_SAME_GPU_INDEX;
    NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_same_gpu, NVSHMEM_TEAM_SAME_GPU_INDEX);
    nvshmemi_team_same_gpu.rdxn_count = 0;
    nvshmemi_team_same_gpu.ll_flag = 1;
    nvshmemi_team_same_gpu.alltoall_count = 0;
    nvshmemi_team_same_gpu.bcast_count = 0;
    nvshmemi_team_same_gpu.bcast_sync_offset = 0;
    nvshmemi_team_same_gpu.fcollect_count = 0;
    nvshmemi_team_same_gpu.config_mask = 0;
    pe_info = nvshmemi_state->pe_info;
    start = -1;
    stride = -1;
    size = 0;
    for (int pe = 0; pe < nvshmemi_state->npes; pe++) {
        if (pe_info[pe].hostHash != pe_info[nvshmemi_state->mype].hostHash ||
            memcmp(&pe_info[pe].gpu_uuid, &pe_info[nvshmemi_state->mype].gpu_uuid,
                   sizeof(cudaUUID_t)) != 0)
            continue;

        int ret = check_for_linear_stride(pe, &start, &stride, &size);
        if (ret < 0) {
            NVSHMEMI_ERROR_EXIT("Could not form NVSHMEMI_TEAM_SAME_GPU\n");
            break;
        }
    }
    assert(start >= 0 && size > 0);
    nvshmemi_team_same_gpu.my_pe = (nvshmemi_state->mype - start) / stride;
    nvshmemi_team_same_gpu.start = start;
    nvshmemi_team_same_gpu.stride = (stride == -1) ? 1 : stride;
    nvshmemi_team_same_gpu.size = size;
    nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_same_gpu);
    nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_same_gpu);
    nvshmemi_team_same_gpu.is_team_node = true;
    nvshmemi_team_same_gpu.is_team_same_mype_node = false;
    INFO(NVSHMEM_INIT, "NVSHMEMI_TEAM_SAME_GPU: start=%d, stride=%d, size=%d",
         nvshmemi_team_same_gpu.start, nvshmemi_team_same_gpu.stride, nvshmemi_team_same_gpu.size);

    /* All GPUs must have the same number of processes (required for us to form teams) */

    /* Initialize team NVSHMEMI_TEAM_GPU_LEADERS */
    scratch = (int *)malloc(sizeof(int) * nvshmemi_state->npes);
    NVSHMEMI_NULL_ERROR_JMP(scratch, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
                            "Unable to allocate host memory for team creation.\n");
    if (nvshmemi_team_same_gpu.start ==
        nvshmemi_state->mype) { /* Only GPU leaders are part of this team */
        nvshmemi_team_gpu_leaders.team_idx = NVSHMEM_TEAM_GPU_LEADERS_INDEX;
        NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_gpu_leaders, NVSHMEM_TEAM_GPU_LEADERS_INDEX);
        nvshmemi_team_gpu_leaders.config_mask = 0;

        nvshmemi_team_gpu_leaders.start = 0;
        nvshmemi_team_gpu_leaders.stride =
            (nvshmemi_team_same_gpu.stride == 1) ? nvshmemi_team_same_gpu.size : 1;
        nvshmemi_team_gpu_leaders.size = nvshmemi_state->npes / nvshmemi_team_same_gpu.size;
        nvshmemi_team_gpu_leaders.my_pe = (nvshmemi_state->mype - nvshmemi_team_gpu_leaders.start) /
                                          nvshmemi_team_gpu_leaders.stride;
        nvshmemi_team_gpu_leaders.rdxn_count = 0;
        nvshmemi_team_gpu_leaders.ll_flag = 1;
        nvshmemi_team_gpu_leaders.alltoall_count = 0;
        nvshmemi_team_gpu_leaders.bcast_count = 0;
        nvshmemi_team_gpu_leaders.bcast_sync_offset = 0;
        nvshmemi_team_gpu_leaders.fcollect_count = 0;
        nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_gpu_leaders);
        nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_gpu_leaders);
        status =
            nvshmemi_boot_handle.allgather((void *)&nvshmemi_team_gpu_leaders.my_pe,
                                           (void *)scratch, sizeof(int), &nvshmemi_boot_handle);
        NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                              "allgather of gpu leaders failed\n");
        /* Check whether a valid TEAM_GPU_LEADERS was formed */
        int last_mype = -1;
        for (int i = 0; i < nvshmemi_state->npes; i++) {
            if (scratch[i] != -1) {
                if (scratch[i] != last_mype + 1) {
                    WARN(
                        "NVSHMEMI_TEAM_GPU_LEADERS could not be formed, Limited MPG support will "
                        "not be available\n");
                    break;
                } else {
                    last_mype++;
                }
            }
        }
        /* XXX: Note that we are not setting team_node and team_same_mype_node for
         * nvshmemi_team_gpu_leaders */
        nvshmemi_team_gpu_leaders.is_team_node = false;
        nvshmemi_team_gpu_leaders.is_team_same_mype_node = false;
        INFO(NVSHMEM_INIT, "NVSHMEMI_TEAM_GPU_LEADERS: start=%d, stride=%d, size=%d",
             nvshmemi_team_gpu_leaders.start, nvshmemi_team_gpu_leaders.stride,
             nvshmemi_team_gpu_leaders.size);
    } else {
        int my_pe = -1;
        status = nvshmemi_boot_handle.allgather((void *)&my_pe, (void *)scratch, sizeof(int),
                                                &nvshmemi_boot_handle);
        NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                              "allgather of gpu leaders failed\n");
    }
    if (nvshmemi_max_teams < NVSHMEM_TEAMS_MIN) nvshmemi_max_teams = NVSHMEM_TEAMS_MIN;

    if (nvshmemi_max_teams > N_PSYNC_BYTES * CHAR_BIT) {
        NVSHMEMI_ERROR_EXIT("Requested %ld teams, but only %d are supported\n", nvshmemi_max_teams,
                            N_PSYNC_BYTES * CHAR_BIT);
        goto cleanup;
    }

    nvshmemi_team_pool = (nvshmemi_team_t **)calloc(nvshmemi_max_teams, sizeof(nvshmemi_team_t *));
    NVSHMEMI_NULL_ERROR_JMP(nvshmemi_team_pool, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
                            "nvshmemi_team_pool allocation failed \n");
    CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_pool,
                                  nvshmemi_max_teams * sizeof(nvshmemi_team_t *)));
    nvshmemi_device_state.team_pool = nvshmemi_device_team_pool;

    for (long i = 0; i < nvshmemi_max_teams; i++) {
        nvshmemi_team_pool[i] = NULL;
    }

    nvshmemi_call_init_array_kernel<nvshmemi_team_t *>(nvshmemi_device_team_pool,
                                                       nvshmemi_max_teams, NULL);

    nvshmemi_team_pool[NVSHMEM_TEAM_WORLD_INDEX] = &nvshmemi_team_world;
    nvshmemi_team_pool[NVSHMEM_TEAM_SHARED_INDEX] = &nvshmemi_team_shared;
    nvshmemi_team_pool[NVSHMEM_TEAM_NODE_INDEX] = &nvshmemi_team_node;
    nvshmemi_team_pool[NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX] = &nvshmemi_team_same_mype_node;
    nvshmemi_team_pool[NVSHMEM_TEAM_SAME_GPU_INDEX] = &nvshmemi_team_same_gpu;
    if (nvshmemi_team_same_gpu.start == nvshmemi_state->mype)
        nvshmemi_team_pool[NVSHMEM_TEAM_GPU_LEADERS_INDEX] = &nvshmemi_team_gpu_leaders;

    /* Allocate the pSync pool, each entry with the maximum possible size requirement */
    /* Create two pSyncs per team for back-to-back collectives and one for barriers.
     * Array organization:
     *
     * [ (world) (shared) (team 1) (team 2) ... (world) (shared) (team 1) (team 2) ... ]
     *  <----------- groups 1 & 2------------->|<------------- group 3 ---------------->
     *  <--- (bcast, collect, reduce, etc.) -->|<------- (barriers and syncs) ---------->
     * */
    psync_len = nvshmemi_max_teams * get_psync_len_per_team();
    nvshmemi_psync_pool = (long *)nvshmemi_malloc(sizeof(long) * psync_len);
    NVSHMEMI_NULL_ERROR_JMP(nvshmemi_psync_pool, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
                            "nvshmemi_psync_pool allocation failed \n");

    nvshmemi_device_state.psync_pool = nvshmemi_psync_pool;

    nvshmemi_call_init_array_kernel<long>(nvshmemi_psync_pool, psync_len, NVSHMEMI_SYNC_VALUE);

    nvshmemi_sync_counter = (long *)nvshmemi_malloc(2 * nvshmemi_max_teams * sizeof(long));
    NVSHMEMI_NULL_ERROR_JMP(nvshmemi_sync_counter, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
                            "nvshmemi_sync_counter allocation failed \n");

    nvshmemi_device_state.sync_counter = nvshmemi_sync_counter;
    nvshmemi_update_device_state();

    nvshmemi_call_init_array_kernel<long>(nvshmemi_sync_counter, 2 * nvshmemi_max_teams, 1);

    /* Track pSync availability with one bit per team (host copy plus a symmetric device copy): */
    psync_pool_avail = (unsigned char *)malloc(2 * N_PSYNC_BYTES);
    NVSHMEMI_NULL_ERROR_JMP(psync_pool_avail, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
                            "psync_pool_avail allocation failed \n");
    psync_pool_avail_reduced = &psync_pool_avail[N_PSYNC_BYTES];

    device_psync_pool_avail = (unsigned char *)nvshmemi_malloc(2 * N_PSYNC_BYTES);
    NVSHMEMI_NULL_ERROR_JMP(device_psync_pool_avail, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
                            "device_psync_pool_avail allocation failed \n");
    device_psync_pool_avail_reduced = &device_psync_pool_avail[N_PSYNC_BYTES];
    /* Initialize the psync bits to 1, making all slots available: */
    memset(psync_pool_avail, 0, 2 * N_PSYNC_BYTES);
    for (size_t i = 0; i < (size_t)nvshmemi_max_teams; i++) {
        nvshmemi_bit_set(psync_pool_avail, N_PSYNC_BYTES, i);
    }

    /* Reserve the pre-defined teams (WORLD, SHARED, NODE, SAME_MYPE_NODE, SAME_GPU,
     * GPU_LEADERS) by setting their bits to 0: */
    nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_WORLD_INDEX);
    nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_SHARED_INDEX);
    nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_NODE_INDEX);
    nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX);
    nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_SAME_GPU_INDEX);
    nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_GPU_LEADERS_INDEX);

    /* Initialize an integer used to agree on an equal return value across PEs in team creation: */
    team_ret_val = (int *)malloc(sizeof(int) * 2);
    NVSHMEMI_NULL_ERROR_JMP(team_ret_val, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
                            "team_ret_val allocation failed \n");
    team_ret_val_reduced = &team_ret_val[1];

    device_team_ret_val = (int *)nvshmemi_malloc(sizeof(int) * 2);
    NVSHMEMI_NULL_ERROR_JMP(device_team_ret_val, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup,
                            "device_team_ret_val allocation failed \n");
    device_team_ret_val_reduced = &device_team_ret_val[1];

    nvshmemi_boot_handle.barrier(
        &nvshmemi_boot_handle); /* To ensure the necessary setup has been done on all PEs */

    nvshmemi_team_alloc_device();
    nvshmemi_team_update_device();
    nvshmemi_boot_handle.barrier(
        &nvshmemi_boot_handle); /* To ensure the necessary setup has been done on all PEs */

#ifdef NVSHMEM_USE_NCCL
    if (nvshmemi_use_nccl) {
        /* Setup NCCL usage */
        nvshmemi_team_init_nccl_comm(&nvshmemi_team_world);
        nvshmemi_team_init_nccl_comm(&nvshmemi_team_shared);
        nvshmemi_team_init_nccl_comm(&nvshmemi_team_node);
        nvshmemi_team_init_nccl_comm(&nvshmemi_team_same_mype_node);
        nvshmemi_team_init_nccl_comm(&nvshmemi_team_same_gpu);
        if (nvshmemi_pe_in_active_set(nvshmemi_state->mype, nvshmemi_team_gpu_leaders.start,
                                      nvshmemi_team_gpu_leaders.stride,
                                      nvshmemi_team_gpu_leaders.size) >= 0) {
            nvshmemi_team_init_nccl_comm(&nvshmemi_team_gpu_leaders);
        }
    }
#endif /* NVSHMEM_USE_NCCL */

    /* Setup NVLS resources for all internal p2p-connected teams */
    NVSHMEMU_FOR_EACH_IF(
        i, nvshmemi_max_teams,
        nvshmemi_team_pool[i] != NULL && nvshmemi_team_pool[i]->are_gpus_p2p_connected, {
            status = nvshmemi_team_nvls_setup(nvshmemi_team_pool[i]);
            NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup,
                                  "NVLS resource setup failed for team ID: %d\n",
                                  nvshmemi_team_pool[i]->team_idx);
            if (nvshmemi_team_pool[i]->nvls_rsc) {
                INFO(NVSHMEM_TEAM, "Successful NVLS resource setup for team ID: %d\n",
                     nvshmemi_team_pool[i]->team_idx);
            }
        })

    nvshmemi_boot_handle.barrier(
        &nvshmemi_boot_handle); /* To ensure the necessary setup has been done on all PEs */

    nvshmemi_team_update_device();

    nvshmemi_boot_handle.barrier(
        &nvshmemi_boot_handle); /* To ensure the necessary setup has been done on all PEs */

#if defined(NVSHMEM_PPC64LE)
    if (nvshmemi_use_nccl) {
        /* Set the GPU thread stack size to be the max stack size of any kernel invoked by NCCL.
           The value 1256 has been obtained by profiling all NCCL kernels in NCCL 2.8.3-1.
           This value is set up front to prevent any memory reconfiguration during the
           application run, as that can lead to a potential deadlock */
        if (nvshmemi_options.CUDA_LIMIT_STACK_SIZE_provided) {
            CUDA_RUNTIME_CHECK(
                cudaDeviceSetLimit(cudaLimitStackSize, nvshmemi_options.CUDA_LIMIT_STACK_SIZE));
            if (nvshmemi_options.CUDA_LIMIT_STACK_SIZE < 1256)
                NVSHMEMI_WARN_PRINT(
                    "CUDA stack size limit has been set to less than 1256.\n"
                    "This can lead to hangs because a NCCL kernel can need up\n"
                    "to 1256 bytes");
        } else
            CUDA_RUNTIME_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, 1256));
    } else if (nvshmemi_options.CUDA_LIMIT_STACK_SIZE_provided) {
        CUDA_RUNTIME_CHECK(
            cudaDeviceSetLimit(cudaLimitStackSize, nvshmemi_options.CUDA_LIMIT_STACK_SIZE));
    }
#endif

cleanup:
    if (scratch) {
        free(scratch);
    }
    if (hostHash) {
        free(hostHash);
    }

    if (status != NVSHMEMX_SUCCESS) {
        if (nvshmemi_team_pool) {
            free(nvshmemi_team_pool);
            nvshmemi_team_pool = NULL;
            cudaFree(nvshmemi_device_team_pool);
            nvshmemi_device_team_pool = NULL;
        }
        if (nvshmemi_psync_pool) {
            nvshmemi_free(nvshmemi_psync_pool);
            nvshmemi_psync_pool = NULL;
        }
        if (psync_pool_avail) {
            free(psync_pool_avail);
            psync_pool_avail = NULL;
        }
        if (device_psync_pool_avail) {
            nvshmemi_free(device_psync_pool_avail);
            device_psync_pool_avail = NULL;
        }
        if (team_ret_val) {
            free(team_ret_val);
            team_ret_val = NULL;
        }
        if (device_team_ret_val) {
            nvshmemi_free(device_team_ret_val);
            device_team_ret_val = NULL;
        }
    }

    return status;
}

int nvshmemi_team_finalize(void) {
    /* Destroy all undestroyed teams */
    for (long i = 0; i < nvshmemi_max_teams; i++) {
        if (nvshmemi_team_pool[i] != NULL) nvshmemi_team_destroy(nvshmemi_team_pool[i]);
    }

    free(nvshmemi_team_pool);
    nvshmemi_team_pool = NULL;
    CUDA_RUNTIME_CHECK(cudaFree(nvshmemi_device_team_pool));

    nvshmemi_free(nvshmemi_psync_pool);
    nvshmemi_free(nvshmemi_sync_counter);

    free(psync_pool_avail);
    nvshmemi_free(device_psync_pool_avail);
    free(team_ret_val);
    nvshmemi_free(device_team_ret_val);
    cudaFree(nvshmemi_device_team_world);
    cudaFree(nvshmemi_device_team_shared);
    cudaFree(nvshmemi_device_team_node);
    cudaFree(nvshmemi_device_team_same_mype_node);
    cudaFree(nvshmemi_device_team_same_gpu);
    cudaFree(nvshmemi_device_team_gpu_leaders);

    return 0;
}

int nvshmemi_team_split_strided(nvshmemi_team_t *parent_team, int PE_start, int PE_stride,
                                int PE_size, const nvshmem_team_config_t *config, long config_mask,
                                nvshmem_team_t *new_team) {
    *new_team = NVSHMEM_TEAM_INVALID;
    nvshmem_barrier(parent_team->team_idx);

    int global_PE_start = nvshmemi_team_pe(parent_team, PE_start);
    int global_PE_stride = parent_team->stride * PE_stride;
    int global_PE_end = global_PE_start + global_PE_stride * (PE_size - 1);

    if (PE_start < 0 || PE_start >= parent_team->size || PE_size <= 0 ||
        PE_size > parent_team->size || PE_stride < 1) {
        NVSHMEMI_WARN_PRINT(
            "Invalid <start, stride, size>: child <%d, %d, %d>, parent <%d, %d, %d>\n", PE_start,
            PE_stride, PE_size, parent_team->start, parent_team->stride, parent_team->size);
        return -1;
    }

    if (global_PE_start >= nvshmemi_state->npes || global_PE_end >= nvshmemi_state->npes) {
        NVSHMEMI_WARN_PRINT("Starting PE (%d) or ending PE (%d) is invalid\n", global_PE_start,
                            global_PE_end);
        return -1;
    }

    /* idx in new team: */
    int my_pe =
        nvshmemi_pe_in_active_set(nvshmemi_state->mype, global_PE_start, global_PE_stride, PE_size);

    long *psync_reduce = nvshmemi_team_get_psync(parent_team, REDUCE);
    long *psync = &nvshmemi_team_get_psync(parent_team, SYNC)[NVSHMEMI_SYNC_SIZE];
    long *sync_counter = &nvshmemi_team_get_sync_counter(parent_team)[1];
    nvshmemi_team_t *myteam = NULL;
    *team_ret_val = 0;
    *team_ret_val_reduced = 0;

    if (my_pe >= 0) {
        char bit_str[NVSHMEMI_DIAG_STRLEN];

        myteam = (nvshmemi_team_t *)calloc(1, sizeof(nvshmemi_team_t));
        (*myteam) = NVSHMEMI_TEAM_INITIALIZER;

        myteam->my_pe = my_pe;
        myteam->start = global_PE_start;
        myteam->stride = global_PE_stride;
        myteam->size = PE_size;
        myteam->rdxn_count = 0;
        myteam->ll_flag = 1;
        myteam->alltoall_count = 0;
        myteam->bcast_count = 0;
        myteam->bcast_sync_offset = 0;
        myteam->fcollect_count = 0;

        if (config) {
            myteam->config = *config;
            myteam->config_mask = config_mask;
        }
        myteam->team_idx = -1;
        nvshmemi_bit_to_string(bit_str, NVSHMEMI_DIAG_STRLEN, psync_pool_avail, N_PSYNC_BYTES);

        CUDA_RUNTIME_CHECK(cudaMemcpy(device_psync_pool_avail, psync_pool_avail, N_PSYNC_BYTES,
                                      cudaMemcpyHostToDevice));
        CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
        nvshmemi_call_reduce_kernel<unsigned char, RDXN_OPS_AND>(
            myteam->start, myteam->stride, myteam->size,
            (unsigned char *)device_psync_pool_avail_reduced,
            (const unsigned char *)device_psync_pool_avail, N_PSYNC_BYTES,
            (unsigned char *)psync_reduce, (long *)(psync), sync_counter);

        CUDA_RUNTIME_CHECK(cudaMemcpy(psync_pool_avail_reduced, device_psync_pool_avail_reduced,
                                      N_PSYNC_BYTES, cudaMemcpyDeviceToHost));

        /* We cannot release the psync here, because this reduction may not
         * have been performed on the entire parent team. */
        nvshmemi_bit_to_string(bit_str, NVSHMEMI_DIAG_STRLEN, psync_pool_avail_reduced,
                               N_PSYNC_BYTES);

        /* Select the least significant nonzero bit, which corresponds to an available pSync. */
        myteam->team_idx = nvshmemi_bit_1st_nonzero(psync_pool_avail_reduced, N_PSYNC_BYTES);
        NVSHMEMI_TEAM_DUP_INITIALIZER(*myteam, myteam->team_idx);

        nvshmemi_bit_to_string(bit_str, NVSHMEMI_DIAG_STRLEN, psync_pool_avail_reduced,
                               N_PSYNC_BYTES);
        if (myteam->team_idx == -1 || myteam->team_idx >= (int)nvshmemi_max_teams) {
            NVSHMEMI_WARN_PRINT(
                "No more teams available (max = %ld), try setting NVSHMEM_MAX_TEAMS environment "
                "variable\n",
                nvshmemi_max_teams);
            /* No psync was available, but we must still call the barrier across the parent team
             * before returning. */
            myteam->team_idx = -1;
            *team_ret_val = 1;
        } else {
            /* Set the selected psync bit to 0, reserving that slot */
            nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, myteam->team_idx);

            *new_team = myteam->team_idx;

            nvshmemi_team_pool[myteam->team_idx] = myteam;
            nvshmemi_team_t *device_team_addr;
            CUDA_RUNTIME_CHECK(cudaMalloc((void **)&device_team_addr, sizeof(nvshmemi_team_t)));
            nvshmemi_team_set_p2p_connectivity(myteam);
            nvshmemi_recexchalgo_get_neighbors(myteam);
            CUDA_RUNTIME_CHECK(cudaMemcpy(device_team_addr, myteam, sizeof(nvshmemi_team_t),
                                          cudaMemcpyHostToDevice));
            CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[myteam->team_idx],
                                          &device_team_addr, sizeof(nvshmemi_team_t *),
                                          cudaMemcpyHostToDevice));
            CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
#ifdef NVSHMEM_USE_NCCL
            if (nvshmemi_use_nccl) nvshmemi_team_init_nccl_comm(myteam);
#endif
            /*
             * Reuse the parent's NVLS resource if the teams are identical;
             * otherwise create a new NVLS resource for p2p-connected teams.
             */
            if (nvshmemi_team_is_identical(myteam, parent_team) &&
                nvshmemi_team_is_nvls_capable(myteam) && !nvshmemi_options.DISABLE_NVLS_SHARING) {
                myteam->nvls_rsc = parent_team->nvls_rsc; /* Inherit the parent team's nvls_rsc */
                if (myteam->nvls_rsc) {
                    nvshmemi_nvls_rsc *nvls =
                        reinterpret_cast<nvshmemi_nvls_rsc *>(myteam->nvls_rsc);
                    nvls->add_refcount();
                    INFO(NVSHMEM_TEAM,
                         "Successful NVLS resource sharing for new team ID: %d (parent ID: %d)\n",
                         myteam->team_idx, parent_team->team_idx);
                }

            } else if (nvshmemi_team_is_nvls_capable(myteam)) {
                if (nvshmemi_team_nvls_setup(myteam) != 0) {
                    NVSHMEMI_WARN_PRINT("NVLS resource setup failed for team ID: %d\n",
                                        myteam->team_idx);
                    return -1;
                }

                INFO(NVSHMEM_TEAM, "Successful NVLS resource setup for team ID: %d\n",
                     myteam->team_idx);
            } else {
                myteam->nvls_rsc = nullptr; /* NVLS not supported, so no resource created/bound */
            }

/* Build team_node */
|
|
myteam->is_team_node = false;
|
|
int i;
|
|
for (i = 1; i < myteam->size; i++) {
|
|
if (nvshmemi_host_hashes[myteam->start] !=
|
|
nvshmemi_host_hashes[myteam->start + i * myteam->stride]) {
|
|
break;
|
|
}
|
|
}
|
|
if (i == myteam->size) myteam->is_team_node = true;
            myteam->is_team_same_mype_node = true;
            for (int i = 0; i < myteam->size; i++) {
                for (int j = i + 1; j < myteam->size; j++) {
                    if (nvshmemi_host_hashes[myteam->start + i * myteam->stride] ==
                        nvshmemi_host_hashes[myteam->start + j * myteam->stride]) {
                        myteam->is_team_same_mype_node = false;
                    }
                }
            }

            /* Count the PEs of the new team that are on this PE's node. */
            int team_npes_node = 0;
            for (int i = 0; i < myteam->size; i++) {
                if (nvshmemi_team_translate_pe(myteam, i, &nvshmemi_team_node) != -1) {
                    team_npes_node++;
                }
            }
            if (!myteam->is_team_node && !myteam->is_team_same_mype_node) {
                /* Repurpose the device_psync_pool_avail symmetric memory to compute the maximum
                 * of team_npes_node across the team. */
                assert(sizeof(int) <= N_PSYNC_BYTES);
                CUDA_RUNTIME_CHECK(cudaMemcpy(device_psync_pool_avail, &team_npes_node, sizeof(int),
                                              cudaMemcpyHostToDevice));
                CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
                nvshmemi_call_reduce_kernel<int, RDXN_OPS_MAX>(
                    myteam->start, myteam->stride, myteam->size,
                    (int *)device_psync_pool_avail_reduced, (const int *)device_psync_pool_avail, 1,
                    (int *)psync_reduce, (long *)(psync), sync_counter);

                CUDA_RUNTIME_CHECK(cudaMemcpy(&team_npes_node, device_psync_pool_avail_reduced,
                                              sizeof(int), cudaMemcpyDeviceToHost));
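                /* Using the team-wide maximum as the x-range means every PE derives the same
                 * decomposition into per-node (x-axis) and cross-node (y-axis) sub-teams. */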
                nvshmemi_team_split_2d(myteam, team_npes_node, NULL, 0, &myteam->team_node, NULL, 0,
                                       &myteam->team_same_mype_node);
            }
            CUDA_RUNTIME_CHECK(cudaMemcpy(device_team_addr, myteam, sizeof(nvshmemi_team_t),
                                          cudaMemcpyHostToDevice));
            CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
        }
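        /* Re-initialize the sync counter and psync region that were used above for the
         * creation-time reductions, so later collectives start from a clean state. */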
        nvshmemi_call_init_array_kernel<long>(sync_counter, 1, 1);
        nvshmemi_call_init_array_kernel<long>(psync, NVSHMEMI_SYNC_SIZE, NVSHMEMI_SYNC_VALUE);
        CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
    }

    /* This barrier on the parent team eliminates problematic race conditions
     * during psync allocation between back-to-back team creations. */
    nvshmem_quiet();
    // nvshmem_barrier(parent_team->start, parent_team->stride, parent_team->size, psync);
    nvshmem_team_sync(parent_team->team_idx);
    /* This MAX reduction ensures that all PEs return the same value. */
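    /* For example, if even one PE failed to find a free pSync slot it contributes 1, so after the
     * reduction every PE observes a nonzero value and the split reports failure everywhere. */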
    CUDA_RUNTIME_CHECK(
        cudaMemcpy(device_team_ret_val, team_ret_val, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());
    nvshmemi_call_rdxn_on_stream_kernel<int, RDXN_OPS_MAX>(
        parent_team->team_idx, device_team_ret_val_reduced, device_team_ret_val, 1,
        nvshmemi_state->my_stream);
    CUDA_RUNTIME_CHECK(cudaStreamSynchronize(nvshmemi_state->my_stream));
    CUDA_RUNTIME_CHECK(cudaMemcpy(team_ret_val_reduced, device_team_ret_val_reduced, sizeof(int),
                                  cudaMemcpyDeviceToHost));

    /* If no team was available, print some team triplet info and return nonzero. */
    if (myteam != NULL && myteam->team_idx == -1) {
        NVSHMEMI_WARN_PRINT("Team split strided failed: child <%d, %d, %d>, parent <%d, %d, %d>\n",
                            global_PE_start, global_PE_stride, PE_size, parent_team->start,
                            parent_team->stride, parent_team->size);
        /* TODO: In the event one of the PEs fails to create the team, do we need to revert the team
         * on all of the other ones? */
        free(myteam);
    }

    return *team_ret_val_reduced;
}

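/* Split the parent team into a 2D grid: x-axis teams are contiguous blocks of up to xrange PEs
 * (stride 1), and y-axis teams take every xrange-th PE (stride xrange). Each PE returns the one
 * x-axis team and the one y-axis team it belongs to. Illustrative example (not taken from the
 * code): a parent of 8 PEs with xrange = 4 yields two x-axis teams of 4 PEs each and four y-axis
 * teams of 2 PEs each. */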
int nvshmemi_team_split_2d(nvshmemi_team_t *parent_team, int xrange,
                           const nvshmem_team_config_t *xaxis_config, long xaxis_mask,
                           nvshmem_team_t *xaxis_team, const nvshmem_team_config_t *yaxis_config,
                           long yaxis_mask, nvshmem_team_t *yaxis_team) {
    *xaxis_team = NVSHMEM_TEAM_INVALID;
    *yaxis_team = NVSHMEM_TEAM_INVALID;

    if (xrange > parent_team->size) {
        xrange = parent_team->size;
    }

    const int parent_size = parent_team->size;
    const int num_xteams = ceil(parent_size / (float)xrange);
    const int num_yteams = xrange;

    int start = 0;
    int ret = 0;

    for (int i = 0; i < num_xteams; i++) {
        nvshmem_team_t my_xteam;
        int xsize = (i == num_xteams - 1 && parent_size % xrange) ? parent_size % xrange : xrange;
        ret = nvshmemi_team_split_strided(parent_team, start, 1, xsize, xaxis_config, xaxis_mask,
                                          &my_xteam);
        if (ret) {
            NVSHMEMI_ERROR_PRINT("Creation of x-axis team %d of %d failed\n", i + 1, num_xteams);
        }
        start += xrange;

        if (my_xteam != NVSHMEM_TEAM_INVALID) {
            assert(*xaxis_team == NVSHMEM_TEAM_INVALID);
            *xaxis_team = my_xteam;
        }
    }

    start = 0;

    for (int i = 0; i < num_yteams; i++) {
        nvshmem_team_t my_yteam;
        int remainder = parent_size % xrange;
        int yrange = parent_size / xrange;
        int ysize = (remainder && i < remainder) ? yrange + 1 : yrange;

        ret = nvshmemi_team_split_strided(parent_team, start, xrange, ysize, yaxis_config,
                                          yaxis_mask, &my_yteam);
        if (ret) {
            NVSHMEMI_ERROR_PRINT("Creation of y-axis team %d of %d failed\n", i + 1, num_yteams);
        }
        start += 1;

        if (my_yteam != NVSHMEM_TEAM_INVALID) {
            assert(*yaxis_team == NVSHMEM_TEAM_INVALID);
            *yaxis_team = my_yteam;
        }
    }

    nvshmem_quiet();
    nvshmem_team_sync(parent_team->team_idx);

    return 0;
}

static bool inline nvshmemi_is_rsvd_teams(nvshmem_team_t team_idx) {
    /* These team resources shouldn't be deleted, as they are used by the collective APIs during
     * init/finalize */
    return ((team_idx == NVSHMEM_TEAM_INVALID) ||
            (team_idx >= NVSHMEM_TEAM_WORLD_INDEX && team_idx < NVSHMEM_TEAMS_MIN));
}

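/* Collective teardown of a team: recursively destroy its node / same-mype-node sub-teams (unless
 * they are reserved teams), return the team's pSync slot to the pool, reset its psync and
 * sync-counter regions, release NVLS and NCCL resources, and free the host- and device-side team
 * descriptors. */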
void nvshmemi_team_destroy(nvshmemi_team_t *team) {
    int idx = team->team_idx;
    if (nvshmemi_bit_fetch(psync_pool_avail, idx)) {
        NVSHMEMI_ERROR_PRINT("Destroying a team without an active pSync\n");
    }
    /* Since it is a collective routine, perform a barrier */
    // nvshmem_barrier(idx);

    if (!team->is_team_node) {
        if (!nvshmemi_is_rsvd_teams(team->team_node) &&
            nvshmemi_team_pool[team->team_node] != NULL) {
            INFO(NVSHMEM_COLL, "Destroy sub-team 1 [%p] at index[%d] for parent-team [%p]",
                 nvshmemi_team_pool[team->team_node], team->team_node, team);
            nvshmemi_team_destroy(nvshmemi_team_pool[team->team_node]);
        }
    }

    if (!team->is_team_same_mype_node) {
        if (!nvshmemi_is_rsvd_teams(team->team_same_mype_node) &&
            nvshmemi_team_pool[team->team_same_mype_node] != NULL) {
            INFO(NVSHMEM_COLL, "Destroy sub-team 2 [%p] at index[%d] for parent-team [%p]",
                 nvshmemi_team_pool[team->team_same_mype_node], team->team_same_mype_node, team);
            nvshmemi_team_destroy(nvshmemi_team_pool[team->team_same_mype_node]);
        }
    }

    nvshmemi_bit_set(psync_pool_avail, N_PSYNC_BYTES, idx);

    /* Fetch the device-side copy of the team descriptor before the device pool slot is cleared,
     * so that it can still be freed at the end of this routine. */
    nvshmemi_team_t *device_team_addr;
    CUDA_RUNTIME_CHECK(cudaMemcpy((void **)&device_team_addr, &nvshmemi_device_team_pool[idx],
                                  sizeof(nvshmemi_team_t *), cudaMemcpyDeviceToHost));

    nvshmemi_team_pool[idx] = NULL;
    CUDA_RUNTIME_CHECK(cudaMemset(&nvshmemi_device_team_pool[idx], 0, sizeof(nvshmemi_team_t *)));

    nvshmemi_call_init_array_kernel<long>(&nvshmemi_sync_counter[2 * idx], 2, 1);
    nvshmemi_call_init_array_kernel<long>(&nvshmemi_psync_pool[idx * get_psync_len_per_team()],
                                          get_psync_len_per_team(), NVSHMEMI_SYNC_VALUE);
    CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());

    nvshmemi_team_destroy_nvls(team);
    nvshmemi_recexchalgo_free_mem(team);
#ifdef NVSHMEM_USE_NCCL
    if (nvshmemi_use_nccl) NCCL_CHECK(nccl_ftable.CommDestroy((ncclComm_t)team->nccl_comm));
#endif
    if (team != &nvshmemi_team_world && team != &nvshmemi_team_shared &&
        team != &nvshmemi_team_node && team != &nvshmemi_team_same_mype_node &&
        team != &nvshmemi_team_same_gpu && team != &nvshmemi_team_gpu_leaders) {
        free(team);
    }
    CUDA_RUNTIME_CHECK(cudaFree(device_team_addr));
}

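/* Return a pointer into this team's slice of the psync pool for the given collective. The offsets
 * below imply a per-team layout (in longs) of: two sync regions, a double-buffered reduction
 * scratch area of reduce_scratch_size bytes, a broadcast region, an fcollect region, two alltoall
 * regions, and an fcollect_128 region. */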
long *nvshmemi_team_get_psync(nvshmemi_team_t *team, nvshmemi_team_op_t op) {
    long *team_psync;
    size_t psync_fcollect_len;
    psync_fcollect_len = get_fcollect_psync_len_per_team();
    team_psync = &nvshmemi_psync_pool[team->team_idx * get_psync_len_per_team()];
    switch (op) {
        case SYNC:
            return team_psync;
        case REDUCE:
            return &team_psync
                [2 * NVSHMEMI_SYNC_SIZE +
                 (((nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / 2) /
                   sizeof(long)) *
                  (team->rdxn_count % 2))];
        case BCAST:
            return &team_psync[2 * NVSHMEMI_SYNC_SIZE +
                               nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size /
                                   sizeof(long)];
        case FCOLLECT:
            return &team_psync[2 * NVSHMEMI_SYNC_SIZE +
                               nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size /
                                   sizeof(long) +
                               NVSHMEMI_BCAST_SYNC_SIZE];
        case ALLTOALL:
            return &team_psync[2 * NVSHMEMI_SYNC_SIZE +
                               nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size /
                                   sizeof(long) +
                               NVSHMEMI_BCAST_SYNC_SIZE + psync_fcollect_len +
                               (NVSHMEMI_ALLTOALL_SYNC_SIZE * (team->alltoall_count % 2))];
        case FCOLLECT_128:
            return &team_psync[2 * NVSHMEMI_SYNC_SIZE +
                               nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size /
                                   sizeof(long) +
                               NVSHMEMI_BCAST_SYNC_SIZE + psync_fcollect_len +
                               2 * NVSHMEMI_ALLTOALL_SYNC_SIZE];
        default:
            WARN("Incorrect argument to nvshmemi_team_get_psync\n");
            return NULL;
    }
}

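/* Each team owns two adjacent slots in the sync-counter array; return the base of this team's
 * pair. */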
long *nvshmemi_team_get_sync_counter(nvshmemi_team_t *team) {
    return &nvshmemi_sync_counter[2 * team->team_idx];
}