/*
 * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * See COPYRIGHT for license information
 */

#include "team_internal.h"
#include <assert.h>        // for assert
#include <cuda_runtime.h>  // for cudaMemcpy
#include <driver_types.h>  // for cudaMemcpyHos...
#include <limits.h>        // for CHAR_BIT
#include <stdint.h>        // for uint64_t
#include <stdio.h>         // for snprintf, printf
#include <stdlib.h>        // for free, malloc
#include <string.h>        // for memset, memcmp
#include <math.h>          // for ceil
#include "../coll/rdxn/rdxn.h"                            // for nvshmemi_call...
#include "device_host/nvshmem_types.h"                    // for nvshmemi_team_t
#include "cpu_coll.h"                                     // for nccl_ftable
#include "device_host/nvshmem_common.cuh"                 // for nvshmemi_pe_i...
#include "bootstrap_host_transport/env_defs_internal.h"   // for nvshmemi_opti...
#include "host/nvshmem_api.h"                             // for nvshmem_quiet
#include "host/nvshmem_coll_api.h"                        // for nvshmem_team_...
#include "host/nvshmemx_api.h"                            // for nvshmemx_char...
#include "non_abi/nvshmemx_error.h"                       // for NVSHMEMI_NULL...
#include "internal/host/debug.h"                          // for INFO, NVSHMEM...
#include "internal/host/nvshmem_internal.h"               // for nvshmemi_free
#include "internal/host/nvshmemi_coll.h"                  // for nvshmemi_barrier
#include "internal/host/nvshmemi_symmetric_heap.hpp"      // for nvshmemi_symm...
#include "internal/host/nvshmemi_team.h"                  // for N_PSYNC_BYTES
#include "internal/host/nvshmemi_types.h"                 // for nvshmemi_state
#include "internal/host/util.h"                           // for CUDA_RUNTIME_...
#include "internal/bootstrap_host_transport/nvshmemi_bootstrap_defines.h"  // for nvshmemi_boot...
#include "internal/host_transport/transport.h"            // for nvshmem_trans...
#include "non_abi/nvshmem_build_options.h"                // for NVSHMEM_USE_NCCL
#include "internal/host/nvshmemi_nvls_rsc.hpp"            // for nvshmemi_nvls...
#ifdef NVSHMEM_USE_NCCL
#include "nccl.h"  // for ncclUniqueId
#endif
#include "internal/host_transport/nvshmemi_transport_defines.h"  // for NVSHMEM_MEM_H...

using namespace nvls;

#define NVSHMEMI_DIAG_STRLEN 1024

#define NVSHMEMI_SYNC_VALUE 0
#define NVSHMEMI_REDUCE_MAX_CTA_COUNT 64

/* The 0th entry in a team's duplicate resources is the same team as the encapsulating team.
 * This allows reuse of the same business logic for nCTA == 1 and nCTA > 1 and minimizes
 * if/else branching. */
#define NVSHMEMI_TEAM_DUP_INITIALIZER(teami, team_idx) \
    (teami).team_dups[0] = (team_idx);                 \
    for (int i = 1; i < 128; i++) {                    \
        (teami).team_dups[i] = NVSHMEM_TEAM_INVALID;   \
    }

long nvshmemi_max_teams;

nvshmemi_team_t nvshmemi_team_world;
nvshmemi_team_t nvshmemi_team_shared;
nvshmemi_team_t nvshmemi_team_node;
nvshmemi_team_t nvshmemi_team_same_mype_node;
nvshmemi_team_t nvshmemi_team_same_gpu;
nvshmemi_team_t nvshmemi_team_gpu_leaders;

nvshmemi_team_t *nvshmemi_device_team_world, *nvshmemi_device_team_shared,
    *nvshmemi_device_team_node, *nvshmemi_device_team_same_mype_node,
    *nvshmemi_device_team_same_gpu, *nvshmemi_device_team_gpu_leaders;

nvshmemi_team_t **nvshmemi_team_pool;
long *nvshmemi_psync_pool;
long *nvshmemi_sync_counter;

nvshmemi_team_t **nvshmemi_device_team_pool;

static unsigned char *psync_pool_avail;
static unsigned char *psync_pool_avail_reduced;
static unsigned char *device_psync_pool_avail;
static unsigned char *device_psync_pool_avail_reduced;

static int *team_ret_val;
static int *team_ret_val_reduced;
static int *device_team_ret_val;
static int *device_team_ret_val_reduced;

bool nvshmemi_team_support_nvls(nvshmemi_team_t *team) {
    return ((team->are_gpus_p2p_connected) && (team->nvls_rsc != nullptr));
}

bool nvshmemi_team_is_owner_nvls(nvshmemi_team_t *team) {
    nvshmemi_nvls_rsc *nvls = reinterpret_cast<nvshmemi_nvls_rsc *>(team->nvls_rsc);
    INFO(NVSHMEM_TEAM, "Team ID: %d NVLS Resource Owner ID: %d\n", team->team_idx,
         nvls->get_owner());
    return (nvls->is_owner(team));
}

static bool nvshmemi_team_is_nvls_capable(nvshmemi_team_t *team) {
    return (team->are_gpus_p2p_connected && team->size >= 2 && nvshmemi_state->is_platform_nvls);
}

static bool nvshmemi_team_is_identical(nvshmemi_team_t *t1, nvshmemi_team_t *t2) {
    return (t1->start == t2->start && t1->stride == t2->stride && t1->size == t2->size);
}

static void nvshmemi_team_alloc_device(void) {
    CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_world, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_shared, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(cudaMalloc((void **)&nvshmemi_device_team_node, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(
        cudaMalloc((void **)&nvshmemi_device_team_same_mype_node, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(
        cudaMalloc((void **)&nvshmemi_device_team_same_gpu, sizeof(nvshmemi_team_t)));
    CUDA_RUNTIME_CHECK(
        cudaMalloc((void **)&nvshmemi_device_team_gpu_leaders, sizeof(nvshmemi_team_t)));
}

static void nvshmemi_team_update_device(void) {
    CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_world, &nvshmemi_team_world,
                                  sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_WORLD_INDEX],
                                  &nvshmemi_device_team_world, sizeof(nvshmemi_team_t *),
                                  cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_shared, &nvshmemi_team_shared,
                                  sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_SHARED_INDEX],
                                  &nvshmemi_device_team_shared, sizeof(nvshmemi_team_t *),
                                  cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_node, &nvshmemi_team_node,
                                  sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice));
    CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_NODE_INDEX],
                                  &nvshmemi_device_team_node, sizeof(nvshmemi_team_t *),
                                  cudaMemcpyHostToDevice));
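    /* The remaining predefined teams are mirrored with the same two-step pattern: copy the
     * updated host-side descriptor into its device-resident copy, then publish that copy's
     * address in the device-side team pool. */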
CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_same_mype_node, &nvshmemi_team_same_mype_node, sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX], &nvshmemi_device_team_same_mype_node, sizeof(nvshmemi_team_t *), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_same_gpu, &nvshmemi_team_same_gpu, sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_SAME_GPU_INDEX], &nvshmemi_device_team_same_gpu, sizeof(nvshmemi_team_t *), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaMemcpy(nvshmemi_device_team_gpu_leaders, &nvshmemi_team_gpu_leaders, sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[NVSHMEM_TEAM_GPU_LEADERS_INDEX], &nvshmemi_device_team_gpu_leaders, sizeof(nvshmemi_team_t *), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); } static void nvshmemi_recexchalgo_get_neighbors(nvshmemi_team_t *teami) { int i, j, k; int p_of_k = 1, log_p_of_k = 0, rem, T, newpe; int step1_sendto, step1_nrecvs, step2_nphases; int *step1_recvfrom, **step2_nbrs; int *step1_recvfrom_device, **step2_nbrs_device; int my_pe = teami->my_pe; int num_pes = teami->size; INFO(NVSHMEM_COLL, "step 1 nbr calculation started, num_pes = %d", num_pes); k = nvshmemi_options.REDUCE_RECEXCH_KVAL; assert(k > 1); if (num_pes < k) /* If size of the active set is less than k, reduce the value of k */ k = (num_pes > 2) ? num_pes : 2; /* Calculate p_of_k, p_of_k is the largest power of k that is less than num_pes */ while (p_of_k <= num_pes) { p_of_k *= k; log_p_of_k++; } p_of_k /= k; /* protect against underflow warnings when asserts are disabled. */ if (log_p_of_k > 0) { log_p_of_k--; } step2_nphases = log_p_of_k; step1_recvfrom = (int *)malloc(sizeof(int) * (k - 1)); assert(step1_recvfrom); step2_nbrs = (int **)malloc(sizeof(int *) * step2_nphases); assert(step2_nbrs); for (int i = 0; i < step2_nphases; i++) { step2_nbrs[i] = (int *)malloc(sizeof(int) * (k - 1)); assert(step2_nbrs[i]); } rem = num_pes - p_of_k; /* rem is the number of PEs that do not particpate in Step 2 * We need to identify these non-participating PEs. This is done in the following way. * The first T PEs are divided into sets of k consecutive PEs each. * In each of these sets, the first k-1 PEs are the non-participating * PEs while the last PE is the participating PE. * The non-participating PEs send their data to the participating PE * in their corresponding set. */ T = (rem * k) / (k - 1); INFO(NVSHMEM_COLL, "step 1 nbr calculation started. 
T is %d", T); step1_nrecvs = 0; step1_sendto = -1; /* Step 1 */ if (my_pe < T) { if (my_pe % k != (k - 1)) { /* I am a non-participating PE */ step1_sendto = my_pe + (k - 1 - my_pe % k); /* partipating PE to send the data to */ /* if the corresponding participating PE is not in T, * then send to the Tth PE to preserve non-commutativity */ if (step1_sendto > T - 1) step1_sendto = T; newpe = -1; /* tag this PE as non-participating */ } else { /* participating PE */ for (i = 0; i < k - 1; i++) { step1_recvfrom[i] = my_pe - i - 1; } step1_nrecvs = k - 1; newpe = my_pe / k; /* this is the new PE amongst the set of participating PEs */ } } else { /* PE >= T */ newpe = my_pe - rem; if (my_pe == T && (T - 1) % k != k - 1 && T >= 1) { int nsenders = (T - 1) % k + 1; /* number of PEs sending their data to me in Step 1 */ for (j = nsenders - 1; j >= 0; j--) { step1_recvfrom[nsenders - 1 - j] = T - nsenders + j; } step1_nrecvs = nsenders; } } INFO(NVSHMEM_COLL, "step 1 nbr computation completed"); /* Step 2 */ if (step1_sendto == -1) { /* calulate step2_nbrs only for participating PEs */ int *digit = (int *)malloc(sizeof(int) * step2_nphases); assert(digit != NULL); int temppe = newpe; int mask = 0x1; int phase = 0, cbit, cnt, nbr, power; /* calculate the digits in base k representation of newpe */ for (i = 0; i < log_p_of_k; i++) digit[i] = 0; int remainder, i_digit = 0; while (temppe != 0) { remainder = temppe % k; temppe = temppe / k; digit[i_digit] = remainder; i_digit++; } while (mask < p_of_k) { cbit = digit[phase]; /* phase_th digit changes in this phase, obtain its original value */ cnt = 0; for (i = 0; i < k; i++) { /* there are k-1 neighbors */ if (i != cbit) { /* do not generate yourself as your nieighbor */ digit[phase] = i; /* this gets us the base k representation of the neighbor */ /* calculate the base 10 value of the neighbor PE */ nbr = 0; power = 1; for (j = 0; j < log_p_of_k; j++) { nbr += digit[j] * power; power *= k; } /* calculate its real PE and store it */ step2_nbrs[phase][cnt] = (nbr < rem / (k - 1)) ? 
(nbr * k) + (k - 1) : nbr + rem; cnt++; } } INFO(NVSHMEM_COLL, "step 2, phase %d nbr calculation completed", phase); digit[phase] = cbit; /* reset the digit to original value */ phase++; mask *= k; } free(digit); } // Update with global PE numbers if (step1_sendto != -1) step1_sendto = teami->start + step1_sendto * teami->stride; for (int i = 0; i < step1_nrecvs; i++) step1_recvfrom[i] = teami->start + step1_recvfrom[i] * teami->stride; for (int i = 0; i < step2_nphases; i++) for (int j = 0; j < k - 1; j++) step2_nbrs[i][j] = teami->start + step2_nbrs[i][j] * teami->stride; // Copy the data to device memory CUDA_RUNTIME_CHECK(cudaMalloc(&step1_recvfrom_device, sizeof(int) * (k - 1))); CUDA_RUNTIME_CHECK(cudaMalloc( &step2_nbrs_device, sizeof(int *) * (step2_nphases + 1))); /* + 1 to make it non-zero otherwise cuMemAlloc returns error when step2_nphases is 0 */ for (int i = 0; i < step2_nphases; i++) { void *dev_ptr; CUDA_RUNTIME_CHECK(cudaMalloc(&dev_ptr, sizeof(int) * (k - 1))); CUDA_RUNTIME_CHECK(cudaMemcpy((int **)step2_nbrs_device + i, &dev_ptr, sizeof(int *), cudaMemcpyHostToDevice)); } CUDA_RUNTIME_CHECK(cudaMemcpy(step1_recvfrom_device, step1_recvfrom, sizeof(int) * step1_nrecvs, cudaMemcpyHostToDevice)); void *dev_ptr, *dev_ptr_2; dev_ptr = step2_nbrs_device; for (int i = 0; i < step2_nphases; i++) { CUDA_RUNTIME_CHECK( cudaMemcpy(&dev_ptr_2, (int **)dev_ptr + i, sizeof(int *), cudaMemcpyDeviceToHost)); CUDA_RUNTIME_CHECK( cudaMemcpy(dev_ptr_2, step2_nbrs[i], sizeof(int) * (k - 1), cudaMemcpyHostToDevice)); } teami->reduce_recexch.step1_sendto = step1_sendto; teami->reduce_recexch.step1_nrecvs = step1_nrecvs; teami->reduce_recexch.step2_nphases = step2_nphases; teami->reduce_recexch.step1_recvfrom = step1_recvfrom_device; teami->reduce_recexch.step2_nbrs = step2_nbrs_device; free(step1_recvfrom); for (int i = 0; i < step2_nphases; i++) { if (step2_nbrs[i]) { free(step2_nbrs[i]); } } free(step2_nbrs); } static void nvshmemi_recexchalgo_free_mem(nvshmemi_team_t *teami) { CUDA_RUNTIME_CHECK(cudaFree(teami->reduce_recexch.step1_recvfrom)); for (int i = 0; i < teami->reduce_recexch.step2_nphases; i++) { void *dev_ptr; CUDA_RUNTIME_CHECK(cudaMemcpy(&dev_ptr, teami->reduce_recexch.step2_nbrs + i, sizeof(int *), cudaMemcpyDeviceToHost)); CUDA_RUNTIME_CHECK(cudaFree(dev_ptr)); } CUDA_RUNTIME_CHECK(cudaFree(teami->reduce_recexch.step2_nbrs)); } static inline void nvshmemi_bit_set(unsigned char *ptr, size_t size, size_t index) { assert(size > 0 && (index < size * CHAR_BIT)); size_t which_byte = index / CHAR_BIT; ptr[which_byte] |= (1 << (index % CHAR_BIT)); return; } static inline void nvshmemi_bit_clear(unsigned char *ptr, size_t size, size_t index) { assert(size > 0 && (index < size * CHAR_BIT)); size_t which_byte = index / CHAR_BIT; ptr[which_byte] &= ~(1 << (index % CHAR_BIT)); return; } static inline unsigned char nvshmemi_bit_fetch(unsigned char *ptr, size_t index) { return (ptr[index / CHAR_BIT] >> (index % CHAR_BIT)) & 1; } static inline size_t nvshmemi_bit_1st_nonzero(const unsigned char *ptr, const size_t size) { /* The following ignores endianess: */ for (size_t i = 0; i < size; i++) { unsigned char bit_val = ptr[i]; for (size_t j = 0; bit_val && j < CHAR_BIT; j++) { if (bit_val & 1) return i * CHAR_BIT + j; bit_val >>= 1; } } return (size_t)-1; } /* Create a bit string of the format AAAAAAAA.BBBBBBBB into str for the byte * array passed via ptr. 
*/ static inline void nvshmemi_bit_to_string(char *str, size_t str_size, unsigned char *ptr, size_t ptr_size) { size_t off = 0; for (size_t i = 0; i < ptr_size; i++) { for (size_t j = 0; j < CHAR_BIT; j++) { off += snprintf(str + off, str_size - off, "%s", (ptr[i] & (1 << (CHAR_BIT - 1 - j))) ? "1" : "0"); if (off >= str_size) return; } if (i < ptr_size - 1) { off += snprintf(str + off, str_size - off, "."); if (off >= str_size) return; } } } /* Checks whether a PE has a consistent stride given (start, stride, size). * This function is useful within a loop across PE IDs, and sets 'start', * 'stride' and 'size' accordingly upon exiting the loop. It also assumes * 'start' and 'stride' are initialized to a negative number and 'size' to 0. * If an inconsistent stride is found, returns -1. */ static inline int check_for_linear_stride(int pe, int *start, int *stride, int *size) { if (*start < 0) { *start = pe; (*size)++; } else if (*stride < 0) { *stride = pe - *start; (*size)++; } else if ((pe - *start) % *stride != 0) { NVSHMEMI_WARN_PRINT("Detected non-uniform stride inserting PE %d into <%d, %d, %d>\n", pe, *start, *stride, *size); return -1; } else { (*size)++; } return 0; } static inline int nvshmemi_pe_in_active_set(int global_pe, int PE_start, int PE_stride, int PE_size) { int n = (global_pe - PE_start) / PE_stride; if (global_pe < PE_start || (global_pe - PE_start) % PE_stride || n >= PE_size) return -1; else { return n; } } int nvshmemi_team_translate_pe(nvshmemi_team_t *src_team, int src_pe, nvshmemi_team_t *dest_team) { int src_pe_world, dest_pe = -1; if (src_pe > src_team->size) return -1; src_pe_world = src_team->start + src_pe * src_team->stride; assert(src_pe_world >= src_team->start && src_pe_world < nvshmemi_state->npes); dest_pe = nvshmemi_pe_in_active_set(src_pe_world, dest_team->start, dest_team->stride, dest_team->size); return dest_pe; } static inline size_t get_fcollect_psync_len_per_team() { size_t fcollect_ll_threshold = nvshmemi_device_state.gpu_coll_env_params_var.fcollect_ll_threshold; size_t fcollect_sync_size = (2 * 2 * nvshmemi_state->npes * fcollect_ll_threshold) / sizeof(long); assert(fcollect_ll_threshold % sizeof(long) == 0); return fcollect_sync_size; } static inline size_t get_fcollect_ll128_psync_len_per_team() { size_t fcollect_ll128_threshold = nvshmemi_device_state.gpu_coll_env_params_var.fcollect_ll128_threshold; size_t fcollect_ll128_sync_size = NVSHMEMI_FCOLLECT_LL128_CALC_PSYNC_SIZE(fcollect_ll128_threshold, char); /* scale for npes and two separate psyncs */ fcollect_ll128_sync_size = fcollect_ll128_sync_size * 2 * nvshmemi_state->npes / sizeof(long); return fcollect_ll128_sync_size; } static inline size_t get_psync_len_per_team() { size_t fcollect_sync_size = get_fcollect_psync_len_per_team(); size_t fcollect_ll128_sync_size = get_fcollect_ll128_psync_len_per_team(); /* sync: Two buffers are used - one for sync/barrier collective ops, the second one during team split operation reduce: Two pWrk's are used alternatively across consecutive reduce calls, this is to avoid having to put a barrier in between bcast: The buffer is split to do multiple consecutive broadcast, when all buffers are used, a barrier is called and then again we begin from the start of the buffer fcollect: Two sets of buffer are used to alternate between - same way as in reduce. 
The other fator of 2 is because when using LL double the space is needed to fuse flag with data */ size_t ans = (2 * NVSHMEMI_SYNC_SIZE + nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / sizeof(long) + NVSHMEMI_BCAST_SYNC_SIZE + fcollect_sync_size + 2 * NVSHMEMI_ALLTOALL_SYNC_SIZE + fcollect_ll128_sync_size); return ans; } size_t nvshmemi_get_teams_mem_requirement() { size_t psync_size = get_psync_len_per_team(); size_t teams_mem_req = sizeof(long) * nvshmemi_max_teams * psync_size + /* psync's */ 2 * N_PSYNC_BYTES + /* psync_pool_avail */ 2 * sizeof(int) + /* team_ret_val */ 2 * sizeof(long) * nvshmemi_max_teams /* storing counters */ #ifdef NVSHMEM_USE_NCCL + sizeof(ncclUniqueId) #endif ; INFO(NVSHMEM_INIT, "team psync mem req %ld bytes, team mem total req %d bytes, max teams %ld\n", psync_size, teams_mem_req, nvshmemi_max_teams); return teams_mem_req; } #ifdef NVSHMEM_USE_NCCL void nvshmemi_team_init_nccl_comm(nvshmemi_team_t *teami) { ncclUniqueId Id; int start = teami->start; int stride = teami->stride; int size = teami->size; /* This is technical debt where we are using the REDUCE op psync as scratchpad for src/dst of * broadcast broadcast's psync is used for LL8 and other algorithms, making it non-trivial to * share when issued from the host as a src or dest buffer. * * When reduce coll supports LL8 algorithm, we need to clean this up as a independent scratch * space */ long *pWrk = nvshmemi_team_get_psync(teami, REDUCE); if (teami->my_pe == 0) { NCCL_CHECK(nccl_ftable.GetUniqueId(&Id)); CUDA_RUNTIME_CHECK(cudaMemcpy(pWrk, &Id, sizeof(ncclUniqueId), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); for (int i = 0; i < size; i++) { nvshmemx_char_put_nbi_on_stream((char *)pWrk, (const char *)pWrk, sizeof(ncclUniqueId), start + i * stride, (cudaStream_t)0); } CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); nvshmemi_barrier(teami->team_idx); } else { nvshmemi_barrier(teami->team_idx); CUDA_RUNTIME_CHECK(cudaMemcpy(&Id, pWrk, sizeof(ncclUniqueId), cudaMemcpyDeviceToHost)); } INFO(NVSHMEM_TEAM, "Calling ncclCommInitRank, teami->size = %d, teami->my_pe = %d", teami->size, teami->my_pe); NCCL_CHECK( nccl_ftable.CommInitRank((ncclComm_t *)&teami->nccl_comm, teami->size, Id, teami->my_pe)); } #endif /* NVSHMEM_USE_NCCL */ void nvshmemi_team_set_p2p_connectivity(nvshmemi_team_t *teami) { teami->are_gpus_p2p_connected = 1; for (int pe = teami->start; pe < teami->start + teami->stride * teami->size; pe += teami->stride) { if (nvshmemi_state->heap_obj->get_local_pe_base()[pe] == NULL) { teami->are_gpus_p2p_connected = 0; break; } } } /* NVLS Resource management for teams */ static void nvshmemi_team_destroy_nvls(nvshmemi_team_t *team) { if (team->nvls_rsc == nullptr) return; /* NOOP */ nvshmemi_nvls_rsc *nvls_obj = nullptr; nvls_obj = reinterpret_cast(team->nvls_rsc); if (nvls_obj->get_refcount() == 0) { /* Last reference */ nvshmemi_state->heap_obj->nvls_unmap_heap_memory_by_team(team); nvshmemi_state->heap_obj->nvls_unbind_heap_memory_by_team(team); nvls_obj->free_group_mem(); nvls_obj->release_owner(); delete nvls_obj; cudaFree(team->nvls_rsc_base_ptr); team->nvls_rsc = nullptr; INFO(NVSHMEM_TEAM, "NVLS Resource Destroyed for Team ID %d\n", team->team_idx); } else { nvls_obj->del_refcount(); /* Shared nvls resource */ /* Ownership of NVLS resource is necessary to allow for newly allocated UC memory to be * bound and mapped to MC heap */ if (nvls_obj->is_owner(team)) { // Transfer ownership to one of the dup teams NVSHMEMU_FOR_EACH_IF( i, nvshmemi_max_teams, 
nvshmemi_team_pool[i] != NULL && nvshmemi_team_support_nvls(nvshmemi_team_pool[i]), { // Find first duplicate team that shares the nvls rsc and make it the owner if (nvshmemi_team_pool[i]->nvls_rsc == team->nvls_rsc) { nvls_obj->release_owner(); nvls_obj->assign_owner(nvshmemi_team_pool[i]); break; } }) } } } static int nvshmemi_team_create_nvls(nvshmemi_team_t *team) { int status = -1; uint64_t mc_heap_base; nvshmemi_nvls_rsc *nvls_obj = nullptr; if (!nvshmemi_team_is_nvls_capable(team)) { team->nvls_rsc = nullptr; WARN("Skipping NVLINK SHARP resource initialized for team ID: %d\n", team->team_idx); return 0; } try { team->nvls_rsc = reinterpret_cast(new nvshmemi_nvls_rsc(team, nvshmemi_state)); } catch (nvshmemi_nvls_exception &exp) { WARN("NVLINK SHARP resource initialization failed for team ID: %d\n", team->team_idx); team->nvls_rsc = nullptr; return 0; } nvls_obj = reinterpret_cast(team->nvls_rsc); status = nvls_obj->reserve_group_mem(); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "Reserve multicast group mapping failed for pe %d\n", team->my_pe); nvls_obj->assign_owner(team); CUDA_RUNTIME_CHECK(cudaMalloc((void **)&team->nvls_rsc_base_ptr, sizeof(void *))); mc_heap_base = (uint64_t)(nvls_obj->get_mc_base()); CUDA_RUNTIME_CHECK( cudaMemcpy(team->nvls_rsc_base_ptr, &mc_heap_base, sizeof(void *), cudaMemcpyHostToDevice)); INFO(NVSHMEM_TEAM, "NVLS Resource Created for Team ID %d MC VA Base: %llx\n", team->team_idx, mc_heap_base); return (status); cleanup: (void)nvls_obj->free_group_mem(); delete nvls_obj; team->nvls_rsc = nullptr; return (status); } static int nvshmemi_team_bind_nvls(nvshmemi_team_t *team) { int status = -1; /* Make a MC handle as large as physical heap size */ nvshmemi_nvls_rsc *nvls_obj = nullptr; nvls_obj = reinterpret_cast(team->nvls_rsc); if (!nvls_obj->is_owner(team)) return 0; status = nvshmemi_state->heap_obj->nvls_create_heap_memory_by_team(team); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "Create multicast groups for UC heap failed for pe %d team ID %d\n", team->my_pe, team->team_idx); status = nvshmemi_state->heap_obj->nvls_bind_heap_memory_by_team(team); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "Binding multicast groups to UC heap failed for pe %d team ID %d\n", team->my_pe, team->team_idx); status = nvshmemi_state->heap_obj->nvls_map_heap_memory_by_team(team); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "Mapping multicast groups for UC heap failed for pe %d team ID %d\n", team->my_pe, team->team_idx); cleanup: return (status); } static int nvshmemi_team_nvls_setup(nvshmemi_team_t *team) { int status = 0; /* Initialize NVLS resources for team supporting P2P connected GPUs */ status = nvshmemi_team_create_nvls(team); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "NVLS resource initialization failed for team ID: %d\n", team->team_idx); /* Any prior UC allocations need to bound to this team's MC groups */ if (team->nvls_rsc != nullptr) { status = nvshmemi_team_bind_nvls(team); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "NVLS resource bind and mapping existing UC mappings to MC heap " "failed for team ID: %d\n", team->team_idx); } cleanup: return (status); } /* Team Management Routines */ int nvshmemi_set_max_teams(void) { nvshmemi_max_teams = nvshmemi_options.MAX_TEAMS; if (nvshmemi_max_teams < NVSHMEM_TEAMS_MIN) nvshmemi_max_teams = NVSHMEM_TEAMS_MIN; /* On NVLS sharp enabled platforms increase default max teams to 64 */ if 
(nvshmemi_state->is_platform_nvls) { nvshmemi_max_teams = (nvshmemi_max_teams > NVSHMEMI_REDUCE_MAX_CTA_COUNT) ? nvshmemi_max_teams : NVSHMEMI_REDUCE_MAX_CTA_COUNT; } if (nvshmemi_max_teams > N_PSYNC_BYTES * CHAR_BIT) { NVSHMEMI_ERROR_EXIT("Requested %ld teams, but only %d are supported\n", nvshmemi_max_teams, N_PSYNC_BYTES * CHAR_BIT); return 1; } return 0; } int nvshmemi_team_init(void) { long psync_len; int start, stride, size; int *scratch = NULL; int status = 0; uint64_t *hostHash = NULL; uint64_t myHostHash = 0; nvshmem_transport_pe_info_t *pe_info; int i; nvshmemi_team_world = NVSHMEMI_TEAM_INITIALIZER; nvshmemi_team_shared = NVSHMEMI_TEAM_INITIALIZER; nvshmemi_team_node = NVSHMEMI_TEAM_INITIALIZER; nvshmemi_team_same_mype_node = NVSHMEMI_TEAM_INITIALIZER; nvshmemi_team_same_gpu = NVSHMEMI_TEAM_INITIALIZER; nvshmemi_team_gpu_leaders = NVSHMEMI_TEAM_INITIALIZER; /* Initialize NVSHMEM_TEAM_WORLD */ nvshmemi_team_world.team_idx = NVSHMEM_TEAM_WORLD_INDEX; NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_world, NVSHMEM_TEAM_WORLD_INDEX); nvshmemi_team_world.start = 0; nvshmemi_team_world.stride = 1; nvshmemi_team_world.size = nvshmemi_state->npes; nvshmemi_team_world.my_pe = nvshmemi_state->mype; nvshmemi_team_world.rdxn_count = 0; nvshmemi_team_world.config_mask = 0; nvshmemi_team_world.ll_flag = 1; nvshmemi_team_world.alltoall_count = 0; nvshmemi_team_world.bcast_count = 0; nvshmemi_team_world.bcast_sync_offset = 0; nvshmemi_team_world.fcollect_count = 0; nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_world); nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_world); nvshmemi_team_world.is_team_node = false; nvshmemi_team_world.is_team_same_mype_node = false; /* Initialize NVSHMEM_TEAM_SHARED */ nvshmemi_team_shared.team_idx = NVSHMEM_TEAM_SHARED_INDEX; NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_shared, NVSHMEM_TEAM_SHARED_INDEX); /* Collect list of p2p connected PEs */ int *p2p_pe_list = (int *)malloc(nvshmemi_team_world.size * sizeof(int)); int n_p2p_pes = 0; int my_idx_in_p2p_list = 0; for (int i = 0; i < nvshmemi_team_world.size; i++) { if (nvshmemi_state->heap_obj->get_local_pe_base()[i]) { if (i == nvshmemi_team_world.my_pe) my_idx_in_p2p_list = n_p2p_pes; p2p_pe_list[n_p2p_pes++] = i; } } std::ostringstream ss; for (int i = 0; i < n_p2p_pes; i++) { ss << p2p_pe_list[i] << " "; } INFO(NVSHMEM_INIT, "P2P list: %s", ss.str().c_str()); /* Make sure that n_p2p_pes is same for all PEs to form TEAM_SHARED */ int *n_p2p_pes_all = (int *)malloc(nvshmemi_team_world.size * sizeof(int)); int *p2p_pe_list_all = (int *)malloc(sizeof(int) * n_p2p_pes * nvshmemi_team_world.size); nvshmemi_boot_handle.allgather((void *)&n_p2p_pes, (void *)n_p2p_pes_all, sizeof(int), &nvshmemi_boot_handle); for (i = 0; i < nvshmemi_team_world.size; i++) { if (n_p2p_pes_all[i] != n_p2p_pes) { INFO(NVSHMEM_INIT, "n_p2p_pes is not equal across PEs, setting NVSHMEM_TEAM_SHARED to self"); goto team_shared_single_pe; } } /* Gather p2p lists of all PEs and ensure they are the same */ nvshmemi_boot_handle.allgather((void *)p2p_pe_list, (void *)p2p_pe_list_all, sizeof(int) * n_p2p_pes, &nvshmemi_boot_handle); for (i = 0; i < n_p2p_pes; i++) { if (memcmp((void *)p2p_pe_list, (void *)&p2p_pe_list_all[p2p_pe_list[i] * n_p2p_pes], sizeof(int) * n_p2p_pes) != 0) { INFO(NVSHMEM_INIT, "P2P lists are not symmetric, setting NVSHMEM_TEAM_SHARED to self"); goto team_shared_single_pe; } } for (int i = 2; i < n_p2p_pes; i++) { if (p2p_pe_list[i] - p2p_pe_list[i - 1] != p2p_pe_list[i - 1] - p2p_pe_list[i - 2]) { INFO(NVSHMEM_INIT, "P2P 
list is not of the form (start, stride, size). Cannot form " "NVSHMEM_TEAM_SHARED."); goto team_shared_single_pe; } } nvshmemi_team_shared.my_pe = my_idx_in_p2p_list; nvshmemi_team_shared.start = p2p_pe_list[0]; nvshmemi_team_shared.stride = n_p2p_pes > 1 ? (p2p_pe_list[1] - p2p_pe_list[0]) : 1; nvshmemi_team_shared.size = n_p2p_pes; goto team_shared_setup; team_shared_single_pe: nvshmemi_team_shared.my_pe = 0; nvshmemi_team_shared.start = nvshmemi_state->mype; nvshmemi_team_shared.stride = 1; nvshmemi_team_shared.size = 1; nvshmemi_team_shared.is_team_node = true; nvshmemi_team_shared.is_team_same_mype_node = true; team_shared_setup: free(n_p2p_pes_all); free(p2p_pe_list_all); free(p2p_pe_list); nvshmemi_team_shared.rdxn_count = 0; nvshmemi_team_shared.config_mask = 0; nvshmemi_team_shared.ll_flag = 1; nvshmemi_team_shared.alltoall_count = 0; nvshmemi_team_shared.bcast_count = 0; nvshmemi_team_shared.bcast_sync_offset = 0; nvshmemi_team_shared.fcollect_count = 0; nvshmemi_team_shared.are_gpus_p2p_connected = 0; nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_shared); nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_shared); INFO(NVSHMEM_INIT, "NVSHMEM_TEAM_SHARED: start=%d, stride=%d, size=%d", nvshmemi_team_shared.start, nvshmemi_team_shared.stride, nvshmemi_team_shared.size); nvshmemi_team_shared.is_team_same_mype_node = false; /* Initialize NVSHMEMX_TEAM_NODE */ nvshmemi_team_node.team_idx = NVSHMEM_TEAM_NODE_INDEX; NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_node, NVSHMEM_TEAM_NODE_INDEX); nvshmemi_team_world.team_node = nvshmemi_team_node.team_idx; nvshmemi_team_node.my_pe = nvshmemi_state->mype_node; nvshmemi_team_node.rdxn_count = 0; nvshmemi_team_node.config_mask = 0; nvshmemi_team_node.ll_flag = 1; nvshmemi_team_node.alltoall_count = 0; nvshmemi_team_node.bcast_count = 0; nvshmemi_team_node.bcast_sync_offset = 0; nvshmemi_team_node.fcollect_count = 0; myHostHash = getHostHash(); hostHash = (uint64_t *)malloc(sizeof(uint64_t) * nvshmemi_state->npes); NVSHMEMI_NULL_ERROR_JMP(hostHash, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup, "hostHash allocation failed \n"); status = nvshmemi_boot_handle.allgather((void *)&myHostHash, (void *)hostHash, sizeof(uint64_t), &nvshmemi_boot_handle); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "allgather of host hashes failed\n"); /* Search for on-node peer PEs while checking for a consistent stride */ start = -1; stride = -1; size = 0; for (int pe = 0; pe < nvshmemi_state->npes; pe++) { if (hostHash[pe] != myHostHash) continue; int ret = check_for_linear_stride(pe, &start, &stride, &size); if (ret < 0) { start = nvshmemi_state->mype; stride = 1; size = 1; break; } } assert(start >= 0 && size > 0); nvshmemi_team_node.start = start; nvshmemi_team_node.stride = (stride == -1) ? 
1 : stride; nvshmemi_team_node.size = size; if (nvshmemi_team_is_identical(&nvshmemi_team_world, &nvshmemi_team_node)) { nvshmemi_team_world.is_team_node = true; } if (nvshmemi_team_is_identical(&nvshmemi_team_shared, &nvshmemi_team_node)) { nvshmemi_team_shared.is_team_node = true; } nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_node); nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_node); nvshmemi_team_node.is_team_node = true; nvshmemi_team_node.is_team_same_mype_node = false; INFO(NVSHMEM_INIT, "NVSHMEMX_TEAM_NODE: start=%d, stride=%d, size=%d", nvshmemi_team_node.start, nvshmemi_team_node.stride, nvshmemi_team_node.size); /* Initialize NVSHMEMX_TEAM_SAME_MYPE_NODE */ nvshmemi_team_same_mype_node.team_idx = NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX; NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_same_mype_node, NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX); nvshmemi_team_world.team_same_mype_node = nvshmemi_team_same_mype_node.team_idx; nvshmemi_team_same_mype_node.my_pe = nvshmemi_state->mype / nvshmemi_state->npes_node; nvshmemi_team_same_mype_node.rdxn_count = 0; nvshmemi_team_same_mype_node.config_mask = 0; nvshmemi_team_same_mype_node.start = nvshmemi_state->mype_node; nvshmemi_team_same_mype_node.stride = nvshmemi_state->npes_node; nvshmemi_team_same_mype_node.size = nvshmemi_state->npes / nvshmemi_state->npes_node; assert(nvshmemi_state->npes % nvshmemi_state->npes_node == 0); nvshmemi_team_same_mype_node.ll_flag = 1; nvshmemi_team_same_mype_node.alltoall_count = 0; nvshmemi_team_same_mype_node.bcast_count = 0; nvshmemi_team_same_mype_node.bcast_sync_offset = 0; nvshmemi_team_same_mype_node.fcollect_count = 0; nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_same_mype_node); nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_same_mype_node); nvshmemi_team_same_mype_node.is_team_node = false; nvshmemi_team_same_mype_node.is_team_same_mype_node = true; INFO(NVSHMEM_INIT, "NVSHMEMX_TEAM_SAME_MYPE_NODE: start=%d, stride=%d, size=%d", nvshmemi_team_same_mype_node.start, nvshmemi_team_same_mype_node.stride, nvshmemi_team_same_mype_node.size); /* Initialize team NVSHMEMI_TEAM_SAME_GPU */ nvshmemi_team_same_gpu.team_idx = NVSHMEM_TEAM_SAME_GPU_INDEX; NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_same_gpu, NVSHMEM_TEAM_SAME_GPU_INDEX); nvshmemi_team_same_gpu.rdxn_count = 0; nvshmemi_team_same_gpu.ll_flag = 1; nvshmemi_team_same_gpu.alltoall_count = 0; nvshmemi_team_same_gpu.bcast_count = 0; nvshmemi_team_same_gpu.bcast_sync_offset = 0; nvshmemi_team_same_gpu.fcollect_count = 0; nvshmemi_team_same_gpu.config_mask = 0; pe_info = nvshmemi_state->pe_info; start = -1; stride = -1; size = 0; for (int pe = 0; pe < nvshmemi_state->npes; pe++) { if (pe_info[pe].hostHash != pe_info[nvshmemi_state->mype].hostHash || memcmp(&pe_info[pe].gpu_uuid, &pe_info[nvshmemi_state->mype].gpu_uuid, sizeof(cudaUUID_t)) != 0) continue; int ret = check_for_linear_stride(pe, &start, &stride, &size); if (ret < 0) { NVSHMEMI_ERROR_EXIT("Could not form NVSHMEMI_TEAM_SAME_GPU\n"); break; } } assert(start >= 0 && size > 0); nvshmemi_team_same_gpu.my_pe = (nvshmemi_state->mype - start) / stride; nvshmemi_team_same_gpu.start = start; nvshmemi_team_same_gpu.stride = (stride == -1) ? 
1 : stride; nvshmemi_team_same_gpu.size = size; nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_same_gpu); nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_same_gpu); nvshmemi_team_same_gpu.is_team_node = true; nvshmemi_team_same_gpu.is_team_same_mype_node = false; INFO(NVSHMEM_INIT, "NVSHMEMI_TEAM_SAME_GPU: start=%d, stride=%d, size=%d", nvshmemi_team_same_gpu.start, nvshmemi_team_same_gpu.stride, nvshmemi_team_same_gpu.size); /* All GPUs must have same number of processes (requires for us to form teams) */ /* Initialize team NVSHMEMI_TEAM_GPU_LEADERS */ scratch = (int *)malloc(sizeof(int) * nvshmemi_state->npes); NVSHMEMI_NULL_ERROR_JMP(scratch, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup, "Unable to allocate host memory for team creation.\n"); if (nvshmemi_team_same_gpu.start == nvshmemi_state->mype) { /* Only GPU leaders are part of this team */ nvshmemi_team_gpu_leaders.team_idx = NVSHMEM_TEAM_GPU_LEADERS_INDEX; NVSHMEMI_TEAM_DUP_INITIALIZER(nvshmemi_team_gpu_leaders, NVSHMEM_TEAM_GPU_LEADERS_INDEX); nvshmemi_team_gpu_leaders.config_mask = 0; nvshmemi_team_gpu_leaders.start = 0; nvshmemi_team_gpu_leaders.stride = (nvshmemi_team_same_gpu.stride == 1) ? nvshmemi_team_same_gpu.size : 1; nvshmemi_team_gpu_leaders.size = nvshmemi_state->npes / nvshmemi_team_same_gpu.size; nvshmemi_team_gpu_leaders.my_pe = (nvshmemi_state->mype - nvshmemi_team_gpu_leaders.start) / nvshmemi_team_gpu_leaders.stride; nvshmemi_team_gpu_leaders.rdxn_count = 0; nvshmemi_team_gpu_leaders.ll_flag = 1; nvshmemi_team_gpu_leaders.alltoall_count = 0; nvshmemi_team_gpu_leaders.bcast_count = 0; nvshmemi_team_gpu_leaders.bcast_sync_offset = 0; nvshmemi_team_gpu_leaders.fcollect_count = 0; nvshmemi_team_set_p2p_connectivity(&nvshmemi_team_gpu_leaders); nvshmemi_recexchalgo_get_neighbors(&nvshmemi_team_gpu_leaders); status = nvshmemi_boot_handle.allgather((void *)&nvshmemi_team_gpu_leaders.my_pe, (void *)scratch, sizeof(int), &nvshmemi_boot_handle); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "allgather of gpu leaders failed\n"); /* Check whether a valid TEAM_GPU_LEADERS was formed */ int last_mype = -1; for (int i = 0; i < nvshmemi_state->npes; i++) { if (scratch[i] != -1) { if (scratch[i] != last_mype + 1) { WARN( "NVSHMEMI_TEAM_GPU_LEADERS could not be formed, Limited MPG support will " "not be available\n"); break; } else { last_mype++; } } } /* XXX: Note that we are not setting team_node and team_same_mype_node for * nvshmemi_team_gpu_leaders */ nvshmemi_team_gpu_leaders.is_team_node = false; nvshmemi_team_gpu_leaders.is_team_same_mype_node = false; INFO(NVSHMEM_INIT, "NVSHMEMI_TEAM_GPU_LEADERS: start=%d, stride=%d, size=%d", nvshmemi_team_gpu_leaders.start, nvshmemi_team_gpu_leaders.stride, nvshmemi_team_gpu_leaders.size); } else { int my_pe = -1; status = nvshmemi_boot_handle.allgather((void *)&my_pe, (void *)scratch, sizeof(int), &nvshmemi_boot_handle); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "allgather of gpu leaders failed\n"); } if (nvshmemi_max_teams < NVSHMEM_TEAMS_MIN) nvshmemi_max_teams = NVSHMEM_TEAMS_MIN; if (nvshmemi_max_teams > N_PSYNC_BYTES * CHAR_BIT) { NVSHMEMI_ERROR_EXIT("Requested %ld teams, but only %d are supported\n", nvshmemi_max_teams, N_PSYNC_BYTES * CHAR_BIT); goto cleanup; } nvshmemi_team_pool = (nvshmemi_team_t **)calloc(nvshmemi_max_teams, sizeof(nvshmemi_team_t *)); NVSHMEMI_NULL_ERROR_JMP(nvshmemi_team_pool, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup, "nvshmemi_team_pool allocation failed \n"); CUDA_RUNTIME_CHECK(cudaMalloc((void 
**)&nvshmemi_device_team_pool, nvshmemi_max_teams * sizeof(nvshmemi_team_t *))); nvshmemi_device_state.team_pool = nvshmemi_device_team_pool; for (long i = 0; i < nvshmemi_max_teams; i++) { nvshmemi_team_pool[i] = NULL; } nvshmemi_call_init_array_kernel(nvshmemi_device_team_pool, nvshmemi_max_teams, NULL); nvshmemi_team_pool[NVSHMEM_TEAM_WORLD_INDEX] = &nvshmemi_team_world; nvshmemi_team_pool[NVSHMEM_TEAM_SHARED_INDEX] = &nvshmemi_team_shared; nvshmemi_team_pool[NVSHMEM_TEAM_NODE_INDEX] = &nvshmemi_team_node; nvshmemi_team_pool[NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX] = &nvshmemi_team_same_mype_node; nvshmemi_team_pool[NVSHMEM_TEAM_SAME_GPU_INDEX] = &nvshmemi_team_same_gpu; if (nvshmemi_team_same_gpu.start == nvshmemi_state->mype) nvshmemi_team_pool[NVSHMEM_TEAM_GPU_LEADERS_INDEX] = &nvshmemi_team_gpu_leaders; /* Allocate pSync pool, each with the maximum possible size requirement */ /* Create two pSyncs per team for back-to-back collectives and one for barriers. * Array organization: * * [ (world) (shared) (team 1) (team 2) ... (world) (shared) (team 1) (team 2) ... ] * <----------- groups 1 & 2-------------->|<------------- group 3 ----------------> * <--- (bcast, collect, reduce, etc.) --->|<------ (barriers and syncs) ----------> * */ psync_len = nvshmemi_max_teams * get_psync_len_per_team(); nvshmemi_psync_pool = (long *)nvshmemi_malloc(sizeof(long) * psync_len); NVSHMEMI_NULL_ERROR_JMP(nvshmemi_psync_pool, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup, "nvshmemi_psync_pool allocation failed \n"); nvshmemi_device_state.psync_pool = nvshmemi_psync_pool; nvshmemi_call_init_array_kernel(nvshmemi_psync_pool, psync_len, NVSHMEMI_SYNC_VALUE); nvshmemi_sync_counter = (long *)nvshmemi_malloc(2 * nvshmemi_max_teams * sizeof(long)); NVSHMEMI_NULL_ERROR_JMP(nvshmemi_sync_counter, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup, "nvshmemi_sync_counter allocation failed \n"); nvshmemi_device_state.sync_counter = nvshmemi_sync_counter; nvshmemi_update_device_state(); nvshmemi_call_init_array_kernel(nvshmemi_sync_counter, 2 * nvshmemi_max_teams, 1); /* Convenience pointer to the group-3 pSync array (for barriers and syncs): */ psync_pool_avail = (unsigned char *)malloc(2 * N_PSYNC_BYTES); NVSHMEMI_NULL_ERROR_JMP(psync_pool_avail, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup, "psync_pool_avail allocation failed \n"); psync_pool_avail_reduced = &psync_pool_avail[N_PSYNC_BYTES]; device_psync_pool_avail = (unsigned char *)nvshmemi_malloc(2 * N_PSYNC_BYTES); NVSHMEMI_NULL_ERROR_JMP(device_psync_pool_avail, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup, "device_psync_pool_avail allocation failed \n"); device_psync_pool_avail_reduced = &device_psync_pool_avail[N_PSYNC_BYTES]; /* Initialize the psync bits to 1, making all slots available: */ memset(psync_pool_avail, 0, 2 * N_PSYNC_BYTES); for (size_t i = 0; i < (size_t)nvshmemi_max_teams; i++) { nvshmemi_bit_set(psync_pool_avail, N_PSYNC_BYTES, i); } /* Set the bits for NVSHMEM_TEAM_WORLD, NVSHMEM_TEAM_SHARED, NVSHMEMX_TEAM_NODE to 0: */ nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_WORLD_INDEX); nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_SHARED_INDEX); nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_NODE_INDEX); nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_SAME_MYPE_NODE_INDEX); nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_SAME_GPU_INDEX); nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, NVSHMEM_TEAM_GPU_LEADERS_INDEX); /* Initialize an integer used to agree on an 
equal return value across PEs in team creation: */ team_ret_val = (int *)malloc(sizeof(int) * 2); NVSHMEMI_NULL_ERROR_JMP(team_ret_val, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup, "team_ret_val allocation failed \n"); team_ret_val_reduced = &team_ret_val[1]; device_team_ret_val = (int *)nvshmemi_malloc(sizeof(int) * 2); NVSHMEMI_NULL_ERROR_JMP(team_ret_val, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, cleanup, "device_team_ret_val allocation failed \n"); device_team_ret_val_reduced = &device_team_ret_val[1]; nvshmemi_boot_handle.barrier( &nvshmemi_boot_handle); /* To ensure neccessary setup has been done all PEs */ nvshmemi_team_alloc_device(); nvshmemi_team_update_device(); nvshmemi_boot_handle.barrier( &nvshmemi_boot_handle); /* To ensure neccessary setup has been done all PEs */ #ifdef NVSHMEM_USE_NCCL if (nvshmemi_use_nccl) { /* Setup NCCL usage */ nvshmemi_team_init_nccl_comm(&nvshmemi_team_world); nvshmemi_team_init_nccl_comm(&nvshmemi_team_shared); nvshmemi_team_init_nccl_comm(&nvshmemi_team_node); nvshmemi_team_init_nccl_comm(&nvshmemi_team_same_mype_node); nvshmemi_team_init_nccl_comm(&nvshmemi_team_same_gpu); if (nvshmemi_pe_in_active_set(nvshmemi_state->mype, nvshmemi_team_gpu_leaders.start, nvshmemi_team_gpu_leaders.stride, nvshmemi_team_gpu_leaders.size) >= 0) { nvshmemi_team_init_nccl_comm(&nvshmemi_team_gpu_leaders); } } #endif /* NVSHMEM_USE_NCCL */ /* Setup NVLS resources for all internal p2p connected teams */ NVSHMEMU_FOR_EACH_IF( i, nvshmemi_max_teams, nvshmemi_team_pool[i] != NULL && nvshmemi_team_pool[i]->are_gpus_p2p_connected, { status = nvshmemi_team_nvls_setup(nvshmemi_team_pool[i]); NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, cleanup, "NVLS resource setup failed for team ID: %d\n", nvshmemi_team_pool[i]->team_idx); if (nvshmemi_team_pool[i]->nvls_rsc) { INFO(NVSHMEM_TEAM, "Successful NVLS resource setup for team ID: %d\n", nvshmemi_team_pool[i]->team_idx); } }) nvshmemi_boot_handle.barrier( &nvshmemi_boot_handle); /* To ensure neccessary setup has been done all PEs */ nvshmemi_team_update_device(); nvshmemi_boot_handle.barrier( &nvshmemi_boot_handle); /* To ensure neccessary setup has been done all PEs */ #if defined(NVSHMEM_PPC64LE) if (nvshmemi_use_nccl) { /* Set GPU thread stack size to be max stack size of any kernel invoked by NCCL. The value 1256 has been obtained by profiling all NCCL kernels in NCCL 2.8.3-1. 
This value is being set to prevent any memory config during application run as that can lead to potential deadlock */ if (nvshmemi_options.CUDA_LIMIT_STACK_SIZE_provided) { CUDA_RUNTIME_CHECK( cudaDeviceSetLimit(cudaLimitStackSize, nvshmemi_options.CUDA_LIMIT_STACK_SIZE)); if (nvshmemi_options.CUDA_LIMIT_STACK_SIZE < 1256) NVSHMEMI_WARN_PRINT( "CUDA stack size limit has been set to less than 1256.\n" "This can lead to hangs because a NCCL kernel can need up\n" "to 1256 bytes"); } else CUDA_RUNTIME_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, 1256)); } else if (nvshmemi_options.CUDA_LIMIT_STACK_SIZE_provided) { CUDA_RUNTIME_CHECK( cudaDeviceSetLimit(cudaLimitStackSize, nvshmemi_options.CUDA_LIMIT_STACK_SIZE)); } #endif cleanup: if (scratch) { free(scratch); } if (hostHash) { free(hostHash); } if (status != NVSHMEMX_SUCCESS) { if (nvshmemi_team_pool) { free(nvshmemi_team_pool); nvshmemi_team_pool = NULL; cudaFree(nvshmemi_device_team_pool); nvshmemi_device_team_pool = NULL; } if (nvshmemi_psync_pool) { nvshmemi_free(nvshmemi_psync_pool); nvshmemi_psync_pool = NULL; } if (psync_pool_avail) { free(psync_pool_avail); psync_pool_avail = NULL; } if (device_psync_pool_avail) { nvshmemi_free(device_psync_pool_avail); device_psync_pool_avail = NULL; } if (team_ret_val) { free(team_ret_val); team_ret_val = NULL; } if (device_team_ret_val) { nvshmemi_free(device_team_ret_val); device_team_ret_val = NULL; } } return status; } int nvshmemi_team_finalize(void) { /* Destroy all undestroyed teams */ for (long i = 0; i < nvshmemi_max_teams; i++) { if (nvshmemi_team_pool[i] != NULL) nvshmemi_team_destroy(nvshmemi_team_pool[i]); } free(nvshmemi_team_pool); nvshmemi_team_pool = NULL; CUDA_RUNTIME_CHECK(cudaFree(nvshmemi_device_team_pool)); nvshmemi_free(nvshmemi_psync_pool); nvshmemi_free(nvshmemi_sync_counter); free(psync_pool_avail); nvshmemi_free(device_psync_pool_avail); free(team_ret_val); nvshmemi_free(device_team_ret_val); cudaFree(nvshmemi_device_team_world); cudaFree(nvshmemi_device_team_shared); cudaFree(nvshmemi_device_team_node); cudaFree(nvshmemi_device_team_same_mype_node); cudaFree(nvshmemi_device_team_same_gpu); cudaFree(nvshmemi_device_team_gpu_leaders); return 0; } int nvshmemi_team_split_strided(nvshmemi_team_t *parent_team, int PE_start, int PE_stride, int PE_size, const nvshmem_team_config_t *config, long config_mask, nvshmem_team_t *new_team) { *new_team = NVSHMEM_TEAM_INVALID; nvshmem_barrier(parent_team->team_idx); int global_PE_start = nvshmemi_team_pe(parent_team, PE_start); int global_PE_stride = parent_team->stride * PE_stride; int global_PE_end = global_PE_start + global_PE_stride * (PE_size - 1); if (PE_start < 0 || PE_start >= parent_team->size || PE_size <= 0 || PE_size > parent_team->size || PE_stride < 1) { NVSHMEMI_WARN_PRINT( "Invalid : child <%d, %d, %d>, parent <%d, %d, %d>\n", PE_start, PE_stride, PE_size, parent_team->start, parent_team->stride, parent_team->size); return -1; } if (global_PE_start >= nvshmemi_state->npes || global_PE_end >= nvshmemi_state->npes) { NVSHMEMI_WARN_PRINT("Starting PE (%d) or ending PE (%d) is invalid\n", global_PE_start, global_PE_end); return -1; } /* idx in new team: */ int my_pe = nvshmemi_pe_in_active_set(nvshmemi_state->mype, global_PE_start, global_PE_stride, PE_size); long *psync_reduce = nvshmemi_team_get_psync(parent_team, REDUCE); long *psync = &nvshmemi_team_get_psync(parent_team, SYNC)[NVSHMEMI_SYNC_SIZE]; long *sync_counter = &nvshmemi_team_get_sync_counter(parent_team)[1]; nvshmemi_team_t *myteam = NULL; *team_ret_val = 0; 
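    /* team_ret_val_reduced aliases team_ret_val[1]; it receives the status reduced across the
     * parent team (see the reduction further below) so that every PE returns the same value
     * from this routine. */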
*team_ret_val_reduced = 0; if (my_pe >= 0) { char bit_str[NVSHMEMI_DIAG_STRLEN]; myteam = (nvshmemi_team_t *)calloc(1, sizeof(nvshmemi_team_t)); (*myteam) = NVSHMEMI_TEAM_INITIALIZER; myteam->my_pe = my_pe; myteam->start = global_PE_start; myteam->stride = global_PE_stride; myteam->size = PE_size; myteam->rdxn_count = 0; myteam->ll_flag = 1; myteam->alltoall_count = 0; myteam->bcast_count = 0; myteam->bcast_sync_offset = 0; myteam->fcollect_count = 0; if (config) { myteam->config = *config; myteam->config_mask = config_mask; } myteam->team_idx = -1; nvshmemi_bit_to_string(bit_str, NVSHMEMI_DIAG_STRLEN, psync_pool_avail, N_PSYNC_BYTES); CUDA_RUNTIME_CHECK(cudaMemcpy(device_psync_pool_avail, psync_pool_avail, N_PSYNC_BYTES, cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); nvshmemi_call_reduce_kernel( myteam->start, myteam->stride, myteam->size, (unsigned char *)device_psync_pool_avail_reduced, (const unsigned char *)device_psync_pool_avail, N_PSYNC_BYTES, (unsigned char *)psync_reduce, (long *)(psync), sync_counter); CUDA_RUNTIME_CHECK(cudaMemcpy(psync_pool_avail_reduced, device_psync_pool_avail_reduced, N_PSYNC_BYTES, cudaMemcpyDeviceToHost)); /* We cannot release the psync here, because this reduction may not * have been performed on the entire parent team. */ nvshmemi_bit_to_string(bit_str, NVSHMEMI_DIAG_STRLEN, psync_pool_avail_reduced, N_PSYNC_BYTES); /* Select the least signficant nonzero bit, which corresponds to an available pSync. */ myteam->team_idx = nvshmemi_bit_1st_nonzero(psync_pool_avail_reduced, N_PSYNC_BYTES); NVSHMEMI_TEAM_DUP_INITIALIZER(*myteam, myteam->team_idx); nvshmemi_bit_to_string(bit_str, NVSHMEMI_DIAG_STRLEN, psync_pool_avail_reduced, N_PSYNC_BYTES); if (myteam->team_idx == -1 || myteam->team_idx >= (int)nvshmemi_max_teams) { NVSHMEMI_WARN_PRINT( "No more teams available (max = %ld), try setting NVSHMEM_MAX_TEAMS environment " "variable\n", nvshmemi_max_teams); /* No psync was available, but must call barrier across parent team before returning. 
*/ myteam->team_idx = -1; *team_ret_val = 1; } else { /* Set the selected psync bit to 0, reserving that slot */ nvshmemi_bit_clear(psync_pool_avail, N_PSYNC_BYTES, myteam->team_idx); *new_team = myteam->team_idx; nvshmemi_team_pool[myteam->team_idx] = myteam; nvshmemi_team_t *device_team_addr; CUDA_RUNTIME_CHECK(cudaMalloc((void **)&device_team_addr, sizeof(nvshmemi_team_t))); nvshmemi_team_set_p2p_connectivity(myteam); nvshmemi_recexchalgo_get_neighbors(myteam); CUDA_RUNTIME_CHECK(cudaMemcpy(device_team_addr, myteam, sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaMemcpy(&nvshmemi_device_team_pool[myteam->team_idx], &device_team_addr, sizeof(nvshmemi_team_t *), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); #ifdef NVSHMEM_USE_NCCL if (nvshmemi_use_nccl) nvshmemi_team_init_nccl_comm(myteam); #endif /* * Reuse NVLS resources if teams are identical, * else creating a new NVLS resources for p2p connected teams */ if (nvshmemi_team_is_identical(myteam, parent_team) && nvshmemi_team_is_nvls_capable(myteam) && !nvshmemi_options.DISABLE_NVLS_SHARING) { myteam->nvls_rsc = parent_team->nvls_rsc; /* Inherit the parent team's nvls_rsc */ if (myteam->nvls_rsc) { nvshmemi_nvls_rsc *nvls = reinterpret_cast(myteam->nvls_rsc); nvls->add_refcount(); INFO(NVSHMEM_TEAM, "Successful NVLS resource sharing for new team ID: %d (parent ID: %d)\n", myteam->team_idx, parent_team->team_idx); } } else if (nvshmemi_team_is_nvls_capable(myteam)) { if (nvshmemi_team_nvls_setup(myteam) != 0) { NVSHMEMI_WARN_PRINT("NVLS resource setup failed for team ID: %d\n", myteam->team_idx); return -1; } INFO(NVSHMEM_TEAM, "Successful NVLS resource setup for team ID: %d\n", myteam->team_idx); } else { myteam->nvls_rsc = nullptr; /* NVLS not supported, so no resource created/bound */ } /* Build team_node */ myteam->is_team_node = false; int i; for (i = 1; i < myteam->size; i++) { if (nvshmemi_host_hashes[myteam->start] != nvshmemi_host_hashes[myteam->start + i * myteam->stride]) { break; } } if (i == myteam->size) myteam->is_team_node = true; myteam->is_team_same_mype_node = true; for (int i = 0; i < myteam->size; i++) { for (int j = i + 1; j < myteam->size; j++) { if (nvshmemi_host_hashes[myteam->start + i * myteam->stride] == nvshmemi_host_hashes[myteam->start + j * myteam->stride]) { myteam->is_team_same_mype_node = false; } } } /* count PEs on the same node */ int team_npes_node = 0; for (int i = 0; i < myteam->size; i++) { if (nvshmemi_team_translate_pe(myteam, i, &nvshmemi_team_node) != -1) { team_npes_node++; } } if (!myteam->is_team_node && !myteam->is_team_same_mype_node) { /* Now I am just going to repurpose device_psync_pool_avail symm memory for the purpose of finding max of team_npes_node */ assert(sizeof(int) <= N_PSYNC_BYTES); CUDA_RUNTIME_CHECK(cudaMemcpy(device_psync_pool_avail, &team_npes_node, sizeof(int), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); nvshmemi_call_reduce_kernel( myteam->start, myteam->stride, myteam->size, (int *)device_psync_pool_avail_reduced, (const int *)device_psync_pool_avail, 1, (int *)psync_reduce, (long *)(psync), sync_counter); CUDA_RUNTIME_CHECK(cudaMemcpy(&team_npes_node, device_psync_pool_avail_reduced, sizeof(int), cudaMemcpyDeviceToHost)); nvshmemi_team_split_2d(myteam, team_npes_node, NULL, 0, &myteam->team_node, NULL, 0, &myteam->team_same_mype_node); } CUDA_RUNTIME_CHECK(cudaMemcpy(device_team_addr, myteam, sizeof(nvshmemi_team_t), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); } 
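        /* Reset the parent team's spare SYNC psync buffer and its counter (both were used by the
         * split-time reductions above) so that subsequent team splits start from a clean state. */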
nvshmemi_call_init_array_kernel(sync_counter, 1, 1); nvshmemi_call_init_array_kernel(psync, NVSHMEMI_SYNC_SIZE, NVSHMEMI_SYNC_VALUE); CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); } /* This barrier on the parent team eliminates problematic race conditions * during psync allocation between back-to-back team creations. */ nvshmem_quiet(); // nvshmem_barrier(parent_team->start, parent_team->stride, parent_team->size, psync); nvshmem_team_sync(parent_team->team_idx); /* This OR reduction assures all PEs return the same value. */ CUDA_RUNTIME_CHECK( cudaMemcpy(device_team_ret_val, team_ret_val, sizeof(int), cudaMemcpyHostToDevice)); CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); nvshmemi_call_rdxn_on_stream_kernel( parent_team->team_idx, device_team_ret_val_reduced, device_team_ret_val, 1, nvshmemi_state->my_stream); CUDA_RUNTIME_CHECK(cudaStreamSynchronize(nvshmemi_state->my_stream)); CUDA_RUNTIME_CHECK(cudaMemcpy(team_ret_val_reduced, device_team_ret_val_reduced, sizeof(int), cudaMemcpyDeviceToHost)); /* If no team was available, print some team triplet info and return nonzero. */ if (myteam != NULL && myteam->team_idx == -1) { NVSHMEMI_WARN_PRINT("Team split strided failed: child <%d, %d, %d>, parent <%d, %d, %d>\n", global_PE_start, global_PE_stride, PE_size, parent_team->start, parent_team->stride, parent_team->size); /* TODO: In the event one of the PEs fails to create the team, do we need to revert the team * on all of the other ones? */ free(myteam); } return *team_ret_val_reduced; } int nvshmemi_team_split_2d(nvshmemi_team_t *parent_team, int xrange, const nvshmem_team_config_t *xaxis_config, long xaxis_mask, nvshmem_team_t *xaxis_team, const nvshmem_team_config_t *yaxis_config, long yaxis_mask, nvshmem_team_t *yaxis_team) { *xaxis_team = NVSHMEM_TEAM_INVALID; *yaxis_team = NVSHMEM_TEAM_INVALID; if (xrange > parent_team->size) { xrange = parent_team->size; } const int parent_size = parent_team->size; const int num_xteams = ceil(parent_size / (float)xrange); const int num_yteams = xrange; int start = 0; int ret = 0; for (int i = 0; i < num_xteams; i++) { nvshmem_team_t my_xteam; int xsize = (i == num_xteams - 1 && parent_size % xrange) ? parent_size % xrange : xrange; ret = nvshmemi_team_split_strided(parent_team, start, 1, xsize, xaxis_config, xaxis_mask, &my_xteam); if (ret) { NVSHMEMI_ERROR_PRINT("Creation of x-axis team %d of %d failed\n", i + 1, num_xteams); } start += xrange; if (my_xteam != NVSHMEM_TEAM_INVALID) { assert(*xaxis_team == NVSHMEM_TEAM_INVALID); *xaxis_team = my_xteam; } } start = 0; for (int i = 0; i < num_yteams; i++) { nvshmem_team_t my_yteam; int remainder = parent_size % xrange; int yrange = parent_size / xrange; int ysize = (remainder && i < remainder) ? 
yrange + 1 : yrange; ret = nvshmemi_team_split_strided(parent_team, start, xrange, ysize, yaxis_config, yaxis_mask, &my_yteam); if (ret) { NVSHMEMI_ERROR_PRINT("Creation of y-axis team %d of %d failed\n", i + 1, num_yteams); } start += 1; if (my_yteam != NVSHMEM_TEAM_INVALID) { assert(*yaxis_team == NVSHMEM_TEAM_INVALID); *yaxis_team = my_yteam; } } nvshmem_quiet(); nvshmem_team_sync(parent_team->team_idx); return 0; } static bool inline nvshmemi_is_rsvd_teams(nvshmem_team_t team_idx) { /* This team resource shouldn't not be deleted as they are used for collectives APIs during * init/finalize */ return ((team_idx == NVSHMEM_TEAM_INVALID) || (team_idx >= NVSHMEM_TEAM_WORLD_INDEX && team_idx < NVSHMEM_TEAMS_MIN)); } void nvshmemi_team_destroy(nvshmemi_team_t *team) { int idx = team->team_idx; if (nvshmemi_bit_fetch(psync_pool_avail, idx)) { NVSHMEMI_ERROR_PRINT("Destroying a team without an active pSync\n"); } /* Since it is a collective routine, perform a barrier */ // nvshmem_barrier(idx); if (!team->is_team_node) { if (!nvshmemi_is_rsvd_teams(team->team_node) && nvshmemi_team_pool[team->team_node] != NULL) { INFO(NVSHMEM_COLL, "Destroy sub-team 1 [%p] at index[%d] for parent-team [%p]", nvshmemi_team_pool[team->team_node], team->team_node, team); nvshmemi_team_destroy(nvshmemi_team_pool[team->team_node]); } } if (!team->is_team_same_mype_node) { if (!nvshmemi_is_rsvd_teams(team->team_same_mype_node) && nvshmemi_team_pool[team->team_same_mype_node] != NULL) { INFO(NVSHMEM_COLL, "Destroy sub-team 2 [%p] at index[%d] for parent-team [%p]", nvshmemi_team_pool[team->team_same_mype_node], team->team_same_mype_node, team); nvshmemi_team_destroy(nvshmemi_team_pool[team->team_same_mype_node]); } } nvshmemi_bit_set(psync_pool_avail, N_PSYNC_BYTES, idx); nvshmemi_team_pool[idx] = NULL; CUDA_RUNTIME_CHECK(cudaMemset(&nvshmemi_device_team_pool[idx], 0, sizeof(nvshmemi_team_t *))); nvshmemi_call_init_array_kernel(&nvshmemi_sync_counter[2 * idx], 2, 1); nvshmemi_call_init_array_kernel(&nvshmemi_psync_pool[idx * get_psync_len_per_team()], get_psync_len_per_team(), NVSHMEMI_SYNC_VALUE); CUDA_RUNTIME_CHECK(cudaDeviceSynchronize()); nvshmemi_team_destroy_nvls(team); nvshmemi_recexchalgo_free_mem(team); #ifdef NVSHMEM_USE_NCCL if (nvshmemi_use_nccl) NCCL_CHECK(nccl_ftable.CommDestroy((ncclComm_t)team->nccl_comm)); #endif if (team != &nvshmemi_team_world && team != &nvshmemi_team_shared && team != &nvshmemi_team_node && team != &nvshmemi_team_same_mype_node && team != &nvshmemi_team_same_gpu && team != &nvshmemi_team_gpu_leaders) { free(team); } nvshmemi_team_t *device_team_addr; CUDA_RUNTIME_CHECK(cudaMemcpy((void **)&device_team_addr, &nvshmemi_device_team_pool[idx], sizeof(nvshmemi_team_t *), cudaMemcpyDeviceToHost)); CUDA_RUNTIME_CHECK(cudaFree(device_team_addr)); } long *nvshmemi_team_get_psync(nvshmemi_team_t *team, nvshmemi_team_op_t op) { long *team_psync; size_t psync_fcollect_len; psync_fcollect_len = get_fcollect_psync_len_per_team(); team_psync = &nvshmemi_psync_pool[team->team_idx * get_psync_len_per_team()]; switch (op) { case SYNC: return team_psync; case REDUCE: return &team_psync [2 * NVSHMEMI_SYNC_SIZE + (((nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / 2) / sizeof(long)) * (team->rdxn_count % 2))]; case BCAST: return &team_psync[2 * NVSHMEMI_SYNC_SIZE + nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / sizeof(long)]; case FCOLLECT: return &team_psync[2 * NVSHMEMI_SYNC_SIZE + nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / 
sizeof(long) + NVSHMEMI_BCAST_SYNC_SIZE]; case ALLTOALL: return &team_psync[2 * NVSHMEMI_SYNC_SIZE + nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / sizeof(long) + NVSHMEMI_BCAST_SYNC_SIZE + psync_fcollect_len + (NVSHMEMI_ALLTOALL_SYNC_SIZE * (team->alltoall_count % 2))]; case FCOLLECT_128: return &team_psync[2 * NVSHMEMI_SYNC_SIZE + nvshmemi_device_state.gpu_coll_env_params_var.reduce_scratch_size / sizeof(long) + NVSHMEMI_BCAST_SYNC_SIZE + psync_fcollect_len + 2 * NVSHMEMI_ALLTOALL_SYNC_SIZE]; default: WARN("Incorrect argument to nvshmemi_team_get_psync\n"); return NULL; } } long *nvshmemi_team_get_sync_counter(nvshmemi_team_t *team) { return &nvshmemi_sync_counter[2 * team->team_idx]; }
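
/*
 * Illustrative sketch (standalone example, not compiled into the library): the psync_pool_avail
 * bitmask protocol used by nvshmemi_team_split_strided() above. Each PE keeps one bit per team
 * slot (1 = free). The per-PE masks are reduced element-wise across the new team's members -- a
 * bitwise AND is assumed here, so a slot is selected only if it is free on every PE -- and the
 * lowest set bit of the reduced mask becomes the new team's index. The helpers below mirror
 * nvshmemi_bit_clear() and nvshmemi_bit_1st_nonzero(); the reduction is simulated locally for
 * two hypothetical PEs.
 *
 *   #include <limits.h>  // CHAR_BIT
 *   #include <stdio.h>
 *   #include <string.h>
 *
 *   enum { SLOT_BYTES = 4 };  // stand-in for N_PSYNC_BYTES
 *
 *   static void bit_clear(unsigned char *m, size_t i) {
 *       m[i / CHAR_BIT] &= ~(1u << (i % CHAR_BIT));
 *   }
 *
 *   static long first_set(const unsigned char *m, size_t bytes) {
 *       for (size_t i = 0; i < bytes * CHAR_BIT; i++)
 *           if ((m[i / CHAR_BIT] >> (i % CHAR_BIT)) & 1) return (long)i;
 *       return -1;
 *   }
 *
 *   int main(void) {
 *       unsigned char pe0[SLOT_BYTES], pe1[SLOT_BYTES], reduced[SLOT_BYTES];
 *       memset(pe0, 0xff, sizeof(pe0));  // all slots initially free
 *       memset(pe1, 0xff, sizeof(pe1));
 *       for (size_t i = 0; i < 6; i++) {  // the six predefined teams occupy slots 0..5
 *           bit_clear(pe0, i);
 *           bit_clear(pe1, i);
 *       }
 *       bit_clear(pe0, 6);  // PE 0 already created a team in slot 6
 *       bit_clear(pe1, 7);  // PE 1 already created a team in slot 7
 *       for (int b = 0; b < SLOT_BYTES; b++) reduced[b] = pe0[b] & pe1[b];
 *       printf("first slot free on both PEs: %ld\n", first_set(reduced, SLOT_BYTES));  // 8
 *       return 0;
 *   }
 */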
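
/*
 * Illustrative sketch (standalone example, not compiled into the library): the (start, stride,
 * size) arithmetic behind nvshmemi_pe_in_active_set() and nvshmemi_team_translate_pe(). A team
 * member's index maps to a world PE via start + index * stride; translating into another team
 * succeeds only if that world PE also lies on the destination triplet. The team triplets below
 * are made up for the example.
 *
 *   #include <stdio.h>
 *
 *   // Mirrors nvshmemi_pe_in_active_set(): index of global_pe within <start, stride, size>, or -1.
 *   static int pe_in_active_set(int global_pe, int start, int stride, int size) {
 *       int n = (global_pe - start) / stride;
 *       if (global_pe < start || (global_pe - start) % stride || n >= size) return -1;
 *       return n;
 *   }
 *
 *   int main(void) {
 *       // Team A = <2, 2, 4> covers world PEs {2, 4, 6, 8}; team B = <0, 3, 4> covers {0, 3, 6, 9}.
 *       int world_pe = 2 + 2 * 2;  // index 2 of team A is world PE 6
 *       printf("A[2] -> world PE %d -> B[%d]\n", world_pe,
 *              pe_in_active_set(world_pe, 0, 3, 4));  // prints "A[2] -> world PE 6 -> B[2]"
 *       return 0;
 *   }
 */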