/*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */

#include "coll_test.h"

#define DATATYPE int64_t

/*
 * Driver for the "broadcast_on_stream" latency table.
 *
 * Flow:
 *   1. Parse the command line with read_args().
 *   2. Allocate the host-side result tables (one size entry and one row of
 *      `iters` latency samples per tested size).
 *   3. Export NVSHMEM_SYMMETRIC_SIZE = 2 * max_size before init_wrapper() runs.
 *   4. Allocate one pinned host buffer and one nvshmem_malloc buffer, each of
 *      2 * max_size bytes, and split each into a source half and a dest half.
 *   5. Dispatch RUN_COLL_ON_STREAM for the element type selected in
 *      `datatype.type`.
 *   6. PE 0 prints the table; every resource acquired above is then released.
 *
 * Returns 0 on success and -1 when setenv or an allocation fails. An
 * unrecognised datatype terminates the process with exit(1).
 */
int main(int argc, char **argv) {
    int status = 0;
    int mype, npes;
    /*
     * NOTE(review): alloc_size, min_elems and max_elems are not referenced
     * anywhere in this file. They are kept because RUN_COLL_ON_STREAM is
     * defined elsewhere and may refer to locals by name -- confirm before
     * removing them.
     */
    size_t alloc_size;
    size_t min_elems, max_elems;
    DATATYPE *buffer = NULL;
    DATATYPE *h_buffer = NULL;
    DATATYPE *d_source, *d_dest;
    DATATYPE *h_source, *h_dest;
    int PE_root = 0;
    char size_string[100];
    /*
     * Declared (and NULL-initialised) up front so that no goto below jumps
     * over an initialised declaration and so that the `out` path can free
     * them unconditionally.
     */
    uint64_t *size_array = NULL;
    double **latency_array = NULL;
    cudaStream_t stream;

    read_args(argc, argv);

    size_array = (uint64_t *)calloc(max_size_log, sizeof(uint64_t));
    /* calloc (not malloc) so rows that were never allocated stay NULL and a
     * partially built table can be handed to free() safely. */
    latency_array = (double **)calloc(max_size_log, sizeof(double *));
    if (!size_array || !latency_array) {
        fprintf(stderr, "host allocation failed \n");
        status = -1;
        goto out;
    }

    for (int i = 0; i < max_size_log; i++) {
        latency_array[i] = (double *)calloc(iters, sizeof(double));
        if (!latency_array[i]) {
            fprintf(stderr, "host allocation failed \n");
            status = -1;
            goto out;
        }
    }

    DEBUG_PRINT("symmetric size requested %lu\n", max_size * 2);
    /* Bounded write: never overruns size_string regardless of the value. */
    snprintf(size_string, sizeof(size_string), "%lu", max_size * 2);

    status = setenv("NVSHMEM_SYMMETRIC_SIZE", size_string, 1);
    if (status) {
        fprintf(stderr, "setenv failed \n");
        status = -1;
        goto out;
    }

    init_wrapper(&argc, &argv);

    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();
    (void)npes;  // Silence unused variable warning

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    /* One pinned host allocation: first max_size bytes are the source,
     * the following max_size bytes are the destination. */
    CUDA_CHECK(cudaHostAlloc(&h_buffer, max_size * 2, cudaHostAllocDefault));
    h_source = (DATATYPE *)h_buffer;
    h_dest = (DATATYPE *)&h_source[max_size / sizeof(DATATYPE)];

    buffer = (DATATYPE *)nvshmem_malloc(max_size * 2);
    if (!buffer) {
        fprintf(stderr, "nvshmem_malloc failed \n");
        status = -1;
        /* The stream, the pinned buffer and the runtime are already live at
         * this point, so unwind them instead of returning directly. */
        goto release_runtime;
    }

    /* Same source/dest split as the host buffer. */
    d_source = (DATATYPE *)buffer;
    d_dest = (DATATYPE *)&d_source[max_size / sizeof(DATATYPE)];

#define CALL_RUN_COLL_ON_STREAM(TYPENAME, TYPE)                                       \
    RUN_COLL_ON_STREAM(broadcast, BCAST, TYPENAME, TYPE, (TYPE *)d_source,            \
                       (TYPE *)h_source, (TYPE *)d_dest, (TYPE *)h_dest, npes,        \
                       PE_root, stream, size_array, latency_array);

    /* Re-interpret the same buffers as the element type chosen at run time. */
    switch (datatype.type) {
        case NVSHMEM_INT:
            CALL_RUN_COLL_ON_STREAM(int, int);
            break;
        case NVSHMEM_LONG:
            CALL_RUN_COLL_ON_STREAM(long, long);
            break;
        case NVSHMEM_LONGLONG:
            CALL_RUN_COLL_ON_STREAM(longlong, long long);
            break;
        case NVSHMEM_ULONGLONG:
            CALL_RUN_COLL_ON_STREAM(ulonglong, unsigned long long);
            break;
        case NVSHMEM_SIZE:
            CALL_RUN_COLL_ON_STREAM(size, size_t);
            break;
        case NVSHMEM_PTRDIFF:
            CALL_RUN_COLL_ON_STREAM(ptrdiff, ptrdiff_t);
            break;
        case NVSHMEM_FLOAT:
            CALL_RUN_COLL_ON_STREAM(float, float);
            break;
        case NVSHMEM_DOUBLE:
            CALL_RUN_COLL_ON_STREAM(double, double);
            break;
        case NVSHMEM_UINT:
            CALL_RUN_COLL_ON_STREAM(uint, unsigned int);
            break;
        case NVSHMEM_INT32:
            CALL_RUN_COLL_ON_STREAM(int32, int32_t);
            break;
        case NVSHMEM_INT64:
            CALL_RUN_COLL_ON_STREAM(int64, int64_t);
            break;
        case NVSHMEM_UINT32:
            CALL_RUN_COLL_ON_STREAM(uint32, uint32_t);
            break;
        case NVSHMEM_UINT64:
            CALL_RUN_COLL_ON_STREAM(uint64, uint64_t);
            break;
        case NVSHMEM_FP16:
            CALL_RUN_COLL_ON_STREAM(half, half);
            break;
        case NVSHMEM_BF16:
            CALL_RUN_COLL_ON_STREAM(bfloat16, __nv_bfloat16);
            break;
        default:
            /* Hard exit preserved from the original behaviour (exit code 1). */
            printf("Incorrect datatype specified\n");
            exit(1);
            break;
    }

    /* Only PE 0 reports the results. */
    if (!mype) {
        print_table_v2("broadcast_on_stream", datatype.name.c_str(), "size (bytes)", "latency",
                       "us", '-', size_array, latency_array, max_size_log, iters);
    }

    nvshmem_free(buffer);
    nvshmem_barrier_all();

release_runtime:
    /* Reached on success and on nvshmem_malloc failure alike. */
    CUDA_CHECK(cudaFreeHost(h_buffer));
    CUDA_CHECK(cudaStreamDestroy(stream));
    finalize_wrapper();

out:
    /* Host-side result tables; both pointers may still be NULL here. */
    if (latency_array) {
        for (int i = 0; i < max_size_log; i++) {
            free(latency_array[i]);
        }
        free(latency_array);
    }
    free(size_array);

    return status;
}