/*
 * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */
|
|
|
|
#include <stdio.h>
|
|
#include <assert.h>
|
|
#include <string.h>
|
|
#include <getopt.h>
|
|
#include <cuda.h>
|
|
#include <cuda_runtime.h>
|
|
#include "utils.h"
|
|
|
|
/* Default number of timed iterations per message size (overridden by -n). */
#define DEFAULT_ITERS 10
/* Default smallest message size in bytes (overridden by -s). */
#define DEFAULT_MIN_MSG_SIZE 1
/*
 * Default largest message size in bytes, 128 MiB (overridden by -S).
 * The product is parenthesized so the macro expands as a single value inside
 * larger expressions such as `x % DEFAULT_MAX_MSG_SIZE`.
 */
#define DEFAULT_MAX_MSG_SIZE (128 * 1024 * 1024)
|
|
|
|
/* Transfer direction selected with -d and consumed by lat(). */
typedef enum {
    PUSH = 0, /* lat() issues put operations towards the peer */
    PULL = 1  /* lat() issues get operations from the peer */
} putget_dir_t;
|
|
|
|
/*
 * Busy-wait kernel used as filler work in front of / behind the timed transfers.
 *
 * Each thread samples clock64() on entry and spins until at least `ncycles`
 * clock ticks have elapsed, then stores the elapsed tick count as a
 * long long at the start of `data_d_local`. No thread or block index is
 * consulted, so every launched thread performs the same spin and the same
 * store. With `ncycles <= 0` the loop body never runs and 0 is stored.
 *
 * data_d_local : destination for the elapsed count; must be writable from the
 *                device and hold at least sizeof(long long int) bytes.
 * ncycles      : minimum number of clock64() ticks to spin for.
 */
__global__ void test_kernel(void *data_d_local, long long int ncycles) {
    const long long int start = clock64();
    long long int elapsed;

    for (elapsed = 0; elapsed < ncycles; elapsed = clock64() - start) {
        /* spin */
    }

    long long int *out = static_cast<long long int *>(data_d_local);
    *out = elapsed;
}
|
|
|
|
/*
 * Times `iter` transfers of `sizeBytes` bytes between this PE and its peer,
 * twice: first with the *_on_stream NVSHMEM call enqueued on `strm`, then with
 * the host-issued NVSHMEM call. Every transfer is paired with one launch of
 * test_kernel on `strm`.
 *
 *   dir == PUSH : kernel, then put of data_d_local -> peer's data_d.
 *                 In the host-issued variant the stream is synchronized after
 *                 each kernel before nvshmem_putmem is called.
 *   otherwise   : get of peer's data_d -> data_d_local, then kernel.
 *
 * Parameters
 *   data_d        remote-side buffer (put target / get source).
 *   data_d_local  local buffer (put source / get target, and the kernel's
 *                 output location).
 *   sizeBytes     number of bytes moved per transfer.
 *   pe            calling PE; the peer is computed as !pe, so only PEs 0 and 1
 *                 are meaningful.
 *   iter          number of kernel+transfer pairs per measurement.
 *   dir           PUSH or PULL.
 *   strm          stream used for kernels, events and on-stream transfers.
 *   sev, eev      start / end events bracketing each measurement (reused for
 *                 both measurements).
 *   ms1           out: elapsed milliseconds of the on-stream measurement.
 *   ms2           out: elapsed milliseconds of the host-issued measurement.
 *   ng, nb        grid and block size passed to the test_kernel launch.
 *   ncycles       spin length forwarded to test_kernel.
 *
 * Returns 0. CUDA API failures are handled by CUDA_CHECK (defined in utils.h).
 *
 * A kernel launch statement does not report launch failures itself, so the
 * pending launch error is queried once per measurement. The query is placed
 * after the end event has been recorded so that it adds no host work in front
 * of the timed commands.
 */
int lat(void *data_d, void *data_d_local, int sizeBytes, int pe, int iter, putget_dir_t dir,
        cudaStream_t strm, cudaEvent_t sev, cudaEvent_t eev, float *ms1, float *ms2, int ng, int nb,
        long long int ncycles) {
    int status = 0;
    int peer = !pe;

    if (dir == PUSH) {
        /* Measurement 1: put enqueued on the stream behind the kernel. */
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            test_kernel<<<ng, nb, 0, strm>>>(data_d_local, ncycles);
            nvshmemx_putmem_on_stream(data_d, data_d_local, sizeBytes, peer, strm);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaGetLastError()); /* surface any failed test_kernel launch */
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms1, sev, eev));

        /* Measurement 2: host waits for the kernel, then issues the put itself. */
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            test_kernel<<<ng, nb, 0, strm>>>(data_d_local, ncycles);
            CUDA_CHECK(cudaStreamSynchronize(strm));
            nvshmem_putmem(data_d, data_d_local, sizeBytes, peer);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaGetLastError()); /* surface any failed test_kernel launch */
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms2, sev, eev));
    } else {
        /* Measurement 1: get enqueued on the stream ahead of the kernel. */
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            nvshmemx_getmem_on_stream(data_d_local, data_d, sizeBytes, peer, strm);
            test_kernel<<<ng, nb, 0, strm>>>(data_d_local, ncycles);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaGetLastError()); /* surface any failed test_kernel launch */
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms1, sev, eev));

        /* Measurement 2: host-issued get followed by the kernel. */
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            /* shmem_getmem is blocking, so nvshmem_quiet is not needed */
            nvshmem_getmem(data_d_local, data_d, sizeBytes, peer);
            test_kernel<<<ng, nb, 0, strm>>>(data_d_local, ncycles);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaGetLastError()); /* surface any failed test_kernel launch */
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms2, sev, eev));
    }

    return status;
}
|
|
|
|
/*
 * Entry point of the stream latency test.
 *
 * Flow:
 *   1. Initialize via init_wrapper() and require exactly two PEs.
 *   2. Parse command-line options (see the usage string below).
 *   3. Allocate host result arrays and two NVSHMEM buffers, zero both buffers.
 *   4. On PE 0, sweep message sizes from min_msg_size to max_msg_size, doubling
 *      each step, call lat() for every size and print two tables (on-stream
 *      and host-issued latency in microseconds per iteration).
 *   5. Barrier, release all resources, finalize.
 *
 * Returns 0 on success (including when only the usage text is printed) and
 * -1 on a wrong PE count, invalid arguments or an allocation failure.
 */
int main(int argc, char *argv[]) {
    int status = 0;
    int mype, npes;
    char *data_d = NULL, *data_d_local = NULL;
    uint64_t *size_array = NULL;
    double *offs_latency_array = NULL;
    double *ons_latency_array = NULL;
    int num_entries;
    int i;
    size_t local_bytes;
    /* Declared and nulled up front so every `goto finalize` sees a defined
     * handle and the cleanup code can tell whether the stream was created. */
    cudaStream_t strm = NULL;

    putget_dir_t dir = PUSH;
    int iter = DEFAULT_ITERS;
    int min_msg_size = DEFAULT_MIN_MSG_SIZE;
    int max_msg_size = DEFAULT_MAX_MSG_SIZE;

    int nb = 1, nt = 32;
    long long int ncycles = 1;

    init_wrapper(&argc, &argv);

    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();

    if (npes != 2) {
        fprintf(stderr, "This test requires exactly two processes \n");
        status = -1;
        goto finalize;
    }

    while (1) {
        int c;
        c = getopt(argc, argv, "s:S:n:i:d:b:t:c:h");
        if (c == -1) break;

        switch (c) {
            case 's':
                min_msg_size = strtol(optarg, NULL, 0);
                break;
            case 'S':
                max_msg_size = strtol(optarg, NULL, 0);
                break;
            case 'n':
                iter = strtol(optarg, NULL, 0);
                break;
            case 'd':
                dir = (putget_dir_t)strtol(optarg, NULL, 0);
                break;
            case 'b':
                nb = strtol(optarg, NULL, 0);
                break;
            case 't':
                nt = strtol(optarg, NULL, 0);
                break;
            case 'c':
                ncycles = strtol(optarg, NULL, 0);
                break;
            /* NOTE(review): "-i" is accepted by the option string and listed in
             * the usage text but has no case of its own, so it lands here and
             * prints the usage. Confirm whether it should be handled. */
            default:
            case 'h':
                printf(
                    "-n [Iterations] -S [Max message size] -s [Min message size] -i [Put/Get issue type : ON_STREAM(0) otherwise 1] -d [Direction of copy : PUSH(0) or PULL(1)] -b [# blocks] "
                    "-t [# threads] -c [# cycles to wait in the the kernel]\n");
                goto finalize;
        }
    }

    /* Reject values that would make the doubling sweep never terminate
     * (size 0 or negative), produce an empty sweep, divide by zero when
     * averaging, or yield an invalid launch configuration. */
    if (min_msg_size < 1 || max_msg_size < min_msg_size || iter < 1 || nb < 1 || nt < 1) {
        fprintf(stderr,
                "Invalid arguments: require 1 <= min size <= max size, iterations >= 1, "
                "blocks >= 1 and threads >= 1\n");
        status = -1;
        goto finalize;
    }

    /* Count the sweep steps exactly as the measurement loop walks them. The
     * counter is 64-bit so doubling past max_msg_size cannot overflow. */
    num_entries = 0;
    for (long long int sz = min_msg_size; sz <= max_msg_size; sz *= 2) {
        num_entries++;
    }

    size_array = (uint64_t *)calloc(num_entries, sizeof(uint64_t));
    if (!size_array) {
        status = -1;
        goto finalize;
    }

    offs_latency_array = (double *)calloc(num_entries, sizeof(double));
    if (!offs_latency_array) {
        status = -1;
        goto finalize;
    }

    ons_latency_array = (double *)calloc(num_entries, sizeof(double));
    if (!ons_latency_array) {
        status = -1;
        goto finalize;
    }

    data_d = (char *)nvshmem_malloc(max_msg_size);
    if (!data_d) {
        fprintf(stderr, "nvshmem_malloc failed for data_d \n");
        status = -1;
        goto finalize;
    }
    CUDA_CHECK(cudaMemset(data_d, 0, max_msg_size));

    /* test_kernel stores one long long int at the start of data_d_local, so
     * the buffer must never be smaller than that even for tiny -S values. */
    local_bytes = (size_t)max_msg_size;
    if (local_bytes < sizeof(long long int)) local_bytes = sizeof(long long int);

    data_d_local = (char *)nvshmem_malloc(local_bytes);
    if (!data_d_local) {
        fprintf(stderr, "nvshmem_malloc failed for data_d_local \n");
        status = -1;
        goto finalize;
    }
    /* Clear the local buffer (the original cleared data_d a second time). */
    CUDA_CHECK(cudaMemset(data_d_local, 0, local_bytes));

    CUDA_CHECK(cudaStreamCreateWithFlags(&strm, cudaStreamNonBlocking));

    CUDA_CHECK(cudaDeviceSynchronize());

    if (mype == 0) {
        float ms1, ms2;
        cudaEvent_t sev, eev;
        CUDA_CHECK(cudaEventCreate(&sev));
        CUDA_CHECK(cudaEventCreate(&eev));

        i = 0;
        for (long long int size = min_msg_size; size <= max_msg_size && i < num_entries;
             size *= 2) {
            size_array[i] = size;
            /* lat() takes (grid, block); nb is the block count, nt the thread count. */
            if (lat(data_d, data_d_local, (int)size, mype, iter, dir, strm, sev, eev, &ms1, &ms2,
                    nb, nt, ncycles) != 0) {
                status = -1;
                break;
            }
            /* milliseconds for `iter` operations -> microseconds per operation */
            ons_latency_array[i] = ms1 / iter * 1000;
            offs_latency_array[i] = ms2 / iter * 1000;
            i++;
        }

        print_table_basic("Stream_Latency", "with _on_stream", "size (Bytes)", "latency", "us", '-',
                          size_array, ons_latency_array, i);
        print_table_basic("Stream_Latency", "without _on_stream", "size (Bytes)", "latency", "us",
                          '-', size_array, offs_latency_array, i);

        CUDA_CHECK(cudaEventDestroy(sev));
        CUDA_CHECK(cudaEventDestroy(eev));
    }

    /* Both PEs meet here: PE 1 does no measurement and only waits for PE 0. */
    nvshmem_barrier_all();

finalize:
    /* Only destroy the stream if execution got far enough to create it. */
    if (strm) {
        CUDA_CHECK(cudaStreamDestroy(strm));
    }

    if (data_d) nvshmem_free(data_d);
    if (size_array) free(size_array);
    if (ons_latency_array) free(ons_latency_array);
    if (offs_latency_array) free(offs_latency_array);

    if (data_d_local) nvshmem_free(data_d_local);

    finalize_wrapper();

    return status;
}
|