sglang.0.4.8.post1/nvshmem_src/perftest/host/pt-to-pt/stream_latency.cu

/*
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* See COPYRIGHT.txt for license information
*/
#include <stdio.h>
#include <stdlib.h> // strtol, calloc, free
#include <stdint.h> // uint64_t
#include <assert.h>
#include <string.h>
#include <getopt.h>
#include <cmath> // std::log2, floor
#include <cuda.h>
#include <cuda_runtime.h>

#include "utils.h" // perftest helpers: init_wrapper, CUDA_CHECK, print_table_basic
#define DEFAULT_ITERS 10
#define DEFAULT_MIN_MSG_SIZE 1
#define DEFAULT_MAX_MSG_SIZE (128 * 1024 * 1024)

typedef enum { PUSH = 0, PULL = 1 } putget_dir_t;
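
/* Spin kernel: busy-waits for roughly ncycles GPU clock cycles, then stores
 * the observed count so the compiler cannot optimize the loop away. It stands
 * in for application work issued on the same stream as the transfers. */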
__global__ void test_kernel(void *data_d_local, long long int ncycles) {
    long long int sclk = clock64();
    long long int cyc = 0;
    while (cyc < ncycles) {
        cyc = clock64() - sclk;
    }
    *(long long int *)data_d_local = cyc;
}
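
/* Runs two timed loops of `iter` transfers of sizeBytes each: the first uses
 * the stream-ordered nvshmemx_*_on_stream calls (elapsed ms returned in ms1),
 * the second uses blocking host-issued nvshmem_* calls (ms2). PUSH puts local
 * data to the peer; PULL gets the peer's data. */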
int lat(void *data_d, void *data_d_local, int sizeBytes, int pe, int iter, putget_dir_t dir,
        cudaStream_t strm, cudaEvent_t sev, cudaEvent_t eev, float *ms1, float *ms2, int nblocks,
        int nthreads, long long int ncycles) {
    int status = 0;
    int peer = !pe; /* exactly two PEs, so the peer is the other one */

    if (dir == PUSH) {
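        /* Timed section 1: kernel plus stream-ordered put, both queued on strm,
         * so nothing blocks the host inside the loop. */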
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            test_kernel<<<nblocks, nthreads, 0, strm>>>(data_d_local, ncycles);
            nvshmemx_putmem_on_stream((void *)data_d, (void *)data_d_local, sizeBytes, peer, strm);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms1, sev, eev));
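
        /* Timed section 2: the same pattern issued from the host. Each blocking
         * nvshmem_putmem must wait for the preceding kernel, hence the explicit
         * stream synchronize inside the loop. */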
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            test_kernel<<<nblocks, nthreads, 0, strm>>>(data_d_local, ncycles);
            CUDA_CHECK(cudaStreamSynchronize(strm));
            nvshmem_putmem((void *)data_d, (void *)data_d_local, sizeBytes, peer);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms2, sev, eev));
    } else {
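        /* PULL mirrors PUSH: fetch the peer's data, then run the kernel that
         * consumes it. Timed section 1 uses the stream-ordered get. */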
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            nvshmemx_getmem_on_stream((void *)data_d_local, (void *)data_d, sizeBytes, peer, strm);
            test_kernel<<<nblocks, nthreads, 0, strm>>>(data_d_local, ncycles);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms1, sev, eev));
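
        /* Timed section 2: blocking host-side get, then the kernel on strm. */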
        CUDA_CHECK(cudaEventRecord(sev, strm));
        for (int i = 0; i < iter; i++) {
            nvshmem_getmem((void *)data_d_local, (void *)data_d, sizeBytes,
                           peer); // nvshmem_getmem is blocking, so nvshmem_quiet is not needed
            test_kernel<<<nblocks, nthreads, 0, strm>>>(data_d_local, ncycles);
        }
        CUDA_CHECK(cudaEventRecord(eev, strm));
        CUDA_CHECK(cudaEventSynchronize(eev));
        CUDA_CHECK(cudaEventElapsedTime(ms2, sev, eev));
    }

    return status;
}
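
/* Driver: PE 0 sweeps power-of-two message sizes from min to max, timing both
 * variants at each size, and prints one latency table per variant. PE 1 acts
 * as the passive target. */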
int main(int argc, char *argv[]) {
    int status = 0;
    int mype, npes;
    char *data_d = NULL, *data_d_local = NULL;
    uint64_t *size_array = NULL;
    double *offs_latency_array = NULL;
    double *ons_latency_array = NULL;
    int num_entries;
    int i;
    putget_dir_t dir = PUSH;
    int iter = DEFAULT_ITERS;
    int min_msg_size = DEFAULT_MIN_MSG_SIZE;
    int max_msg_size = DEFAULT_MAX_MSG_SIZE;
    int nb = 1, nt = 32; /* launch shape for test_kernel: blocks, threads per block */
    long long int ncycles = 1;
    cudaStream_t strm = NULL; /* declared before any goto so the cleanup path can test it */

    init_wrapper(&argc, &argv);

    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();

    if (npes != 2) {
        fprintf(stderr, "This test requires exactly two processes\n");
        status = -1;
        goto finalize;
    }
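
    /* Command-line overrides for the defaults above. */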
    while (1) {
        int c;
        c = getopt(argc, argv, "s:S:n:d:b:t:c:h");
        if (c == -1) break;

        switch (c) {
            case 's':
                min_msg_size = strtol(optarg, NULL, 0);
                break;
            case 'S':
                max_msg_size = strtol(optarg, NULL, 0);
                break;
            case 'n':
                iter = strtol(optarg, NULL, 0);
                break;
            case 'd':
                dir = (putget_dir_t)strtol(optarg, NULL, 0);
                break;
            case 'b':
                nb = strtol(optarg, NULL, 0);
                break;
            case 't':
                nt = strtol(optarg, NULL, 0);
                break;
            case 'c':
                ncycles = strtol(optarg, NULL, 0);
                break;
            default:
            case 'h':
                printf(
                    "-n [Iterations] -S [Max message size] -s [Min message size] "
                    "-d [Direction of copy : PUSH(0) or PULL(1)] -b [# blocks] "
                    "-t [# threads] -c [# cycles to wait in the kernel]\n");
                goto finalize;
        }
    }
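
    /* One table entry per power-of-two message size in [min_msg_size, max_msg_size]. */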
    num_entries = floor(std::log2((float)max_msg_size)) - floor(std::log2((float)min_msg_size)) + 1;

    size_array = (uint64_t *)calloc(num_entries, sizeof(uint64_t));
    if (!size_array) {
        status = -1;
        goto finalize;
    }

    offs_latency_array = (double *)calloc(num_entries, sizeof(double));
    if (!offs_latency_array) {
        status = -1;
        goto finalize;
    }

    ons_latency_array = (double *)calloc(num_entries, sizeof(double));
    if (!ons_latency_array) {
        status = -1;
        goto finalize;
    }
    data_d = (char *)nvshmem_malloc(max_msg_size);
    CUDA_CHECK(cudaMemset(data_d, 0, max_msg_size));
    data_d_local = (char *)nvshmem_malloc(max_msg_size);
    CUDA_CHECK(cudaMemset(data_d_local, 0, max_msg_size));

    CUDA_CHECK(cudaStreamCreateWithFlags(&strm, cudaStreamNonBlocking));
    CUDA_CHECK(cudaDeviceSynchronize());
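
    /* PE 0 runs the sweep and reports; PE 1 only participates as the passive
     * target and joins the closing barrier. */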
    if (mype == 0) {
        float ms1, ms2;
        cudaEvent_t sev, eev;
        CUDA_CHECK(cudaEventCreate(&sev));
        CUDA_CHECK(cudaEventCreate(&eev));

        i = 0;
        for (int size = min_msg_size; size <= max_msg_size; size *= 2) {
            size_array[i] = size;
            lat(data_d, data_d_local, size, mype, iter, dir, strm, sev, eev, &ms1, &ms2, nb, nt,
                ncycles);
            /* convert total elapsed ms into average us per iteration */
            ons_latency_array[i] = ms1 / iter * 1000;
            offs_latency_array[i] = ms2 / iter * 1000;
            i++;
        }

        print_table_basic("Stream_Latency", "with _on_stream", "size (Bytes)", "latency", "us", '-',
                          size_array, ons_latency_array, i);
        print_table_basic("Stream_Latency", "without _on_stream", "size (Bytes)", "latency", "us",
                          '-', size_array, offs_latency_array, i);

        CUDA_CHECK(cudaEventDestroy(sev));
        CUDA_CHECK(cudaEventDestroy(eev));
        nvshmem_barrier_all();
    } else {
        nvshmem_barrier_all();
    }
finalize:
    if (strm) CUDA_CHECK(cudaStreamDestroy(strm));
    if (data_d) nvshmem_free(data_d);
    if (data_d_local) nvshmem_free(data_d_local);
    if (size_array) free(size_array);
    if (ons_latency_array) free(ons_latency_array);
    if (offs_latency_array) free(offs_latency_array);

    finalize_wrapper();
    return status;
}