sglang.0.4.8.post1/nvshmem_src/perftest/device/pt-to-pt/shmem_p_latency.cu

/*
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* See COPYRIGHT.txt for license information
*/
#define CUMODULE_NAME "shmem_p_latency.cubin"
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <unistd.h>
#include "utils.h"
#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
extern "C" {
#endif
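/*
 * Device kernel: each thread of the single launched block issues
 * nvshmem_int_p() for its strided share of the `len` ints, pushing the
 * local value of data_d[j] to the peer PE (peer = !pe). Once all threads
 * have issued their puts, thread 0 calls nvshmem_quiet() to wait for the
 * outstanding puts before the next iteration begins.
 */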
__global__ void p_latency(int *data_d, int len, int pe, int iter) {
    int i, j, tid, peer;
    peer = !pe;
    tid = threadIdx.x;
    for (i = 0; i < iter; i++) {
        for (j = tid; j < len; j += blockDim.x) {
            nvshmem_int_p(data_d + j, *(data_d + j), peer);
        }
        __syncthreads();
        if (!tid) {
            nvshmem_quiet();
        }
        __syncthreads();
    }
}
#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
}
#endif
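/*
 * Host-side launch helper: when use_cubin is set, the kernel is launched
 * from the pre-built cubin through the CUDA driver API (cuLaunchKernel);
 * otherwise the regular runtime <<<...>>> launch is used. A single block
 * of threads_per_block threads is launched in both cases.
 */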
void test_p(int *data_d, int len, int pe, int iter, CUfunction kernel) {
    if (use_cubin) {
        void *arglist[] = {(void *)&data_d, (void *)&len, (void *)&pe, (void *)&iter};
        CU_CHECK(cuLaunchKernel(kernel, 1, 1, 1, threads_per_block, 1, 1, 0, NULL, arglist, NULL));
    } else {
        p_latency<<<1, threads_per_block>>>(data_d, len, pe, iter);
    }
}
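/*
 * Host driver: exactly two PEs are required. PE 0 issues the puts and
 * does the timing; PE 1 only joins the barriers between message sizes.
 */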
int main(int argc, char *argv[]) {
    int mype, npes;
    int *data_d = NULL;
    read_args(argc, argv);
    int iter = iters;
    int skip = warmup_iters;
    int array_size, i;
    void **h_tables = NULL; /* NULL so the early-exit path can skip free_tables */
    uint64_t *h_size_arr;
    double *h_lat;
    float milliseconds;
    cudaEvent_t start, stop;
    CUfunction test_cubin = NULL;

    init_wrapper(&argc, &argv);
    if (use_cubin) {
        init_cumodule(CUMODULE_NAME);
        init_test_case_kernel(&test_cubin, "p_latency");
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();
    if (npes != 2) {
        fprintf(stderr, "This test requires exactly two processes\n");
        goto finalize;
    }

    array_size = max_size_log;
    alloc_tables(&h_tables, 2, array_size);
    h_size_arr = (uint64_t *)h_tables[0];
    h_lat = (double *)h_tables[1];

    data_d = (int *)nvshmem_malloc(max_size);
    CUDA_CHECK(cudaMemset(data_d, 0, max_size));

    nvshmem_barrier_all();
    CUDA_CHECK(cudaDeviceSynchronize());
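    /* Sweep the message size from min_size to max_size (scaled by step_factor).
     * For each size: one untimed warmup run of `skip` iterations, then `iter`
     * timed iterations bracketed by CUDA events; the elapsed milliseconds are
     * converted to per-iteration latency in microseconds. */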
    i = 0;
    for (size_t size = min_size; size <= max_size; size *= step_factor) {
        if (!mype) {
            int nelems;
            h_size_arr[i] = size;
            nelems = size / sizeof(int);

            test_p(data_d, nelems, mype, skip, test_cubin);

            cudaEventRecord(start);
            test_p(data_d, nelems, mype, iter, test_cubin);
            cudaEventRecord(stop);
            CUDA_CHECK(cudaGetLastError());
            CUDA_CHECK(cudaEventSynchronize(stop));
            cudaEventElapsedTime(&milliseconds, start, stop);

            /* give latency in us */
            h_lat[i] = (milliseconds * 1000) / iter;
            i++;
        }
        nvshmem_barrier_all();
    }
    if (mype == 0) {
        print_table_basic("shmem_p_latency", "None", "size (Bytes)", "latency", "us", '-',
                          h_size_arr, h_lat, i);
    }

finalize:
    if (data_d) nvshmem_free(data_d);
    if (h_tables) free_tables(h_tables, 2); /* skipped if we bailed out before alloc_tables */
    finalize_wrapper();
    return 0;
}