sglang.0.4.8.post1/nvshmem_src/perftest/common/atomic_one_sided_common.h

/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* See COPYRIGHT.txt for license information
*/
#ifndef _ATOMIC_LAT_COMMON_H_
#define _ATOMIC_LAT_COMMON_H_
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <unistd.h>
#include "utils.h"
#define DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME) \
void test_lat_##TYPE_NAME##_##AMO##_cubin(cudaStream_t stream, void **arglist) { \
CUfunction test_cubin; \
init_test_case_kernel(&test_cubin, NVSHMEMI_TEST_STRINGIFY(lat_##TYPE_NAME##_##AMO)); \
CU_CHECK(cuLaunchCooperativeKernel(test_cubin, 1, 1, 1, 1, 1, 1, 0, stream, arglist)); \
}
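/*
 * DEFINE_ATOMIC_LATENCY_CALL_KERNEL generates the host-side wrapper used when
 * the test runs from a prebuilt cubin: it looks the kernel up by name and
 * launches it with a 1x1x1 grid and block. A rough expansion sketch for a
 * representative type/AMO pair:
 *
 *   DEFINE_ATOMIC_LATENCY_CALL_KERNEL(inc, uint64)
 *
 *   // expands (roughly) to:
 *   void test_lat_uint64_inc_cubin(cudaStream_t stream, void **arglist) {
 *       CUfunction test_cubin;
 *       init_test_case_kernel(&test_cubin, "lat_uint64_inc");
 *       CU_CHECK(cuLaunchCooperativeKernel(test_cubin, 1, 1, 1, 1, 1, 1, 0, stream, arglist));
 *   }
 */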
#define DEFINE_LAT_NON_FETCH_TEST_FOR_AMO_NO_ARG(TYPE, TYPE_NAME, AMO) \
DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME) \
__global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter) { \
int i, tid, peer; \
\
assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z); \
peer = !pe; \
tid = threadIdx.x; \
\
if ((pe == 0) && !tid) { \
for (i = 0; i < iter; i++) { \
nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, peer); \
nvshmem_quiet(); \
} \
} \
}
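/*
 * A test translation unit instantiates the macro once per type/AMO pair, e.g.
 * (assumed instantiation):
 *
 *   DEFINE_LAT_NON_FETCH_TEST_FOR_AMO_NO_ARG(uint64_t, uint64, inc)
 *
 * which defines both the device kernel lat_uint64_inc and the cubin launcher
 * test_lat_uint64_inc_cubin. Non-fetching atomics can return before the
 * operation completes remotely, so the kernel issues nvshmem_quiet() after
 * every call to time issue-to-completion latency for each iteration.
 */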
#define DEFINE_LAT_FETCH_TEST_FOR_AMO_NO_ARG(TYPE, TYPE_NAME, AMO) \
DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME) \
__global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter) { \
int i, tid, peer; \
\
assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z); \
peer = !pe; \
tid = threadIdx.x; \
\
if ((pe == 0) && !tid) { \
for (i = 0; i < iter; i++) { \
(void)nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, peer); \
} \
nvshmem_quiet(); \
} \
}
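/*
 * Fetching atomics (e.g. nvshmem_uint64_atomic_fetch_inc) block the calling
 * thread until the fetched value arrives, so each iteration is already timed
 * end-to-end and a single nvshmem_quiet() after the loop suffices. A sketch
 * (assumed instantiation):
 *
 *   DEFINE_LAT_FETCH_TEST_FOR_AMO_NO_ARG(uint64_t, uint64, fetch_inc)
 */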
#define DEFINE_LAT_NON_FETCH_TEST_FOR_AMO_ONE_ARG(TYPE, TYPE_NAME, AMO, COMPARE_EXPR, SET_EXPR) \
DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME) \
__global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter, TYPE value, \
TYPE cmp) { \
int i, tid, peer; \
\
assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z); \
peer = !pe; \
tid = threadIdx.x; \
\
if ((pe == 0) && !tid) { \
for (i = 0; i < iter; i++) { \
nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, SET_EXPR, peer); \
nvshmem_quiet(); \
} \
} \
}
#define DEFINE_LAT_FETCH_TEST_FOR_AMO_ONE_ARG(TYPE, TYPE_NAME, AMO, COMPARE_EXPR, SET_EXPR) \
DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME) \
__global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter, TYPE value, \
TYPE cmp) { \
int i, tid, peer; \
\
assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z); \
peer = !pe; \
tid = threadIdx.x; \
\
if ((pe == 0) && !tid) { \
for (i = 0; i < iter; i++) { \
(void)nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, SET_EXPR, peer); \
} \
nvshmem_quiet(); \
} \
}
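/*
 * The one-arg variants cover AMOs taking a single operand. SET_EXPR names the
 * kernel parameter ("value" or "cmp") forwarded to the call; COMPARE_EXPR is
 * unused in these two bodies but keeps the argument list uniform across the
 * DEFINE_LAT_* macros. Sketches for add and fetch_add (assumed instantiations):
 *
 *   DEFINE_LAT_NON_FETCH_TEST_FOR_AMO_ONE_ARG(uint64_t, uint64, add, cmp, value)
 *   DEFINE_LAT_FETCH_TEST_FOR_AMO_ONE_ARG(uint64_t, uint64, fetch_add, cmp, value)
 */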
#define DEFINE_LAT_NON_FETCH_TEST_FOR_AMO_TWO_ARG(TYPE, TYPE_NAME, AMO, COMPARE_EXPR, SET_EXPR) \
DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME) \
__global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter, TYPE value, \
TYPE cmp) { \
int i, tid, peer; \
\
assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z); \
peer = !pe; \
tid = threadIdx.x; \
\
if ((pe == 0) && !tid) { \
            for (i = 0; i < iter; i++) {                                                          \
                nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, COMPARE_EXPR, SET_EXPR, peer);         \
                nvshmem_quiet();                                                                  \
            }                                                                                     \
} \
}
#define DEFINE_LAT_FETCH_TEST_FOR_AMO_TWO_ARG(TYPE, TYPE_NAME, AMO, COMPARE_EXPR, SET_EXPR) \
DEFINE_ATOMIC_LATENCY_CALL_KERNEL(AMO, TYPE_NAME) \
__global__ void lat_##TYPE_NAME##_##AMO(TYPE *flag_d, int pe, int iter, TYPE value, \
TYPE cmp) { \
int i, tid, peer; \
\
assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z); \
peer = !pe; \
tid = threadIdx.x; \
\
if ((pe == 0) && !tid) { \
for (i = 0; i < iter; i++) { \
(void)nvshmem_##TYPE_NAME##_atomic_##AMO(flag_d, COMPARE_EXPR, SET_EXPR, peer); \
} \
nvshmem_quiet(); \
} \
}
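/*
 * The two-arg variants are for AMOs taking a comparand and a value, i.e.
 * compare_swap. A sketch (assumed instantiation):
 *
 *   DEFINE_LAT_FETCH_TEST_FOR_AMO_TWO_ARG(uint64_t, uint64, compare_swap, cmp, value)
 *
 * generates a kernel in which PE 0 issues
 *   nvshmem_uint64_atomic_compare_swap(flag_d, cmp, value, peer);
 * once per iteration.
 */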
#define MAIN_SETUP(c, v, mype, npes, flag_d, stream, h_size_arr, h_tables, h_lat) \
do { \
init_wrapper(&c, &v); \
\
if (use_cubin) { \
init_cumodule(CUMODULE_NAME); \
} \
\
mype = nvshmem_my_pe(); \
npes = nvshmem_n_pes(); \
\
if (npes != 2) { \
            fprintf(stderr, "This test requires exactly two processes\n");                        \
finalize_wrapper(); \
exit(-1); \
} \
\
alloc_tables(&h_tables, 2, 1); \
h_size_arr = (uint64_t *)h_tables[0]; \
h_lat = (double *)h_tables[1]; \
\
flag_d = nvshmem_malloc(sizeof(uint64_t)); \
CUDA_CHECK(cudaMemset(flag_d, 0, sizeof(uint64_t))); \
\
CUDA_CHECK(cudaStreamCreate(&stream)); \
\
nvshmem_barrier_all(); \
\
CUDA_CHECK(cudaDeviceSynchronize()); \
\
if (mype == 0) { \
printf("Note: This test measures full round-trip latency\n"); \
} \
} while (0)
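/*
 * MAIN_SETUP is a macro, not a function: it assigns through the caller's own
 * variables, so a test's main() must declare them first. A minimal sketch of
 * those declarations (illustrative names):
 *
 *   int mype, npes;
 *   void *flag_d = NULL;
 *   cudaStream_t stream;
 *   uint64_t *h_size_arr;
 *   double *h_lat;
 *   void **h_tables;
 *
 * The first two arguments are forwarded to init_wrapper(), so pass main()'s
 * argc and argv.
 */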
/* Expects an "int status" in the calling scope (declared by the RUN_TEST_* macros). */
#define LAUNCH_KERNEL(TYPE_NAME, AMO, ARGLIST, STREAM)                                            \
    do {                                                                                          \
        if (use_cubin) {                                                                          \
            test_lat_##TYPE_NAME##_##AMO##_cubin(STREAM, ARGLIST);                                \
        } else {                                                                                  \
            status = nvshmemx_collective_launch((const void *)lat_##TYPE_NAME##_##AMO, 1, 1,      \
                                                ARGLIST, 0, STREAM);                              \
            if (status != NVSHMEMX_SUCCESS) {                                                     \
                fprintf(stderr, "nvshmemx_collective_launch failed %d\n", status);                \
                exit(-1);                                                                         \
            }                                                                                     \
        }                                                                                         \
    } while (0)
#define RUN_TEST_WITHOUT_ARG(TYPE, TYPE_NAME, AMO, flag_d, mype, iter, skip, h_lat, h_size_arr, \
flag_init) \
do { \
int size = sizeof(TYPE); \
\
int status = 0; \
h_size_arr[0] = size; \
void *args_1[] = {&flag_d, &mype, &skip}; \
void *args_2[] = {&flag_d, &mype, &iter}; \
\
        float milliseconds;                                                                       \
        cudaEvent_t start, stop;                                                                  \
        CUDA_CHECK(cudaEventCreate(&start));                                                      \
        CUDA_CHECK(cudaEventCreate(&stop));                                                      \
\
TYPE flag_init_var = flag_init; \
CUDA_CHECK(cudaMemcpy(flag_d, &flag_init_var, sizeof(TYPE), cudaMemcpyHostToDevice)); \
CUDA_CHECK(cudaDeviceSynchronize()); \
nvshmem_barrier_all(); \
\
        LAUNCH_KERNEL(TYPE_NAME, AMO, args_1, stream);                                            \
                                                                                                  \
        CUDA_CHECK(cudaStreamSynchronize(stream));                                                \
\
nvshmem_barrier_all(); \
CUDA_CHECK(cudaMemcpy(flag_d, &flag_init_var, sizeof(TYPE), cudaMemcpyHostToDevice)); \
        CUDA_CHECK(cudaEventRecord(start, stream));                                               \
        LAUNCH_KERNEL(TYPE_NAME, AMO, args_2, stream);                                            \
        CUDA_CHECK(cudaEventRecord(stop, stream));                                                \
        CUDA_CHECK(cudaStreamSynchronize(stream));                                                \
        /* convert the elapsed time from ms to us and average over the timed iterations */       \
        CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));                             \
        h_lat[0] = (milliseconds * 1000) / iter;                                                  \
\
nvshmem_barrier_all(); \
\
if (mype == 0) { \
print_table_basic("shmem_at_" #TYPE "_" #AMO "_ping_lat", "None", "size (Bytes)", \
"latency", "us", '-', h_size_arr, h_lat, 1); \
} \
\
        CUDA_CHECK(cudaDeviceSynchronize());                                                      \
                                                                                                  \
        /* release the timing events created for this test */                                    \
        CUDA_CHECK(cudaEventDestroy(start));                                                      \
        CUDA_CHECK(cudaEventDestroy(stop));                                                      \
\
} while (0)
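/*
 * A sketch of a RUN_TEST_WITHOUT_ARG invocation (illustrative values): the
 * kernel runs once with "skip" untimed warm-up iterations, then again with
 * "iter" timed iterations bracketed by CUDA events.
 *
 *   RUN_TEST_WITHOUT_ARG(uint64_t, uint64, inc, flag_d, mype, iter, skip, h_lat,
 *                        h_size_arr, 0);
 */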
#define RUN_TEST_WITH_ARG(TYPE, TYPE_NAME, AMO, flag_d, mype, iter, skip, h_lat, h_size_arr, val, \
cmp, flag_init) \
do { \
int size = sizeof(TYPE); \
TYPE compare, value, flag_init_var; \
\
int status = 0; \
h_size_arr[0] = size; \
void *args_1[] = {&flag_d, &mype, &skip, &value, &compare}; \
void *args_2[] = {&flag_d, &mype, &iter, &value, &compare}; \
\
        float milliseconds;                                                                       \
        cudaEvent_t start, stop;                                                                  \
        CUDA_CHECK(cudaEventCreate(&start));                                                      \
        CUDA_CHECK(cudaEventCreate(&stop));                                                      \
\
compare = cmp; \
value = val; \
flag_init_var = flag_init; \
\
CUDA_CHECK(cudaDeviceSynchronize()); \
nvshmem_barrier_all(); \
CUDA_CHECK(cudaMemcpy(flag_d, &flag_init_var, sizeof(TYPE), cudaMemcpyHostToDevice)); \
\
        LAUNCH_KERNEL(TYPE_NAME, AMO, args_1, stream);                                            \
                                                                                                  \
        CUDA_CHECK(cudaStreamSynchronize(stream));                                                \
\
nvshmem_barrier_all(); \
CUDA_CHECK(cudaMemcpy(flag_d, &flag_init_var, sizeof(TYPE), cudaMemcpyHostToDevice)); \
        CUDA_CHECK(cudaEventRecord(start, stream));                                               \
        LAUNCH_KERNEL(TYPE_NAME, AMO, args_2, stream);                                            \
        CUDA_CHECK(cudaEventRecord(stop, stream));                                                \
        CUDA_CHECK(cudaStreamSynchronize(stream));                                                \
        /* convert the elapsed time from ms to us and average over the timed iterations */       \
        CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));                             \
        h_lat[0] = (milliseconds * 1000) / iter;                                                  \
\
nvshmem_barrier_all(); \
\
if (mype == 0) { \
print_table_basic("shmem_at_" #TYPE "_" #AMO "_ping_lat", "None", "size (Bytes)", \
"latency", "us", '-', h_size_arr, h_lat, 1); \
} \
\
        CUDA_CHECK(cudaDeviceSynchronize());                                                      \
                                                                                                  \
        /* release the timing events created for this test */                                    \
        CUDA_CHECK(cudaEventDestroy(start));                                                      \
        CUDA_CHECK(cudaEventDestroy(stop));                                                      \
\
} while (0)
#define MAIN_CLEANUP(flag_d, stream, h_tables)        \
    do {                                              \
        if (flag_d) nvshmem_free(flag_d);             \
        CUDA_CHECK(cudaStreamDestroy(stream));        \
        free_tables(h_tables, 2);                     \
        finalize_wrapper();                           \
    } while (0)
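/*
 * Putting it together: a minimal driver sketch (illustrative AMO choices and
 * iteration counts; the real perftests take these from the command line):
 *
 *   DEFINE_LAT_NON_FETCH_TEST_FOR_AMO_NO_ARG(uint64_t, uint64, inc)
 *   DEFINE_LAT_FETCH_TEST_FOR_AMO_TWO_ARG(uint64_t, uint64, compare_swap, cmp, value)
 *
 *   int main(int argc, char *argv[]) {
 *       int mype, npes, iter = 500, skip = 50;
 *       void *flag_d = NULL;
 *       cudaStream_t stream;
 *       uint64_t *h_size_arr;
 *       double *h_lat;
 *       void **h_tables;
 *
 *       MAIN_SETUP(argc, argv, mype, npes, flag_d, stream, h_size_arr, h_tables, h_lat);
 *       RUN_TEST_WITHOUT_ARG(uint64_t, uint64, inc, flag_d, mype, iter, skip, h_lat,
 *                            h_size_arr, 0);                      // flag_init = 0
 *       RUN_TEST_WITH_ARG(uint64_t, uint64, compare_swap, flag_d, mype, iter, skip, h_lat,
 *                         h_size_arr, 1, 0, 0);                   // val, cmp, flag_init
 *       MAIN_CLEANUP(flag_d, stream, h_tables);
 *       return 0;
 *   }
 */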
#endif /* _ATOMIC_LAT_COMMON_H_ */