sglang_v0.5.2/nvshmem_src/perftest/common/utils.cu

980 lines
35 KiB
Plaintext

/*
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* See COPYRIGHT.txt for license information
*/
#include "utils.h"
#include <dlfcn.h>
#include <stdlib.h>
#include <sstream>
#include <vector>
#include <tuple>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
double *d_latency = NULL;
double *d_avg_time = NULL;
double *latency = NULL;
double *avg_time = NULL;
int mype = 0;
int npes = 0;
int use_mpi = 0;
int use_shmem = 0;
int use_uid = 0;
bool use_cubin = false;
__device__ int clockrate;
CUmodule mymodule = NULL;
void init_cumodule(const char *str) {
int init_error = 0;
char exe_path[1000];
size_t count = readlink("/proc/self/exe", exe_path, 1000);
exe_path[count] = '\0';
char *exe_dir = dirname(exe_path);
char cubin_path[1000];
strcpy(cubin_path, exe_dir);
strcat(cubin_path, "/");
strcat(cubin_path, str);
printf("CUBIN Selected: %s\n", cubin_path);
CU_CHECK(cuModuleLoad(&mymodule, cubin_path));
init_error = nvshmemx_cumodule_init(mymodule);
if (init_error) {
ERROR_PRINT("cumodule_init failed \n");
assert(false);
}
}
void init_test_case_kernel(CUfunction *kernel, const char *kernel_name) {
CU_CHECK(cuModuleGetFunction(kernel, mymodule, kernel_name));
}
#ifdef NVSHMEMTEST_SHMEM_SUPPORT
#include "unistd.h"
void *nvshmemi_shmem_handle = NULL;
struct nvshmemi_shmem_fn_table shmem_fn_table = {0};
static uint64_t getHostHash() {
char hostname[1024];
uint64_t result = 5381;
int status = 0;
status = gethostname(hostname, 1024);
if (status) ERROR_EXIT("gethostname failed \n");
for (int c = 0; c < 1024 && hostname[c] != '\0'; c++) {
result = ((result << 5) + result) + hostname[c];
}
return result;
}
/* This is a special function that is a WAR for a bug in OSHMEM
implementation. OSHMEM erroneosly sets the context on device 0 during
shmem_init. Hence before nvshmem_init() is called, device must be
set correctly */
void select_device_shmem() {
uint64_t host;
uint64_t *hosts;
long *pSync;
cudaDeviceProp prop;
int dev_count;
int mype_node;
int mype, n_pes;
mype = shmem_fn_table.fn_shmem_my_pe();
n_pes = shmem_fn_table.fn_shmem_n_pes();
mype_node = 0;
host = getHostHash();
hosts = (uint64_t *)shmem_fn_table.fn_shmem_malloc(sizeof(uint64_t) * (n_pes + 1));
hosts[0] = host;
pSync = (long *)shmem_fn_table.fn_shmem_malloc(SHMEM_COLLECT_SYNC_SIZE * sizeof(long));
shmem_fn_table.fn_shmem_fcollect64(hosts + 1, hosts, 1, 0, 0, n_pes, pSync);
for (int i = 0; i < n_pes; i++) {
if (i == mype) break;
if (hosts[i + 1] == host) mype_node++;
}
CUDA_CHECK(cudaGetDeviceCount(&dev_count));
CUDA_CHECK(cudaSetDevice(mype_node % dev_count));
CUDA_CHECK(cudaGetDeviceProperties(&prop, mype_node % dev_count));
fprintf(stderr, "mype: %d mype_node: %d device name: %s bus id: %d \n", mype, mype_node,
prop.name, prop.pciBusID);
CUDA_CHECK(cudaMemcpyToSymbol(clockrate, (void *)&prop.clockRate, sizeof(int), 0,
cudaMemcpyHostToDevice));
}
#endif
#ifdef NVSHMEMTEST_MPI_SUPPORT
void *nvshmemi_mpi_handle = NULL;
struct nvshmemi_mpi_fn_table mpi_fn_table = {0};
MPI_Comm MPI_COMM_WORLD_PLACEHOLDER;
MPI_Datatype MPI_UINT8_T_PLACEHOLDER;
MPI_Datatype *mpi_uint8_ptr;
int nvshmemi_load_mpi() {
nvshmemi_mpi_handle = dlopen("libmpi.so.40", RTLD_NOW | RTLD_GLOBAL | RTLD_DEEPBIND);
if (nvshmemi_mpi_handle == NULL) {
// Print the error number and description from errno.
fprintf(stderr, "dlopen failed: errno = %d, description = %s\n", errno, strerror(errno));
// Additionally, print the error message from dlerror for more specific information.
const char *dlerror_msg = dlerror();
if (dlerror_msg) {
fprintf(stderr, "dlerror: %s\n", dlerror_msg);
}
fprintf(stderr,
"Unable to dlopen libmpi.so.40."
"Please add it to your LD_LIBRARY_PATH or run without"
" NVSHMEMTEST_USE_MPI_LAUNCHER.\n");
return -1;
}
MPI_LOAD_SYM(MPI_Init);
MPI_LOAD_SYM(MPI_Bcast);
MPI_LOAD_SYM(MPI_Comm_rank);
MPI_LOAD_SYM(MPI_Comm_size);
MPI_LOAD_SYM(MPI_Finalize);
return 0;
}
int nvshmemi_dlclose_mpi() {
int status;
status = dlclose(nvshmemi_mpi_handle);
if (status) {
fprintf(stderr, "unable to dlclose MPI.\n");
return -1;
}
return 0;
}
#endif
#ifdef NVSHMEMTEST_SHMEM_SUPPORT
int nvshmemi_load_shmem() {
nvshmemi_shmem_handle = dlopen("liboshmem.so.40", RTLD_NOW | RTLD_GLOBAL | RTLD_DEEPBIND);
if (nvshmemi_shmem_handle == NULL) {
// Print the error number and description from errno.
fprintf(stderr, "dlopen failed: errno = %d, description = %s\n", errno, strerror(errno));
// Additionally, print the error message from dlerror for more specific information.
const char *dlerror_msg = dlerror();
if (dlerror_msg) {
fprintf(stderr, "dlerror: %s\n", dlerror_msg);
}
fprintf(stderr,
"Unable to dlopen liboshmem.so.40."
"Please add it to your LD_LIBRARY_PATH or run without"
" NVSHMEMTEST_USE_SHMEM_LAUNCHER.\n");
return -1;
}
SHMEM_LOAD_SYM(shmem_init);
SHMEM_LOAD_SYM(shmem_my_pe);
SHMEM_LOAD_SYM(shmem_n_pes);
SHMEM_LOAD_SYM(shmem_fcollect64);
SHMEM_LOAD_SYM(shmem_malloc);
SHMEM_LOAD_SYM(shmem_free);
SHMEM_LOAD_SYM(shmem_finalize);
return 0;
}
int nvshmemi_dlclose_shmem() {
int status;
status = dlclose(nvshmemi_shmem_handle);
if (status) {
fprintf(stderr, "unable to dlclose shmem.\n");
return -1;
}
return 0;
}
#endif
void select_device() {
cudaDeviceProp prop;
int dev_count;
int mype_node;
int mype;
mype = nvshmem_my_pe();
mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
CUDA_CHECK(cudaGetDeviceCount(&dev_count));
CUDA_CHECK(cudaSetDevice(mype_node % dev_count));
CUDA_CHECK(cudaGetDeviceProperties(&prop, mype_node % dev_count));
fprintf(stderr, "mype: %d mype_node: %d device name: %s bus id: %d \n", mype, mype_node,
prop.name, prop.pciBusID);
CUDA_CHECK(cudaMemcpyToSymbol(clockrate, (void *)&prop.clockRate, sizeof(int), 0,
cudaMemcpyHostToDevice));
}
static void check_for_cumodule_tests() {
char *test_mode = getenv("NVSHMEM_TEST_CUBIN_LIBRARY");
if (test_mode) {
use_cubin = atoi(test_mode);
}
if (use_cubin) {
printf("Bitcode Library Testing Method Chosen.\n");
}
}
void init_wrapper(int *c, char ***v) {
check_for_cumodule_tests();
#ifdef NVSHMEMTEST_MPI_SUPPORT
{
char *value = getenv("NVSHMEMTEST_USE_MPI_LAUNCHER");
if (value) use_mpi = atoi(value);
char *uid_value = getenv("NVSHMEMTEST_USE_UID_BOOTSTRAP");
if (uid_value) use_uid = atoi(uid_value);
}
#endif
#ifdef NVSHMEMTEST_SHMEM_SUPPORT
{
char *value = getenv("NVSHMEMTEST_USE_SHMEM_LAUNCHER");
if (value) use_shmem = atoi(value);
}
#endif
#ifdef NVSHMEMTEST_MPI_SUPPORT
int status;
int rank, nranks;
if (use_mpi || use_uid) {
status = nvshmemi_load_mpi();
if (status) exit(-1);
mpi_fn_table.fn_MPI_Init(c, v);
MPI_COMM_WORLD_PLACEHOLDER = (MPI_Comm)dlsym(nvshmemi_mpi_handle, "ompi_mpi_comm_world");
MPI_UINT8_T_PLACEHOLDER = (MPI_Datatype)dlsym(nvshmemi_mpi_handle, "ompi_mpi_uint8_t");
mpi_fn_table.fn_MPI_Comm_rank(MPI_COMM_WORLD_PLACEHOLDER, &rank);
mpi_fn_table.fn_MPI_Comm_size(MPI_COMM_WORLD_PLACEHOLDER, &nranks);
DEBUG_PRINT("MPI: [%d of %d] hello MPI world! \n", rank, nranks);
}
if (use_mpi) {
MPI_Comm mpi_comm = MPI_COMM_WORLD_PLACEHOLDER;
nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER;
attr.mpi_comm = &mpi_comm;
nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
select_device();
nvshmem_barrier_all();
return;
} else if (use_uid) {
nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER;
nvshmemx_uniqueid_t id = NVSHMEMX_UNIQUEID_INITIALIZER;
if (rank == 0) {
nvshmemx_get_uniqueid(&id);
}
mpi_fn_table.fn_MPI_Bcast(&id, sizeof(nvshmemx_uniqueid_t), MPI_UINT8_T_PLACEHOLDER, 0,
MPI_COMM_WORLD_PLACEHOLDER);
nvshmemx_set_attr_uniqueid_args(rank, nranks, &id, &attr);
nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr);
select_device();
nvshmem_barrier_all();
return;
}
#endif
#ifdef NVSHMEMTEST_SHMEM_SUPPORT
if (use_shmem) {
status = nvshmemi_load_shmem();
if (status) exit(-1);
shmem_fn_table.fn_shmem_init();
mype = shmem_fn_table.fn_shmem_my_pe();
npes = shmem_fn_table.fn_shmem_n_pes();
DEBUG_PRINT("SHMEM: [%d of %d] hello SHMEM world! \n", mype, npes);
latency = (double *)shmem_fn_table.fn_shmem_malloc(sizeof(double));
if (!latency) ERROR_EXIT("(shmem_malloc) failed \n");
avg_time = (double *)shmem_fn_table.fn_shmem_malloc(sizeof(double));
if (!avg_time) ERROR_EXIT("(shmem_malloc) failed \n");
select_device_shmem();
nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER;
nvshmemx_init_attr(NVSHMEMX_INIT_WITH_SHMEM, &attr);
nvshmem_barrier_all();
return;
}
#endif
nvshmem_init();
mype = nvshmem_my_pe();
npes = nvshmem_n_pes();
select_device();
nvshmem_barrier_all();
d_latency = (double *)nvshmem_malloc(sizeof(double));
if (!d_latency) ERROR_EXIT("nvshmem_malloc failed \n");
d_avg_time = (double *)nvshmem_malloc(sizeof(double));
if (!d_avg_time) ERROR_EXIT("nvshmem_malloc failed \n");
DEBUG_PRINT("end of init \n");
return;
}
void finalize_wrapper() {
#ifdef NVSHMEMTEST_SHMEM_SUPPORT
if (use_shmem) {
shmem_fn_table.fn_shmem_free(latency);
shmem_fn_table.fn_shmem_free(avg_time);
}
#endif
#if !defined(NVSHMEMTEST_SHMEM_SUPPORT) && !defined(NVSHMEMTEST_MPI_SUPPORT)
if (!use_mpi && !use_shmem) {
nvshmem_free(d_latency);
nvshmem_free(d_avg_time);
}
#endif
nvshmem_finalize();
#ifdef NVSHMEMTEST_MPI_SUPPORT
if (use_mpi || use_uid) {
mpi_fn_table.fn_MPI_Finalize();
nvshmemi_dlclose_mpi();
}
#endif
#ifdef NVSHMEMTEST_SHMEM_SUPPORT
if (use_shmem) {
shmem_fn_table.fn_shmem_finalize();
// Calling dlclose will cause oshrun to segfault at termination.
// nvshmemi_dlclose_shmem();
}
#endif
}
void alloc_tables(void ***table_mem, int num_tables, int num_entries_per_table) {
void **tables;
int i, dev_property;
int dev_count;
CUDA_CHECK(cudaGetDeviceCount(&dev_count));
int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
CUDA_CHECK(
cudaDeviceGetAttribute(&dev_property, cudaDevAttrUnifiedAddressing, mype_node % dev_count));
assert(dev_property == 1);
assert(num_tables >= 1);
assert(num_entries_per_table >= 1);
CUDA_CHECK(cudaHostAlloc(table_mem, num_tables * sizeof(void *), cudaHostAllocMapped));
tables = *table_mem;
/* Just allocate an array of 8 byte values. The user can decide if they want to use double or
* uint64_t */
for (i = 0; i < num_tables; i++) {
CUDA_CHECK(
cudaHostAlloc(&tables[i], num_entries_per_table * sizeof(double), cudaHostAllocMapped));
memset(tables[i], 0, num_entries_per_table * sizeof(double));
}
}
void free_tables(void **tables, int num_tables) {
int i;
for (i = 0; i < num_tables; i++) {
CUDA_CHECK(cudaFreeHost(tables[i]));
}
CUDA_CHECK(cudaFreeHost(tables));
}
uint64_t get_coll_info(double *algBw, double *busBw, const char *job_name, uint64_t size,
double usec, int npes) {
double baseBw, factor;
// size == count * typesize
// convert to seconds
double sec = usec / 1.0E6;
uint64_t total_bytes = 0;
if (strcmp(job_name, "reduction") == 0 || strcmp(job_name, "reduction_on_stream") == 0 ||
strcmp(job_name, "device_reduction") == 0) {
baseBw = (double)(size) / 1.0E9 / sec;
factor = ((double)2 * (npes - 1)) / ((double)(npes));
total_bytes = size;
} else if (strcmp(job_name, "broadcast") == 0 || strcmp(job_name, "broadcast_on_stream") == 0 ||
strcmp(job_name, "bcast_device") == 0) {
baseBw = (double)(size) / 1.0E9 / sec;
factor = 1;
total_bytes = size;
} else if (strcmp(job_name, "alltoall") == 0 || strcmp(job_name, "alltoall_on_stream") == 0 ||
strcmp(job_name, "alltoall_device") == 0 || strcmp(job_name, "fcollect") == 0 ||
strcmp(job_name, "fcollect_on_stream") == 0 ||
strcmp(job_name, "fcollect_device") == 0 || strcmp(job_name, "reducescatter") == 0 ||
strcmp(job_name, "reducescatter_on_stream") == 0 ||
strcmp(job_name, "device_reducescatter") == 0) {
baseBw = (double)(size) / 1.0E9 / sec;
factor = ((double)(npes - 1)) / ((double)(npes));
total_bytes = size;
} else {
printf("Job Name %s bandwidth factor not set. Using 1 values for bw.\n", job_name);
*algBw = 1;
*busBw = 1;
return size;
}
*algBw = baseBw;
*busBw = baseBw * factor;
return total_bytes;
}
tuple<double, double, double> get_latency_metrics(double *values, int num_values) {
double min, max, sum;
int i = 0;
min = max = values[0];
sum = 0.0;
while (i < num_values) {
auto v = values[i];
if (v < min) {
min = v;
}
if (v > max) {
max = v;
}
sum += v;
i++;
}
double avg = (double)sum / num_values;
return make_tuple(avg, min, max);
}
void print_table_basic(const char *job_name, const char *subjob_name, const char *var_name,
const char *output_var, const char *units, const char plus_minus,
uint64_t *size, double *value, int num_entries) {
bool machine_readable = false;
char buffer[256] = {0};
char *env_value = getenv("NVSHMEM_MACHINE_READABLE_OUTPUT");
if (env_value) machine_readable = atoi(env_value);
int i;
if (machine_readable) {
printf("%s\n", job_name);
for (i = 0; i < num_entries; i++) {
if (size[i] != 0 && value[i] != 0.00) {
printf("&&&& PERF %s___%s___size__%lu___%s %lf %c%s\n", job_name, subjob_name,
size[i], output_var, value[i], plus_minus, units);
}
}
} else {
printf("#%10s\n", job_name);
snprintf(buffer, 256, "%s (%s)", output_var, units);
printf("%-10s %-8s %-16s\n", "size(B)", "scope", buffer);
for (i = 0; i < num_entries; i++) {
if (size[i] != 0 && value[i] != 0.00) {
printf("%-10lu %-8s %-16.6lf", size[i], subjob_name, value[i]);
printf("\n");
}
}
}
}
void print_table_v1(const char *job_name, const char *subjob_name, const char *var_name,
const char *output_var, const char *units, const char plus_minus,
uint64_t *size, double *value, int num_entries) {
bool machine_readable = false;
char *env_value = getenv("NVSHMEM_MACHINE_READABLE_OUTPUT");
if (env_value) machine_readable = atoi(env_value);
int i;
int npes = nvshmem_n_pes();
double avg, algbw, busbw;
char **tokens = (char **)malloc(3 * sizeof(char *));
const char *delim = "-";
char copy[strlen(subjob_name) + 1];
strcpy(copy, subjob_name);
char *token = strtok(copy, delim);
i = 0;
while (token != NULL) {
tokens[i] = strdup(token);
token = strtok(NULL, delim);
i++;
}
/* Used for automated test output. It outputs the data in a non human-friendly format. */
if (machine_readable) {
printf("%s\n", job_name);
for (i = 0; i < num_entries; i++) {
if (size[i] != 0 && value[i] != 0.00) {
printf("&&&& PERF %s___%s___size__%lu___%s %lf %c%s\n", job_name, subjob_name,
size[i], output_var, value[i], plus_minus, units);
}
}
} else if (strcmp(job_name, "device_reduction") == 0 ||
strcmp(job_name, "device_reducescatter") == 0) {
printf("#%10s\n", job_name);
printf("%-10s %-8s %-8s %-8s %-16s %-12s %-12s\n", "size(B)", "type", "redop",
"scope", "latency(us)", "algbw(GB/s)", "busbw(GB/s)");
for (i = 0; i < num_entries; i++) {
if (size[i] != 0 && value[i] != 0.00) {
avg = value[i];
uint64_t total_bytes = get_coll_info(&algbw, &busbw, job_name, size[i], avg, npes);
printf("%-10lu %-8s %-8s %-8s %-16.6lf %-12.3lf %-12.3lf", total_bytes,
tokens[0], tokens[1], tokens[2], avg, algbw, busbw);
printf("\n");
}
}
} else {
// recombine first two tokens of subjob_name
char type[strlen(subjob_name)];
strcpy(type, subjob_name);
char *last_delim = strrchr(type, '-');
if (last_delim != NULL) *last_delim = '\0';
printf("#%10s\n", job_name);
printf("%-10s %-8s %-8s %-16s %-12s %-12s\n", "size(B)", "type", "scope",
"latency(us)", "algbw(GB/s)", "busbw(GB/s)");
for (i = 0; i < num_entries; i++) {
if (size[i] != 0 && value[i] != 0.00) {
avg = value[i];
uint64_t total_bytes = get_coll_info(&algbw, &busbw, job_name, size[i], avg, npes);
printf("%-10lu %-8s %-8s %-16.6lf %-12.3lf %-12.3lf", total_bytes, type,
tokens[2], avg, algbw, busbw);
printf("\n");
}
}
}
}
void print_table_v2(const char *job_name, const char *subjob_name, const char *var_name,
const char *output_var, const char *units, const char plus_minus,
uint64_t *size, double **values, int num_entries, size_t num_iters) {
bool machine_readable = false;
char *env_value = getenv("NVSHMEM_MACHINE_READABLE_OUTPUT");
if (env_value) machine_readable = atoi(env_value);
int i;
int npes = nvshmem_n_pes();
double avg, min, max, algbw, busbw = 0;
/* Used for automated test output. It outputs the data in a non human-friendly format. */
if (machine_readable) {
printf("%s\n", job_name);
for (i = 0; i < num_entries; i++) {
auto value = values[i];
tie(avg, min, max) = get_latency_metrics(value, num_iters);
if (size[i] != 0 && value[0] != 0.00) {
printf("&&&& PERF %s___%s___size__%lu___%s %lf %c%s\n", job_name, subjob_name,
size[i], output_var, avg, plus_minus, units);
}
}
} else if (strcmp(job_name, "reduction") == 0) {
/* Splits subjob_name into data type and operation name */
char **tokens = (char **)malloc(2 * sizeof(char *));
const char *delim = "-";
char copy[strlen(subjob_name) + 1];
strcpy(copy, subjob_name);
char *token = strtok(copy, delim);
if (token != NULL) {
tokens[0] = strdup(token);
token = strtok(NULL, delim);
if (token != NULL) {
tokens[1] = strdup(token);
} else {
tokens[1] = strdup("None");
}
}
printf("#%10s\n", job_name);
printf("%-10s %-8s %-8s %-16s %-16s %-16s %-12s %-12s\n", "size(B)", "type", "redop",
"latency(us)", "min_lat(us)", "max_lat(us)", "algbw(GB/s)", "busbw(GB/s)");
for (i = 0; i < num_entries; i++) {
auto value = values[i];
if (size[i] != 0 && value[0] != 0.00) {
tie(avg, min, max) = get_latency_metrics(value, num_iters);
uint64_t total_bytes = get_coll_info(&algbw, &busbw, job_name, size[i], avg, npes);
printf("%-10.1lu %-8s %-8s %-16.6lf %-16.3lf %-16.3lf %-12.3lf %-12.3lf",
total_bytes, tokens[0], tokens[1], avg, min, max, algbw, busbw);
printf("\n");
}
}
} else {
printf("#%10s\n", job_name);
printf("%-10s %-8s %-16s %-16s %-16s %-12s %-12s\n", "size(B)", "type", "latency(us)",
"min_lat(us)", "max_lat(us)", "algbw(GB/s)", "busbw(GB/s)");
for (i = 0; i < num_entries; i++) {
auto value = values[i];
if (size[i] != 0 && value[0] != 0.00) {
tie(avg, min, max) = get_latency_metrics(value, num_iters);
uint64_t total_bytes = get_coll_info(&algbw, &busbw, job_name, size[i], avg, npes);
printf("%-10.1lu %-8s %-16.6lf %-16.3lf %-16.3lf %-12.3lf %-12.3lf",
total_bytes, subjob_name, avg, min, max, algbw, busbw);
printf("\n");
}
}
}
}
void datatype_parse(char *optarg, datatype_t *datatype) {
if (!strcmp(optarg, "int")) {
datatype->type = NVSHMEM_INT;
datatype->size = sizeof(int);
datatype->name = "int";
} else if (!strcmp(optarg, "long")) {
datatype->type = NVSHMEM_LONG;
datatype->size = sizeof(long);
datatype->name = "long";
} else if (!strcmp(optarg, "longlong")) {
datatype->type = NVSHMEM_LONGLONG;
datatype->size = sizeof(long long);
datatype->name = "longlong";
} else if (!strcmp(optarg, "ulonglong")) {
datatype->type = NVSHMEM_ULONGLONG;
datatype->size = sizeof(unsigned long long);
datatype->name = "ulonglong";
} else if (!strcmp(optarg, "size")) {
datatype->type = NVSHMEM_SIZE;
datatype->size = sizeof(size_t);
datatype->name = "size";
} else if (!strcmp(optarg, "ptrdiff")) {
datatype->type = NVSHMEM_PTRDIFF;
datatype->size = sizeof(ptrdiff_t);
datatype->name = "ptrdiff";
} else if (!strcmp(optarg, "float")) {
datatype->type = NVSHMEM_FLOAT;
datatype->size = sizeof(float);
datatype->name = "float";
} else if (!strcmp(optarg, "double")) {
datatype->type = NVSHMEM_DOUBLE;
datatype->size = sizeof(double);
datatype->name = "double";
} else if (!strcmp(optarg, "uint")) {
datatype->type = NVSHMEM_UINT;
datatype->size = sizeof(unsigned int);
datatype->name = "uint";
} else if (!strcmp(optarg, "int32")) {
datatype->type = NVSHMEM_INT32;
datatype->size = sizeof(int32_t);
datatype->name = "int32";
} else if (!strcmp(optarg, "int64")) {
datatype->type = NVSHMEM_INT64;
datatype->size = sizeof(int64_t);
datatype->name = "int64";
} else if (!strcmp(optarg, "uint32")) {
datatype->type = NVSHMEM_UINT32;
datatype->size = sizeof(int32_t);
datatype->name = "uint32";
} else if (!strcmp(optarg, "uint64")) {
datatype->type = NVSHMEM_UINT64;
datatype->size = sizeof(uint64_t);
datatype->name = "uint64";
} else if (!strcmp(optarg, "fp16")) {
datatype->type = NVSHMEM_FP16;
datatype->size = sizeof(half);
datatype->name = "fp16";
} else if (!strcmp(optarg, "bf16")) {
datatype->type = NVSHMEM_BF16;
datatype->size = sizeof(__nv_bfloat16);
datatype->name = "bf16";
}
}
static void reduce_op_parse(char *str, reduce_op_t *reduce_op) {
if (!strcmp(optarg, "min")) {
reduce_op->type = NVSHMEM_MIN;
reduce_op->name = "min";
} else if (!strcmp(optarg, "max")) {
reduce_op->type = NVSHMEM_MAX;
reduce_op->name = "max";
} else if (!strcmp(optarg, "sum")) {
reduce_op->type = NVSHMEM_SUM;
reduce_op->name = "sum";
} else if (!strcmp(optarg, "prod")) {
reduce_op->type = NVSHMEM_PROD;
reduce_op->name = "prod";
} else if (!strcmp(optarg, "and")) {
reduce_op->type = NVSHMEM_AND;
reduce_op->name = "and";
} else if (!strcmp(optarg, "or")) {
reduce_op->type = NVSHMEM_OR;
reduce_op->name = "or";
} else if (!strcmp(optarg, "xor")) {
reduce_op->type = NVSHMEM_XOR;
reduce_op->name = "xor";
}
}
void atomic_op_parse(char *str, amo_t *amo) {
size_t string_length = strnlen(str, 20);
if (strncmp(str, "inc", string_length) == 0) {
amo->type = AMO_INC;
amo->name = "inc";
} else if (strncmp(str, "fetch_inc", string_length) == 0) {
amo->type = AMO_FETCH_INC;
amo->name = "fetch_inc";
} else if (strncmp(str, "set", string_length) == 0) {
amo->type = AMO_SET;
amo->name = "set";
} else if (strncmp(str, "add", string_length) == 0) {
amo->type = AMO_ADD;
amo->name = "add";
} else if (strncmp(str, "fetch_add", string_length) == 0) {
amo->type = AMO_FETCH_ADD;
amo->name = "fetch_add";
} else if (strncmp(str, "and", string_length) == 0) {
amo->type = AMO_AND;
amo->name = "and";
} else if (strncmp(str, "fetch_and", string_length) == 0) {
amo->type = AMO_FETCH_AND;
amo->name = "fetch_and";
} else if (strncmp(str, "or", string_length) == 0) {
amo->type = AMO_OR;
amo->name = "or";
} else if (strncmp(str, "fetch_or", string_length) == 0) {
amo->type = AMO_FETCH_OR;
amo->name = "fetch_or";
} else if (strncmp(str, "xor", string_length) == 0) {
amo->type = AMO_XOR;
amo->name = "xor";
} else if (strncmp(str, "fetch_xor", string_length) == 0) {
amo->type = AMO_FETCH_XOR;
amo->name = "fetch_xor";
} else if (strncmp(str, "swap", string_length) == 0) {
amo->type = AMO_SWAP;
amo->name = "swap";
} else if (strncmp(str, "compare_swap", string_length) == 0) {
amo->type = AMO_COMPARE_SWAP;
amo->name = "compare_swap";
} else {
amo->type = AMO_ACK;
amo->name = "ack";
}
}
/* atol() + optional scaled suffix recognition: 1K, 2M, 3G, 1T */
static inline int atol_scaled(const char *str, size_t *out) {
int scale, n;
double p = -1.0;
char f;
n = sscanf(str, "%lf%c", &p, &f);
if (n == 2) {
switch (f) {
case 'k':
case 'K':
scale = 10;
break;
case 'm':
case 'M':
scale = 20;
break;
case 'g':
case 'G':
scale = 30;
break;
case 't':
case 'T':
scale = 40;
break;
default:
return 1;
}
} else if (p < 0) {
return 1;
} else
scale = 0;
*out = (size_t)ceil(p * (1lu << scale));
return 0;
}
size_t min_size = 4;
size_t max_size = min_size * 1024 * 1024;
size_t num_blocks = 32;
size_t threads_per_block = 256;
size_t iters = 10;
size_t warmup_iters = 5;
size_t step_factor = 2;
size_t max_size_log = 1;
size_t stride = 1;
bool bidirectional = false;
bool report_msgrate = false;
datatype_t datatype = {NVSHMEM_INT, 4, "int"};
reduce_op_t reduce_op = {NVSHMEM_SUM, "sum"};
threadgroup_scope_t threadgroup_scope = {NVSHMEM_ALL_SCOPES, "all_scopes"};
amo_t test_amo = {AMO_INC, "inc"};
putget_issue_t putget_issue = {ON_STREAM, "on_stream"};
dir_t dir = {WRITE, "write"};
void read_args(int argc, char **argv) {
int c;
static struct option long_options[] = {{"bidir", no_argument, 0, 0},
{"report_msgrate", no_argument, 0, 0},
{"dir", required_argument, 0, 0},
{"issue", required_argument, 0, 0},
{"help", no_argument, 0, 'h'},
{"min_size", required_argument, 0, 'b'},
{"max_size", required_argument, 0, 'e'},
{"step", required_argument, 0, 'f'},
{"iters", required_argument, 0, 'n'},
{"warmup_iters", required_argument, 0, 'w'},
{"ctas", required_argument, 0, 'c'},
{"threads_per_cta", required_argument, 0, 't'},
{"datatype", required_argument, 0, 'd'},
{"reduce_op", required_argument, 0, 'o'},
{"scope", required_argument, 0, 's'},
{"atomic_op", required_argument, 0, 'a'},
{"stride", required_argument, 0, 'i'},
{0, 0, 0, 0}};
/* getopt_long stores the option index here. */
int option_index = 0;
while ((c = getopt_long(argc, argv, "hb:e:f:n:w:c:t:d:o:s:a:i:", long_options,
&option_index)) != -1) {
switch (c) {
case 'h':
printf(
"Accepted arguments: \n"
"-b, --min_size <minbytes> \n"
"-e, --max_size <maxbytes> \n"
"-f, --step <step factor for message sizes> \n"
"-n, --iters <number of iterations> \n"
"-w, --warmup_iters <number of warmup iterations> \n"
"-c, --ctas <number of CTAs to launch> (used in some device pt-to-pt tests) \n"
"-t, --threads_per_cta <number of threads per block> (used in some device "
"pt-to-pt tests) \n"
"-d, --datatype: "
"<int, int32_t, uint32_t, int64_t, uint64_t, long, longlong, ulonglong, size, "
"ptrdiff, "
"float, double, fp16, bf16> \n"
"-o, --reduce_op <min, max, sum, prod, and, or, xor> \n"
"-s, --scope <thread, warp, block, all> \n"
"-i, --stride stride between elements \n"
"-a, --atomic_op <inc, add, and, or, xor, set, swap, fetch_<inc, add, and, or, "
"xor>, compare_swap> \n"
"--bidir: run bidirectional test \n"
"--msgrate: report message rate (MMPs)\n"
"--dir: <read, write> (whether to run put or get operations) \n"
"--issue: <on_stream, host> (applicable in some host pt-to-pt tests) \n");
exit(0);
case 0:
if (strcmp(long_options[option_index].name, "bidir") == 0) {
bidirectional = true;
} else if (strcmp(long_options[option_index].name, "msgrate") == 0) {
report_msgrate = true;
} else if (strcmp(long_options[option_index].name, "dir") == 0) {
if (strcmp(optarg, "read") == 0) {
dir.type = READ;
dir.name = "read";
} else {
dir.type = WRITE;
dir.name = "write";
}
} else if (strcmp(long_options[option_index].name, "issue") == 0) {
if (strcmp(optarg, "on_stream") == 0) {
putget_issue.type = ON_STREAM;
putget_issue.name = "on_stream";
} else {
putget_issue.type = HOST;
putget_issue.name = "host";
}
}
break;
case 'b':
atol_scaled(optarg, &min_size);
break;
case 'e':
atol_scaled(optarg, &max_size);
break;
case 'f':
atol_scaled(optarg, &step_factor);
break;
case 'n':
atol_scaled(optarg, &iters);
break;
case 'w':
atol_scaled(optarg, &warmup_iters);
break;
case 'c':
atol_scaled(optarg, &num_blocks);
break;
case 't':
atol_scaled(optarg, &threads_per_block);
break;
case 'i':
atol_scaled(optarg, &stride);
break;
case 'd':
datatype_parse(optarg, &datatype);
break;
case 'o':
reduce_op_parse(optarg, &reduce_op);
break;
case 's':
if (!strcmp(optarg, "thread")) {
threadgroup_scope.type = NVSHMEM_THREAD;
threadgroup_scope.name = "thread";
} else if (!strcmp(optarg, "warp")) {
threadgroup_scope.type = NVSHMEM_WARP;
threadgroup_scope.name = "warp";
} else if (!strcmp(optarg, "block")) {
threadgroup_scope.type = NVSHMEM_BLOCK;
threadgroup_scope.name = "block";
}
break;
case 'a':
atomic_op_parse(optarg, &test_amo);
break;
case '?':
if (optopt == 'c')
fprintf(stderr, "Option -%c requires an argument.\n", optopt);
else if (isprint(optopt))
fprintf(stderr, "Unknown option `-%c'.\n", optopt);
else
fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt);
return;
default:
abort();
}
}
max_size_log = 1;
size_t tmp = max_size;
while (tmp) {
max_size_log += 1;
tmp >>= 1;
}
assert(min_size <= max_size);
printf("Runtime options after parsing command line arguments \n");
printf(
"min_size: %zu, max_size: %zu, step_factor: %zu, iterations: %zu, warmup iterations: %zu, "
"number of ctas: %zu, threads per cta: %zu "
"stride: %zu, datatype: %s, reduce_op: %s, threadgroup_scope: %s, atomic_op: %s, dir: %s, "
"report_msgrate: %d, bidirectional: %d, putget_issue :%s\n",
min_size, max_size, step_factor, iters, warmup_iters, num_blocks, threads_per_block, stride,
datatype.name.c_str(), reduce_op.name.c_str(), threadgroup_scope.name.c_str(),
test_amo.name.c_str(), dir.name.c_str(), report_msgrate, bidirectional,
putget_issue.name.c_str());
printf(
"Note: Above is full list of options, any given test will use only a subset of these "
"variables.\n");
}