sglang.0.4.8.post1/nvshmem_src/perftest/device/pt-to-pt/shmem_g_latency.cu

/*
 * Copyright (c) 2018-2020, NVIDIA CORPORATION.  All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 * See COPYRIGHT.txt for license information
 */

#define CUMODULE_NAME "shmem_g_latency.cubin"

#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <unistd.h>
#include "utils.h"

CUfunction test_cubin;

#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
extern "C" {
#endif

__global__ void pull(int *data_d, int len, int pe, int iter) {
    int i, j, tid, peer;

    peer = !pe;
    tid = threadIdx.x;

    for (i = 0; i < iter; i++) {
        if (!pe) {
            for (j = tid; j < len; j += blockDim.x) {
                *(data_d + j) = nvshmem_int_g(data_d + j, peer);
            }

            __syncthreads();
        }
    }
}

#if defined __cplusplus || defined NVSHMEM_BITCODE_APPLICATION
}
#endif

void test_pull(int *data_d, int len, int pe, int iter, CUfunction kernel) {
    if (use_cubin) {
        void *arglist[] = {(void *)&data_d, (void *)&len, (void *)&pe, (void *)&iter};
        CU_CHECK(cuLaunchKernel(kernel, 1, 1, 1, threads_per_block, 1, 1, 0, NULL, arglist, NULL));
    } else {
        pull<<<1, threads_per_block>>>(data_d, len, pe, iter);
    }
}

int main(int argc, char *argv[]) {
    int mype, npes, size;
    int *data_d = NULL;

    read_args(argc, argv);

    int iter = iters;
    int skip = warmup_iters;

    int array_size, i;
    void **h_tables;
    uint64_t *h_size_arr;
    double *h_lat;

    float milliseconds;
    cudaEvent_t start, stop;
    CUfunction test_cubin = NULL;

    init_wrapper(&argc, &argv);

    if (use_cubin) {
        init_cumodule(CUMODULE_NAME);
        init_test_case_kernel(&test_cubin, "pull");
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();

    if (npes != 2) {
        fprintf(stderr, "This test requires exactly two processes \n");
        goto finalize;
    }

    array_size = max_size_log;
    alloc_tables(&h_tables, 2, array_size);
    h_size_arr = (uint64_t *)h_tables[0];
    h_lat = (double *)h_tables[1];

    data_d = (int *)nvshmem_malloc(max_size);
    CUDA_CHECK(cudaMemset(data_d, 0, max_size));

    nvshmem_barrier_all();

    CUDA_CHECK(cudaDeviceSynchronize());

    i = 0;
    for (size = min_size; size <= max_size; size *= step_factor) {
        int nelems;
        h_size_arr[i] = size;
        nelems = size / sizeof(int);

        test_pull(data_d, nelems, mype, skip, test_cubin);
        cudaEventRecord(start);
        test_pull(data_d, nelems, mype, iter, test_cubin);
        cudaEventRecord(stop);

        CUDA_CHECK(cudaEventSynchronize(stop));

        cudaEventElapsedTime(&milliseconds, start, stop);
        /* give latency in us */
        h_lat[i] = (milliseconds * 1000) / iter;
        nvshmem_barrier_all();
        i++;
    }

    if (mype == 0) {
        print_table_basic("shmem_g_latency", "None", "size (Bytes)", "latency", "us", '-',
                          h_size_arr, h_lat, i);
    }

finalize:

    if (data_d) nvshmem_free(data_d);
    free_tables(h_tables, 2);
    finalize_wrapper();

    return 0;
}