// sglang.0.4.8.post1/gdrcopy/tests/common.cpp

/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#include <map>

#include <cuda.h>

#include "common.hpp"
namespace gdrcopy {
namespace test {

bool print_dbg_msg = false;

void print_dbg(const char *fmt, ...)
{
    if (print_dbg_msg) {
        va_list ap;
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
    }
}
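
/*
 * Allocate GPU memory with cuMemAlloc. If aligned_mapping is true, we
 * over-allocate by up to GPU_PAGE_SIZE - 1 bytes and round the returned
 * pointer up to the next GPU page boundary; the original pointer is kept in
 * handle->unaligned_ptr so gpu_mem_free can pass it back to cuMemFree. If
 * set_sync_memops is true, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS is set so the
 * driver synchronizes every synchronous memory operation on this allocation.
 *
 * Typical usage in this file (see check_gdr_support below):
 *     gpu_mem_handle_t mhandle;
 *     ASSERTDRV(gpu_mem_alloc(&mhandle, size, true, true));
 *     // ... use mhandle.ptr ...
 *     ASSERTDRV(gpu_mem_free(&mhandle));
 */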
CUresult gpu_mem_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
{
    CUresult ret = CUDA_SUCCESS;
    CUdeviceptr ptr, out_ptr;
    size_t allocated_size;

    if (aligned_mapping)
        allocated_size = size + GPU_PAGE_SIZE - 1;
    else
        allocated_size = size;

    ret = cuMemAlloc(&ptr, allocated_size);
    if (ret != CUDA_SUCCESS)
        return ret;

    if (set_sync_memops) {
        unsigned int flag = 1;
        ret = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr);
        if (ret != CUDA_SUCCESS) {
            cuMemFree(ptr);
            return ret;
        }
    }

    if (aligned_mapping)
        out_ptr = PAGE_ROUND_UP(ptr, GPU_PAGE_SIZE);
    else
        out_ptr = ptr;

    handle->ptr = out_ptr;
    handle->unaligned_ptr = ptr;
    handle->size = size;
    handle->allocated_size = allocated_size;

    return CUDA_SUCCESS;
}
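
/*
 * Free memory allocated with gpu_mem_alloc and clear the handle on success.
 */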
CUresult gpu_mem_free(gpu_mem_handle_t *handle)
{
    CUresult ret = CUDA_SUCCESS;

    // Free via the unaligned pointer that cuMemAlloc actually returned.
    ret = cuMemFree(handle->unaligned_ptr);
    if (ret == CUDA_SUCCESS)
        memset(handle, 0, sizeof(gpu_mem_handle_t));

    return ret;
}
#if CUDA_VERSION >= 11000
/**
 * Allocate GPU memory using the VMM API.
 * The VMM API has been available since CUDA 10.2, but GPUDirect RDMA support
 * for it was added only in CUDA 11.0. Our tests are not useful without RDMA
 * support, so we enable this VMM allocation path starting with CUDA 11.0.
 */
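/*
 * The allocation sequence below is: reserve a VA range (cuMemAddressReserve),
 * create a physical allocation flagged gpuDirectRDMACapable (cuMemCreate),
 * map it into the reserved range (cuMemMap), and grant read/write access
 * (cuMemSetAccess). Any failure unwinds the completed steps at the "out"
 * label.
 */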
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
{
    CUresult ret = CUDA_SUCCESS;

    size_t granularity, gran;
    CUmemAllocationProp mprop;
    CUdevice gpu_dev;
    size_t rounded_size;
    CUdeviceptr ptr = 0;
    CUmemGenericAllocationHandle mem_handle = 0;
    bool is_mapped = false;

    int RDMASupported = 0;
    int version;

    ret = cuDriverGetVersion(&version);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuDriverGetVersion\n");
        goto out;
    }

    if (version < 11000) {
        print_dbg("VMM with RDMA is not supported in this CUDA version.\n");
        ret = CUDA_ERROR_NOT_SUPPORTED;
        goto out;
    }

    ret = cuCtxGetDevice(&gpu_dev);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuCtxGetDevice\n");
        goto out;
    }

    ret = cuDeviceGetAttribute(&RDMASupported, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, gpu_dev);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuDeviceGetAttribute\n");
        goto out;
    }

    if (!RDMASupported) {
        print_dbg("GPUDirect RDMA is not supported on this GPU.\n");
        ret = CUDA_ERROR_NOT_SUPPORTED;
        goto out;
    }

    memset(&mprop, 0, sizeof(CUmemAllocationProp));
    mprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    mprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    mprop.location.id = gpu_dev;
    mprop.allocFlags.gpuDirectRDMACapable = 1;

    ret = cuMemGetAllocationGranularity(&gran, &mprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuMemGetAllocationGranularity\n");
        goto out;
    }

    // In case gran is smaller than GPU_PAGE_SIZE
    granularity = PAGE_ROUND_UP(gran, GPU_PAGE_SIZE);

    rounded_size = PAGE_ROUND_UP(size, granularity);
    ret = cuMemAddressReserve(&ptr, rounded_size, granularity, 0, 0);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuMemAddressReserve\n");
        goto out;
    }

    ret = cuMemCreate(&mem_handle, rounded_size, &mprop, 0);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuMemCreate\n");
        goto out;
    }

    ret = cuMemMap(ptr, rounded_size, 0, mem_handle, 0);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuMemMap\n");
        goto out;
    }
    is_mapped = true;

    CUmemAccessDesc access;
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id = gpu_dev;
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    ret = cuMemSetAccess(ptr, rounded_size, &access, 1);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuMemSetAccess\n");
        goto out;
    }

    // cuMemAddressReserve always returns an aligned ptr
    handle->ptr = ptr;
    handle->handle = mem_handle;
    handle->size = size;
    handle->allocated_size = rounded_size;

out:
    if (ret != CUDA_SUCCESS) {
        if (is_mapped)
            cuMemUnmap(ptr, rounded_size);

        if (mem_handle)
            cuMemRelease(mem_handle);

        if (ptr)
            cuMemAddressFree(ptr, rounded_size);
    }
    return ret;
}
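
/*
 * Tear down a VMM allocation in reverse order: unmap the range, release the
 * allocation handle, then free the reserved VA range. The handle is cleared
 * on success.
 */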
CUresult gpu_vmm_free(gpu_mem_handle_t *handle)
{
    CUresult ret;

    if (!handle || !handle->ptr)
        return CUDA_ERROR_INVALID_VALUE;

    ret = cuMemUnmap(handle->ptr, handle->allocated_size);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuMemUnmap\n");
        return ret;
    }

    ret = cuMemRelease(handle->handle);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuMemRelease\n");
        return ret;
    }

    ret = cuMemAddressFree(handle->ptr, handle->allocated_size);
    if (ret != CUDA_SUCCESS) {
        print_dbg("error in cuMemAddressFree\n");
        return ret;
    }

    memset(handle, 0, sizeof(gpu_mem_handle_t));
    return CUDA_SUCCESS;
}
#else
/* VMM with RDMA is not available before CUDA 11.0 */
CUresult gpu_vmm_alloc(gpu_mem_handle_t *handle, const size_t size, bool aligned_mapping, bool set_sync_memops)
{
    return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult gpu_vmm_free(gpu_mem_handle_t *handle)
{
    return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
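
/*
 * Compare two buffers dword by dword. Prints up to the first 10 mismatches
 * and returns the total number of differing dwords (0 means identical).
 */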
int compare_buf(uint32_t *ref_buf, uint32_t *buf, size_t size)
{
    int diff = 0;

    if (size % 4 != 0U) {
        print_dbg("warning: buffer size %zu is not dword aligned, ignoring trailing bytes\n", size);
        size -= (size % 4);
    }

    unsigned ndwords = size / sizeof(uint32_t);
    for (unsigned w = 0; w < ndwords; ++w) {
        if (ref_buf[w] != buf[w]) {
            if (!diff) {
                printf("%10.10s %8.8s %8.8s\n", "word", "content", "expected");
            }
            if (diff < 10) {
                printf("%10u %08x %08x\n", w, buf[w], ref_buf[w]);
            }
            ++diff;
        }
    }

    if (diff) {
        print_dbg("check error: %d different dwords out of %u\n", diff, ndwords);
    }

    return diff;
}
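
/*
 * Fill a host buffer with a walking-bit pattern: each dword is a fixed base
 * value with one bit flipped, the flipped position walking through all 32
 * bit positions. size must be a multiple of 4 bytes.
 */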
void init_hbuf_walking_bit(uint32_t *h_buf, size_t size)
{
    uint32_t base_value = 0x3F4C5E6A; // 0xa55ad33d;
    unsigned w;

    ASSERT_NEQ(h_buf, (void *)0);
    ASSERT_EQ(size % 4, 0U);

    for (w = 0; w < size / sizeof(uint32_t); ++w)
        h_buf[w] = base_value ^ (1u << (w % 32));
}
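
/*
 * Fill a host buffer with a linear ramp: dword w gets the value w.
 * size must be a multiple of 4 bytes.
 */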
void init_hbuf_linear_ramp(uint32_t *h_buf, size_t size)
{
    unsigned w;

    ASSERT_NEQ(h_buf, (void *)0);
    ASSERT_EQ(size % 4, 0U);

    for (w = 0; w < size / sizeof(uint32_t); ++w)
        h_buf[w] = w;
}
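
/*
 * Detect whether the GPU supports GPUDirect RDMA. With CUDA 11.3+ drivers we
 * query CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED directly; on older
 * drivers we probe by attempting to pin a page of GPU memory with
 * gdr_pin_buffer.
 */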
bool check_gdr_support(CUdevice dev)
{
#if CUDA_VERSION >= 11030
    int drv_version;
    ASSERTDRV(cuDriverGetVersion(&drv_version));

    // Starting from CUDA 11.3, CUDA provides a way to query GPUDirect RDMA support.
    if (drv_version >= 11030) {
        int gdr_support = 0;
        ASSERTDRV(cuDeviceGetAttribute(&gdr_support, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev));

        if (!gdr_support)
            print_dbg("This GPU does not support GPUDirect RDMA.\n");

        return !!gdr_support;
    }
#endif

    // For older versions, we fall back to detecting this support with gdr_pin_buffer.
    const size_t size = GPU_PAGE_SIZE;
    CUdeviceptr d_A;
    gpu_mem_handle_t mhandle;

    ASSERTDRV(gpu_mem_alloc(&mhandle, size, true, true));
    d_A = mhandle.ptr;

    gdr_t g = gdr_open_safe();

    gdr_mh_t mh;
    int status = gdr_pin_buffer(g, d_A, size, 0, 0, &mh);
    if (status != 0) {
        print_dbg("error in gdr_pin_buffer with code=%d\n", status);
        print_dbg("Your GPU might not support GPUDirect RDMA\n");
    }
    else
        ASSERT_EQ(gdr_unpin_buffer(g, mh), 0);

    ASSERT_EQ(gdr_close(g), 0);
    ASSERTDRV(gpu_mem_free(&mhandle));

    return status == 0;
}
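
/*
 * Print a simple text histogram of latency samples. num_bins equal-width
 * bins cover [min, max]; each output line shows a bin's bounds and count.
 */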
void print_histogram(double *lat_arr, int count, int *bin_arr, int num_bins, double min, double max)
{
    // Bin width; guard against a zero or negative width.
    double den = (max - min) / num_bins;
    den = den > 0 ? den : 1;

    for (int j = 0; j < num_bins; j++)
        bin_arr[j] = 0;

    for (int i = 0; i < count; i++) {
        int bin = (int)((lat_arr[i] - min) / den);
        // Clamp so that a sample equal to max lands in the last bin.
        if (bin >= num_bins)
            bin = num_bins - 1;
        bin_arr[bin]++;
    }

    for (int j = 0; j < num_bins; j++) {
        printf("[%lf\t-\t%lf]\t%d\n", min + j * den, min + (j + 1) * den, bin_arr[j]);
    }
}

} // namespace test
} // namespace gdrcopy