// sglang_v0.5.2/gdrcopy/src/gdrapi.c
/*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdarg.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <netdb.h>
#include <malloc.h>
#include <getopt.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <time.h>
#include <asm/types.h>
#include <assert.h>
#include <sys/queue.h>
#include "gdrconfig.h"
#include "gdrapi.h"
#include "gdrdrv.h"
#include "gdrapi_internal.h"
// logging/tracing
enum gdrcopy_msg_level {
GDRCOPY_MSG_DEBUG = 1,
GDRCOPY_MSG_INFO,
GDRCOPY_MSG_WARN,
GDRCOPY_MSG_ERROR
};
static int gdr_msg_level = GDRCOPY_MSG_ERROR;
static int gdr_enable_logging = -1;
static void gdr_msg(enum gdrcopy_msg_level lvl, const char* fmt, ...)
{
if (-1 == gdr_enable_logging) {
const char *env = getenv("GDRCOPY_ENABLE_LOGGING");
if (env)
gdr_enable_logging = 1;
else
gdr_enable_logging = 0;
env = getenv("GDRCOPY_LOG_LEVEL");
if (env)
gdr_msg_level = atoi(env);
}
if (gdr_enable_logging) {
if (lvl >= gdr_msg_level) {
va_list ap;
va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
va_end(ap);
}
}
}
#define gdr_dbg(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_DEBUG, "DBG: " FMT, ## ARGS)
#define gdr_dbgc(C, FMT, ARGS...) do { static int gdr_dbg_cnt=(C); if (gdr_dbg_cnt) { gdr_dbg(FMT, ## ARGS); --gdr_dbg_cnt; }} while (0)
#define gdr_info(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_INFO, "INFO: " FMT, ## ARGS)
#define gdr_warn(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_WARN, "WARN: " FMT, ## ARGS)
#define gdr_err(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_ERROR, "ERR: " FMT, ## ARGS)
static gdr_memh_t *to_memh(gdr_mh_t mh) {
return (gdr_memh_t *)mh.h;
}
static gdr_mh_t from_memh(gdr_memh_t *memh) {
gdr_mh_t mh;
mh.h = (unsigned long)memh;
return mh;
}
static void gdr_init_cpu_flags(void);
static inline int gdr_is_mapped(const gdr_mapping_type_t mapping_type)
{
return mapping_type != GDR_MAPPING_TYPE_NONE;
}
gdr_t gdr_open(void)
{
gdr_t g = NULL;
const char *gdrinode = "/dev/gdrdrv";
int ret;
g = calloc(1, sizeof(*g));
if (!g) {
gdr_err("error while allocating memory\n");
return NULL;
}
int fd = open(gdrinode, O_RDWR | O_CLOEXEC);
    if (-1 == fd) {
ret = errno;
gdr_err("error opening driver (errno=%d/%s)\n", ret, strerror(ret));
goto err_mem;
}
struct GDRDRV_IOC_GET_VERSION_PARAMS params;
int retcode = ioctl(fd, GDRDRV_IOC_GET_VERSION, &params);
if (0 != retcode) {
ret = errno;
gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret);
goto err_fd;
}
if (params.gdrdrv_version < MINIMUM_GDRDRV_VERSION) {
gdr_err(
"The minimum required gdrdrv driver version is %d.%d but the current gdrdrv version is %d.%d\n",
MINIMUM_GDRDRV_MAJOR_VERSION,
MINIMUM_GDRDRV_MINOR_VERSION,
params.gdrdrv_version >> MAJOR_VERSION_SHIFT,
params.gdrdrv_version & MINOR_VERSION_MASK
);
goto err_fd;
}
if (params.minimum_gdr_api_version > GDR_API_VERSION) {
gdr_err(
"gdrdrv driver requires libgdrapi version %d.%d or above but the current libgdrapi version is %d.%d\n",
params.minimum_gdr_api_version >> MAJOR_VERSION_SHIFT,
params.minimum_gdr_api_version & MINOR_VERSION_MASK,
GDR_API_MAJOR_VERSION,
GDR_API_MINOR_VERSION
);
goto err_fd;
}
g->fd = fd;
LIST_INIT(&g->memhs);
gdr_init_cpu_flags();
// Initialize page_shift, page_size, and page_mask.
g->page_size = sysconf(_SC_PAGESIZE);
g->page_mask = ~(g->page_size - 1);
size_t ps_tmp = g->page_size;
g->page_shift = -1;
while (ps_tmp > 0) {
++g->page_shift;
if ((ps_tmp & 0x1) == 1)
break;
ps_tmp >>= 1;
}
g->gdrdrv_version = params.gdrdrv_version;
return g;
err_fd:
close(fd);
err_mem:
free(g);
return NULL;
}
int gdr_close(gdr_t g)
{
int ret = 0;
int retcode;
gdr_memh_t *mh, *next_mh;
mh = g->memhs.lh_first;
while (mh != NULL) {
// gdr_unpin_buffer frees mh, so we need to get the next one
// beforehand.
next_mh = mh->entries.le_next;
ret = gdr_unpin_buffer(g, from_memh(mh));
if (ret) {
gdr_err("error unpinning buffer inside gdr_close (errno=%d/%s)\n", ret, strerror(ret));
return ret;
}
mh = next_mh;
}
retcode = close(g->fd);
if (-1 == retcode) {
ret = errno;
gdr_err("error closing driver (errno=%d/%s)\n", ret, strerror(ret));
}
g->fd = 0;
free(g);
return ret;
}
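
/*
 * Usage sketch (illustrative only, kept out of the build with #if 0): the
 * canonical open -> pin -> map -> copy -> unmap -> unpin -> close sequence.
 * `d_ptr' stands for a GPU-page-aligned device address obtained elsewhere
 * (e.g. from cuMemAlloc); error handling is abbreviated. With d_ptr aligned,
 * the pointer returned by gdr_map aliases d_ptr directly; otherwise the
 * (d_ptr - info.va) adjustment shown after gdr_get_info_v2 below is needed.
 */
#if 0
static int example_lifecycle(unsigned long d_ptr, size_t size)
{
    gdr_t g = gdr_open();
    if (!g)
        return 1;

    gdr_mh_t mh;
    void *map = NULL;
    char data[8] = "example";

    if (gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh))
        goto close_out;
    if (gdr_map(g, mh, &map, size))
        goto unpin_out;

    gdr_copy_to_mapping(mh, map, data, sizeof(data));   // CPU -> GPU
    gdr_copy_from_mapping(mh, data, map, sizeof(data)); // GPU -> CPU

    gdr_unmap(g, mh, map, size);
unpin_out:
    gdr_unpin_buffer(g, mh);
close_out:
    gdr_close(g);
    return 0;
}
#endif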
int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle)
{
int ret = 0;
int retcode;
if (!handle) {
return EINVAL;
}
gdr_memh_t *mh = calloc(1, sizeof(gdr_memh_t));
if (!mh) {
return ENOMEM;
}
struct GDRDRV_IOC_PIN_BUFFER_PARAMS params;
params.addr = addr;
params.size = size;
params.p2p_token = p2p_token;
params.va_space = va_space;
params.handle = 0;
retcode = ioctl(g->fd, GDRDRV_IOC_PIN_BUFFER, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
free(mh);
goto err;
}
mh->handle = params.handle;
LIST_INSERT_HEAD(&g->memhs, mh, entries);
*handle = from_memh(mh);
err:
return ret;
}
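
/*
 * Pinning sketch (illustrative only): gdrdrv pins whole GPU pages, so callers
 * usually widen the request to GPU_PAGE_SIZE boundaries (64 KiB, defined in
 * gdrapi.h) before pinning. p2p_token and va_space are 0 in the ordinary,
 * non-tokenized flow; `g', `d_ptr', and `size' are assumed from context.
 */
#if 0
    unsigned long start = d_ptr & GPU_PAGE_MASK;  // round down to a GPU page
    size_t pin_size = ((d_ptr + size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK) - start;
    gdr_mh_t mh;
    int rc = gdr_pin_buffer(g, start, pin_size, 0, 0, &mh);  // 0 or an errno value
#endif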
int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_UNPIN_BUFFER, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
}
LIST_REMOVE(mh, entries);
free(mh);
return ret;
}
int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
struct GDRDRV_IOC_GET_CB_FLAG_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_CB_FLAG, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
} else {
*flag = params.flag;
}
return ret;
}
int gdr_get_info_v2(gdr_t g, gdr_mh_t handle, gdr_info_v2_t *info)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
if (g->gdrdrv_version >= GDRDRV_MINIMUM_VERSION_WITH_GET_INFO_V2) {
struct GDRDRV_IOC_GET_INFO_V2_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO_V2, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
goto out;
} else {
info->va = params.va;
info->mapped_size = params.mapped_size;
info->page_size = params.page_size;
info->tm_cycles = params.tm_cycles;
info->cycles_per_ms = params.tsc_khz;
info->mapped = gdr_is_mapped(params.mapping_type);
info->wc_mapping = (params.mapping_type == GDR_MAPPING_TYPE_WC);
info->mapping_type = params.mapping_type;
}
}
else
{
struct GDRDRV_IOC_GET_INFO_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
goto out;
} else {
info->va = params.va;
info->mapped_size = params.mapped_size;
info->page_size = params.page_size;
info->tm_cycles = params.tm_cycles;
info->cycles_per_ms = params.tsc_khz;
info->mapped = params.mapped;
info->wc_mapping = params.wc_mapping;
info->mapping_type = params.mapped ? (params.wc_mapping ? GDR_MAPPING_TYPE_WC : GDR_MAPPING_TYPE_CACHING) : GDR_MAPPING_TYPE_NONE;
}
}
out:
return ret;
}
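
/*
 * Info sketch (illustrative only): info.va reports the GPU-page-aligned start
 * of the pinned range, which may sit below the address originally passed to
 * gdr_pin_buffer. The host view of a pinned device address is therefore the
 * base returned by gdr_map plus (d_ptr - info.va); `map' and `d_ptr' are
 * assumed from context.
 */
#if 0
    gdr_info_v2_t info;
    if (gdr_get_info_v2(g, mh, &info) == 0) {
        void *h_view = (char *)map + (d_ptr - info.va);  // host alias of d_ptr
    }
#endif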
int gdr_map(gdr_t g, gdr_mh_t handle, void **ptr_va, size_t size)
{
int ret = 0;
gdr_info_v2_t info = {0,};
gdr_memh_t *mh = to_memh(handle);
if (gdr_is_mapped(mh->mapping_type)) {
gdr_err("mh is mapped already\n");
return EAGAIN;
}
size_t rounded_size = (size + g->page_size - 1) & g->page_mask;
off_t magic_off = (off_t)mh->handle << g->page_shift;
void *mmio = mmap(NULL, rounded_size, PROT_READ|PROT_WRITE, MAP_SHARED, g->fd, magic_off);
if (mmio == MAP_FAILED) {
int __errno = errno;
mmio = NULL;
gdr_err("error %s(%d) while mapping handle %x, rounded_size=%zu offset=%llx\n",
strerror(__errno), __errno, handle, rounded_size, (long long unsigned)magic_off);
ret = __errno;
goto err;
}
*ptr_va = mmio;
ret = gdr_get_info_v2(g, handle, &info);
if (ret) {
gdr_err("error %d from get_info, munmapping before exiting\n", ret);
munmap(mmio, rounded_size);
goto err;
}
if (!gdr_is_mapped(info.mapping_type)) {
        // A race can lead here: e.g., if gdr_map and cuMemFree run
        // concurrently, the mmap above may succeed just before cuMemFree
        // invalidates the mapping.
gdr_err("mh is not mapped\n");
ret = EAGAIN;
}
mh->mapping_type = info.mapping_type;
gdr_dbg("mapping_type=%d\n", mh->mapping_type);
err:
return ret;
}
int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size)
{
int ret = 0;
int retcode = 0;
size_t rounded_size;
gdr_memh_t *mh = to_memh(handle);
rounded_size = (size + g->page_size - 1) & g->page_mask;
if (!gdr_is_mapped(mh->mapping_type)) {
gdr_err("mh is not mapped yet\n");
return EINVAL;
}
retcode = munmap(va, rounded_size);
if (-1 == retcode) {
int __errno = errno;
gdr_err("error %s(%d) while unmapping handle %x, rounded_size=%zu\n",
strerror(__errno), __errno, handle, rounded_size);
ret = __errno;
goto err;
}
mh->mapping_type = GDR_MAPPING_TYPE_NONE;
err:
return ret;
}
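
/*
 * A worked example of the rounding used by gdr_map/gdr_unmap, assuming a
 * 4 KiB system page: page_mask = ~(4096 - 1) = ~0xFFF, so size = 100 rounds
 * up to (100 + 4095) & ~0xFFF = 4096, while size = 8192 stays 8192. The two
 * functions must be called with the same size so the rounded lengths match.
 */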
#ifdef GDRAPI_X86
#include <cpuid.h>
// prepare for AVX2 implementation
#ifndef bit_AVX2
/* Extended Features (%eax == 7) */
/* %ebx */
#define bit_AVX2 (1 << 5)
#endif
#include <immintrin.h>
extern int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes);
static inline void wc_store_fence(void) { _mm_sfence(); }
#define PREFERS_STORE_UNROLL4 0
#define PREFERS_STORE_UNROLL8 0
#define PREFERS_LOAD_UNROLL4 0
#define PREFERS_LOAD_UNROLL8 0
// GDRAPI_X86
#elif defined(GDRAPI_POWER)
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
static inline void wc_store_fence(void) { asm volatile("sync") ; }
#define PREFERS_STORE_UNROLL4 1
#define PREFERS_STORE_UNROLL8 0
#define PREFERS_LOAD_UNROLL4 0
#define PREFERS_LOAD_UNROLL8 1
// GDRAPI_POWER
#elif defined(GDRAPI_ARM64)
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; }
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
static inline void wc_store_fence(void) { asm volatile("DMB ishld") ; }
#define PREFERS_STORE_UNROLL4 0
#define PREFERS_STORE_UNROLL8 0
#define PREFERS_LOAD_UNROLL4 0
#define PREFERS_LOAD_UNROLL8 0
// GDRAPI_ARM64
#endif
static int has_sse = 0;
static int has_sse2 = 0;
static int has_sse4_1 = 0;
static int has_avx = 0;
static int has_avx2 = 0;
static void gdr_init_cpu_flags(void)
{
#ifdef GDRAPI_X86
unsigned int info_type = 0x00000001;
unsigned int ax, bx, cx, dx;
if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
has_sse4_1 = ((cx & bit_SSE4_1) != 0);
has_avx = ((cx & bit_AVX) != 0);
has_sse = ((dx & bit_SSE) != 0);
has_sse2 = ((dx & bit_SSE2) != 0);
gdr_dbg("sse4_1=%d avx=%d sse=%d sse2=%d\n", has_sse4_1, has_avx, has_sse, has_sse2);
}
#ifdef bit_AVX2
info_type = 0x7;
if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
has_avx2 = bx & bit_AVX2;
}
#endif // bit_AVX2
#endif // GDRAPI_X86
#ifdef GDRAPI_POWER
// detect and enable Altivec/SMX support
#endif
}
// note: more than one implementation may be compiled in
static void unroll8_memcpy(void *dst, const void *src, size_t size)
{
const uint64_t *r = (const uint64_t *)src;
uint64_t *w = (uint64_t *)dst;
size_t nw = size / sizeof(*r);
assert(size % sizeof(*r) == 0);
while (nw) {
if (0 == (nw & 3)) {
uint64_t r0 = r[0];
uint64_t r1 = r[1];
uint64_t r2 = r[2];
uint64_t r3 = r[3];
w[0] = r0;
w[1] = r1;
w[2] = r2;
w[3] = r3;
r += 4;
w += 4;
nw -= 4;
} else if (0 == (nw & 1)) {
uint64_t r0 = r[0];
uint64_t r1 = r[1];
w[0] = r0;
w[1] = r1;
r += 2;
w += 2;
nw -= 2;
} else {
w[0] = r[0];
++w;
++r;
--nw;
}
}
}
static void unroll4_memcpy(void *dst, const void *src, size_t size)
{
const uint32_t *r = (const uint32_t *)src;
uint32_t *w = (uint32_t *)dst;
size_t nw = size / sizeof(*r);
assert(size % sizeof(*r) == 0);
while (nw) {
if (0 == (nw & 3)) {
uint32_t r0 = r[0];
uint32_t r1 = r[1];
uint32_t r2 = r[2];
uint32_t r3 = r[3];
w[0] = r0;
w[1] = r1;
w[2] = r2;
w[3] = r3;
r += 4;
w += 4;
nw -= 4;
} else if (0 == (nw & 1)) {
uint32_t r0 = r[0];
uint32_t r1 = r[1];
w[0] = r0;
w[1] = r1;
r += 2;
w += 2;
nw -= 2;
} else {
w[0] = r[0];
++w;
++r;
--nw;
}
}
}
static inline int is_aligned(unsigned long value, unsigned powof2)
{
return ((value & (powof2-1)) == 0);
}
static inline int ptr_is_aligned(const void *ptr, unsigned powof2)
{
unsigned long addr = (unsigned long)ptr;
return is_aligned(addr, powof2);
}
static inline void memcpy_to_device_mapping(void *dst, const void *src, size_t size)
{
size_t remaining_size = size;
void *curr_map_d_ptr = dst;
const void *curr_h_ptr = src;
size_t copy_size = 0;
while (remaining_size > 0) {
if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) {
// We have proper alignment. memcpy can be used here. Although
// unlikely, this might break in the future if the implementation
// of memcpy changes to generate unaligned access. Still, we choose
// memcpy because it provides better performance than our simple
// aligned-access workaround.
memcpy(curr_map_d_ptr, curr_h_ptr, remaining_size);
copy_size = remaining_size;
}
else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) {
// memcpy cannot be used here because its internal
// implementation may end up in an unaligned access.
WRITE_ONCE(*(uint64_t *)curr_map_d_ptr, *(uint64_t *)curr_h_ptr);
copy_size = sizeof(uint64_t);
}
else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) {
WRITE_ONCE(*(uint32_t *)curr_map_d_ptr, *(uint32_t *)curr_h_ptr);
copy_size = sizeof(uint32_t);
}
else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) {
WRITE_ONCE(*(uint16_t *)curr_map_d_ptr, *(uint16_t *)curr_h_ptr);
copy_size = sizeof(uint16_t);
}
else {
WRITE_ONCE(*(uint8_t *)curr_map_d_ptr, *(uint8_t *)curr_h_ptr);
copy_size = sizeof(uint8_t);
}
remaining_size -= copy_size;
curr_map_d_ptr = (void *)((uintptr_t)curr_map_d_ptr + copy_size);
curr_h_ptr = (const void *)((uintptr_t)curr_h_ptr + copy_size);
}
}
static inline void memcpy_from_device_mapping(void *dst, const void *src, size_t size)
{
size_t remaining_size = size;
const void *curr_map_d_ptr = src;
void *curr_h_ptr = dst;
size_t copy_size = 0;
while (remaining_size > 0) {
if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) {
// We have proper alignment. memcpy can be used here. Although
// unlikely, this might break in the future if the implementation
// of memcpy changes to generate unaligned access. Still, we choose
// memcpy because it provides better performance than our simple
// aligned-access workaround.
memcpy(curr_h_ptr, curr_map_d_ptr, remaining_size);
copy_size = remaining_size;
}
else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) {
// memcpy cannot be used here because its internal
// implementation may end up in an unaligned access.
*(uint64_t *)curr_h_ptr = READ_ONCE(*(uint64_t *)curr_map_d_ptr);
copy_size = sizeof(uint64_t);
}
else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) {
*(uint32_t *)curr_h_ptr = READ_ONCE(*(uint32_t *)curr_map_d_ptr);
copy_size = sizeof(uint32_t);
}
else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) {
*(uint16_t *)curr_h_ptr = READ_ONCE(*(uint16_t *)curr_map_d_ptr);
copy_size = sizeof(uint16_t);
}
else {
*(uint8_t *)curr_h_ptr = READ_ONCE(*(uint8_t *)curr_map_d_ptr);
copy_size = sizeof(uint8_t);
}
remaining_size -= copy_size;
curr_map_d_ptr = (const void *)((uintptr_t)curr_map_d_ptr + copy_size);
curr_h_ptr = (void *)((uintptr_t)curr_h_ptr + copy_size);
}
}
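
/*
 * Decomposition example for the two helpers above (illustrative only): with
 * the device-mapping pointer 8-byte aligned and size = 13, the loop issues
 * one 8-byte, one 4-byte, and one 1-byte access instead of a single memcpy,
 * so no access to the mapping straddles its natural alignment. That matters
 * for GDR_MAPPING_TYPE_DEVICE mappings, where an unaligned MMIO access may
 * fault or be split on the interconnect.
 */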
static int gdr_copy_to_mapping_internal(void *map_d_ptr, const void *h_ptr, size_t size, gdr_mapping_type_t mapping_type)
{
const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC);
const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE);
do {
        // For very small sizes and aligned pointers, use a single plain store.
if (size == sizeof(uint8_t)) {
WRITE_ONCE(*(uint8_t *)map_d_ptr, *(uint8_t *)h_ptr);
goto do_fence;
} else if (size == sizeof(uint16_t) && ptr_is_aligned(map_d_ptr, sizeof(uint16_t))) {
WRITE_ONCE(*(uint16_t *)map_d_ptr, *(uint16_t *)h_ptr);
goto do_fence;
} else if (size == sizeof(uint32_t) && ptr_is_aligned(map_d_ptr, sizeof(uint32_t))) {
WRITE_ONCE(*(uint32_t *)map_d_ptr, *(uint32_t *)h_ptr);
goto do_fence;
} else if (size == sizeof(uint64_t) && ptr_is_aligned(map_d_ptr, sizeof(uint64_t))) {
WRITE_ONCE(*(uint64_t *)map_d_ptr, *(uint64_t *)h_ptr);
goto do_fence;
}
        // Pick the best-performing implementation compatible with the platform
        // we are running on.
        // NOTE: write fences are included in the functions below.
if (has_avx) {
assert(wc_mapping);
gdr_dbgc(1, "using AVX implementation of gdr_copy_to_mapping\n");
memcpy_uncached_store_avx(map_d_ptr, h_ptr, size);
goto out;
}
if (has_sse) {
assert(wc_mapping);
gdr_dbgc(1, "using SSE implementation of gdr_copy_to_mapping\n");
memcpy_uncached_store_sse(map_d_ptr, h_ptr, size);
goto out;
}
        // On POWER, the compiler/libc memcpy is not optimal for MMIO.
        // 64-bit stores are no better than 32-bit ones, so we prefer the latter.
        // NOTE: if unrolling is preferred but the pointers are not aligned, a better
        // implementation would still use byte-sized stores to align map_d_ptr and
        // h_ptr to the next word boundary.
        // NOTE2: unroll*_memcpy and memcpy do not include fencing.
if (wc_mapping && PREFERS_STORE_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) {
gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_to_mapping\n");
unroll8_memcpy(map_d_ptr, h_ptr, size);
} else if (wc_mapping && PREFERS_STORE_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) {
gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_to_mapping\n");
unroll4_memcpy(map_d_ptr, h_ptr, size);
} else if (device_mapping) {
gdr_dbgc(1, "using device-mapping copy for gdr_copy_to_mapping with device mapping\n");
memcpy_to_device_mapping(map_d_ptr, h_ptr, size);
} else {
gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_to_mapping\n");
memcpy(map_d_ptr, h_ptr, size);
}
} while (0);
do_fence:
if (wc_mapping) {
        // Fencing is needed even for plain memcpy(): without it, performance
        // suffers from the delayed flushing of WC buffers.
wc_store_fence();
}
out:
return 0;
}
static int gdr_copy_from_mapping_internal(void *h_ptr, const void *map_d_ptr, size_t size, gdr_mapping_type_t mapping_type)
{
const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC);
const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE);
do {
        // Pick the best-performing implementation compatible with the platform
        // we are running on.
if (has_sse4_1) {
assert(wc_mapping);
gdr_dbgc(1, "using SSE4_1 implementation of gdr_copy_from_mapping\n");
memcpy_uncached_load_sse41(h_ptr, map_d_ptr, size);
break;
}
if (has_avx) {
assert(wc_mapping);
gdr_dbgc(1, "using AVX implementation of gdr_copy_from_mapping\n");
memcpy_cached_store_avx(h_ptr, map_d_ptr, size);
break;
}
if (has_sse) {
assert(wc_mapping);
gdr_dbgc(1, "using SSE implementation of gdr_copy_from_mapping\n");
memcpy_cached_store_sse(h_ptr, map_d_ptr, size);
break;
}
        // On POWER, the compiler/libc memcpy is not optimal for MMIO.
        // 64-bit loads have twice the bandwidth of 32-bit ones.
if (wc_mapping && PREFERS_LOAD_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) {
gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_from_mapping\n");
unroll8_memcpy(h_ptr, map_d_ptr, size);
} else if (wc_mapping && PREFERS_LOAD_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) {
gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_from_mapping\n");
unroll4_memcpy(h_ptr, map_d_ptr, size);
} else if (device_mapping) {
gdr_dbgc(1, "using device-mapping copy for gdr_copy_from_mapping\n");
memcpy_from_device_mapping(h_ptr, map_d_ptr, size);
} else {
gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_from_mapping\n");
memcpy(h_ptr, map_d_ptr, size);
}
// note: fencing is not needed because plain stores are used
// if non-temporal/uncached stores were used on x86, a proper fence would be needed instead
// if (wc_mapping)
// wc_store_fence();
} while (0);
return 0;
}
int gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size)
{
gdr_memh_t *mh = to_memh(handle);
if (unlikely(!gdr_is_mapped(mh->mapping_type))) {
gdr_err("mh is not mapped yet\n");
return EINVAL;
}
if (unlikely(size == 0))
return 0;
return gdr_copy_to_mapping_internal(map_d_ptr, h_ptr, size, mh->mapping_type);
}
int gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size)
{
gdr_memh_t *mh = to_memh(handle);
if (unlikely(!gdr_is_mapped(mh->mapping_type))) {
gdr_err("mh is not mapped yet\n");
return EINVAL;
}
if (unlikely(size == 0))
return 0;
return gdr_copy_from_mapping_internal(h_ptr, map_d_ptr, size, mh->mapping_type);
}
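
/*
 * Copy sketch (illustrative only): writes of 1/2/4/8 bytes take the
 * single-store fast path in gdr_copy_to_mapping_internal, which suits
 * doorbell-style control words. `db_off' is a hypothetical byte offset of a
 * 4-byte doorbell inside the mapping; `mh' and `map' are assumed from context.
 */
#if 0
    uint32_t db_val = 1;
    gdr_copy_to_mapping(mh, (char *)map + db_off, &db_val, sizeof(db_val));
    // On a WC mapping, the wc_store_fence() inside the call flushes the
    // write-combining buffer before the function returns.
#endif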
void gdr_runtime_get_version(int *major, int *minor)
{
*major = GDR_API_MAJOR_VERSION;
*minor = GDR_API_MINOR_VERSION;
}
int gdr_driver_get_version(gdr_t g, int *major, int *minor)
{
assert(g != NULL);
assert(g->fd > 0);
struct GDRDRV_IOC_GET_VERSION_PARAMS params;
int retcode = ioctl(g->fd, GDRDRV_IOC_GET_VERSION, &params);
if (0 != retcode) {
int ret = errno;
gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret);
return ret;
}
*major = params.gdrdrv_version >> MAJOR_VERSION_SHIFT;
*minor = params.gdrdrv_version & MINOR_VERSION_MASK;
return 0;
}
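
/*
 * Version-check sketch (illustrative only): compare the library and driver
 * versions at startup, e.g. to log a mismatch before any pinning fails.
 */
#if 0
    int lib_major, lib_minor, drv_major, drv_minor;
    gdr_runtime_get_version(&lib_major, &lib_minor);
    if (gdr_driver_get_version(g, &drv_major, &drv_minor) == 0)
        printf("libgdrapi %d.%d, gdrdrv %d.%d\n",
               lib_major, lib_minor, drv_major, drv_minor);
#endif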
// ==============================================================================
// Obsoleted API. Provided for compatibility only.
// ==============================================================================
#ifdef gdr_get_info
#undef gdr_get_info
#endif
typedef struct gdr_info_v1 {
uint64_t va;
uint64_t mapped_size;
uint32_t page_size;
    // tm_cycles and cycles_per_ms are deprecated and will be removed in the future.
uint64_t tm_cycles;
uint32_t cycles_per_ms;
unsigned mapped:1;
unsigned wc_mapping:1;
} gdr_info_v1_t;
int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_v1_t *info)
{
int ret = 0;
int retcode;
gdr_memh_t *mh = to_memh(handle);
struct GDRDRV_IOC_GET_INFO_PARAMS params;
params.handle = mh->handle;
retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, &params);
if (0 != retcode) {
ret = errno;
gdr_err("ioctl error (errno=%d)\n", ret);
goto out;
} else {
info->va = params.va;
info->mapped_size = params.mapped_size;
info->page_size = params.page_size;
info->tm_cycles = params.tm_cycles;
info->cycles_per_ms = params.tsc_khz;
info->mapped = params.mapped;
info->wc_mapping = params.wc_mapping;
}
out:
return ret;
}
/*
* Local variables:
* c-indent-level: 4
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/