/* * Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "gdrconfig.h" #include "gdrapi.h" #include "gdrdrv.h" #include "gdrapi_internal.h" // logging/tracing enum gdrcopy_msg_level { GDRCOPY_MSG_DEBUG = 1, GDRCOPY_MSG_INFO, GDRCOPY_MSG_WARN, GDRCOPY_MSG_ERROR }; static int gdr_msg_level = GDRCOPY_MSG_ERROR; static int gdr_enable_logging = -1; static void gdr_msg(enum gdrcopy_msg_level lvl, const char* fmt, ...) { if (-1 == gdr_enable_logging) { const char *env = getenv("GDRCOPY_ENABLE_LOGGING"); if (env) gdr_enable_logging = 1; else gdr_enable_logging = 0; env = getenv("GDRCOPY_LOG_LEVEL"); if (env) gdr_msg_level = atoi(env); } if (gdr_enable_logging) { if (lvl >= gdr_msg_level) { va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); } } } #define gdr_dbg(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_DEBUG, "DBG: " FMT, ## ARGS) #define gdr_dbgc(C, FMT, ARGS...) do { static int gdr_dbg_cnt=(C); if (gdr_dbg_cnt) { gdr_dbg(FMT, ## ARGS); --gdr_dbg_cnt; }} while (0) #define gdr_info(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_INFO, "INFO: " FMT, ## ARGS) #define gdr_warn(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_WARN, "WARN: " FMT, ## ARGS) #define gdr_err(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_ERROR, "ERR: " FMT, ## ARGS) static gdr_memh_t *to_memh(gdr_mh_t mh) { return (gdr_memh_t *)mh.h; } static gdr_mh_t from_memh(gdr_memh_t *memh) { gdr_mh_t mh; mh.h = (unsigned long)memh; return mh; } static void gdr_init_cpu_flags(void); static inline int gdr_is_mapped(const gdr_mapping_type_t mapping_type) { return mapping_type != GDR_MAPPING_TYPE_NONE; } gdr_t gdr_open(void) { gdr_t g = NULL; const char *gdrinode = "/dev/gdrdrv"; int ret; g = calloc(1, sizeof(*g)); if (!g) { gdr_err("error while allocating memory\n"); return NULL; } int fd = open(gdrinode, O_RDWR | O_CLOEXEC); if (-1 == fd ) { ret = errno; gdr_err("error opening driver (errno=%d/%s)\n", ret, strerror(ret)); goto err_mem; } struct GDRDRV_IOC_GET_VERSION_PARAMS params; int retcode = ioctl(fd, GDRDRV_IOC_GET_VERSION, ¶ms); if (0 != retcode) { ret = errno; gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret); goto err_fd; } if (params.gdrdrv_version < MINIMUM_GDRDRV_VERSION) { gdr_err( "The minimum required gdrdrv driver version is %d.%d but the current gdrdrv version is %d.%d\n", MINIMUM_GDRDRV_MAJOR_VERSION, MINIMUM_GDRDRV_MINOR_VERSION, params.gdrdrv_version >> MAJOR_VERSION_SHIFT, params.gdrdrv_version & MINOR_VERSION_MASK ); goto err_fd; } if (params.minimum_gdr_api_version > GDR_API_VERSION) { gdr_err( "gdrdrv driver requires libgdrapi version %d.%d or above but the current libgdrapi version is %d.%d\n", params.minimum_gdr_api_version >> MAJOR_VERSION_SHIFT, params.minimum_gdr_api_version & MINOR_VERSION_MASK, GDR_API_MAJOR_VERSION, GDR_API_MINOR_VERSION ); goto err_fd; } g->fd = fd; LIST_INIT(&g->memhs); gdr_init_cpu_flags(); // Initialize page_shift, page_size, and page_mask. g->page_size = sysconf(_SC_PAGESIZE); g->page_mask = ~(g->page_size - 1); size_t ps_tmp = g->page_size; g->page_shift = -1; while (ps_tmp > 0) { ++g->page_shift; if ((ps_tmp & 0x1) == 1) break; ps_tmp >>= 1; } g->gdrdrv_version = params.gdrdrv_version; return g; err_fd: close(fd); err_mem: free(g); return NULL; } int gdr_close(gdr_t g) { int ret = 0; int retcode; gdr_memh_t *mh, *next_mh; mh = g->memhs.lh_first; while (mh != NULL) { // gdr_unpin_buffer frees mh, so we need to get the next one // beforehand. next_mh = mh->entries.le_next; ret = gdr_unpin_buffer(g, from_memh(mh)); if (ret) { gdr_err("error unpinning buffer inside gdr_close (errno=%d/%s)\n", ret, strerror(ret)); return ret; } mh = next_mh; } retcode = close(g->fd); if (-1 == retcode) { ret = errno; gdr_err("error closing driver (errno=%d/%s)\n", ret, strerror(ret)); } g->fd = 0; free(g); return ret; } int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { int ret = 0; int retcode; if (!handle) { return EINVAL; } gdr_memh_t *mh = calloc(1, sizeof(gdr_memh_t)); if (!mh) { return ENOMEM; } struct GDRDRV_IOC_PIN_BUFFER_PARAMS params; params.addr = addr; params.size = size; params.p2p_token = p2p_token; params.va_space = va_space; params.handle = 0; retcode = ioctl(g->fd, GDRDRV_IOC_PIN_BUFFER, ¶ms); if (0 != retcode) { ret = errno; gdr_err("ioctl error (errno=%d)\n", ret); free(mh); goto err; } mh->handle = params.handle; LIST_INSERT_HEAD(&g->memhs, mh, entries); *handle = from_memh(mh); err: return ret; } int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { int ret = 0; int retcode; gdr_memh_t *mh = to_memh(handle); struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS params; params.handle = mh->handle; retcode = ioctl(g->fd, GDRDRV_IOC_UNPIN_BUFFER, ¶ms); if (0 != retcode) { ret = errno; gdr_err("ioctl error (errno=%d)\n", ret); } LIST_REMOVE(mh, entries); free(mh); return ret; } int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag) { int ret = 0; int retcode; gdr_memh_t *mh = to_memh(handle); struct GDRDRV_IOC_GET_CB_FLAG_PARAMS params; params.handle = mh->handle; retcode = ioctl(g->fd, GDRDRV_IOC_GET_CB_FLAG, ¶ms); if (0 != retcode) { ret = errno; gdr_err("ioctl error (errno=%d)\n", ret); } else { *flag = params.flag; } return ret; } int gdr_get_info_v2(gdr_t g, gdr_mh_t handle, gdr_info_v2_t *info) { int ret = 0; int retcode; gdr_memh_t *mh = to_memh(handle); if (g->gdrdrv_version >= GDRDRV_MINIMUM_VERSION_WITH_GET_INFO_V2) { struct GDRDRV_IOC_GET_INFO_V2_PARAMS params; params.handle = mh->handle; retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO_V2, ¶ms); if (0 != retcode) { ret = errno; gdr_err("ioctl error (errno=%d)\n", ret); goto out; } else { info->va = params.va; info->mapped_size = params.mapped_size; info->page_size = params.page_size; info->tm_cycles = params.tm_cycles; info->cycles_per_ms = params.tsc_khz; info->mapped = gdr_is_mapped(params.mapping_type); info->wc_mapping = (params.mapping_type == GDR_MAPPING_TYPE_WC); info->mapping_type = params.mapping_type; } } else { struct GDRDRV_IOC_GET_INFO_PARAMS params; params.handle = mh->handle; retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, ¶ms); if (0 != retcode) { ret = errno; gdr_err("ioctl error (errno=%d)\n", ret); goto out; } else { info->va = params.va; info->mapped_size = params.mapped_size; info->page_size = params.page_size; info->tm_cycles = params.tm_cycles; info->cycles_per_ms = params.tsc_khz; info->mapped = params.mapped; info->wc_mapping = params.wc_mapping; info->mapping_type = params.mapped ? (params.wc_mapping ? GDR_MAPPING_TYPE_WC : GDR_MAPPING_TYPE_CACHING) : GDR_MAPPING_TYPE_NONE; } } out: return ret; } int gdr_map(gdr_t g, gdr_mh_t handle, void **ptr_va, size_t size) { int ret = 0; gdr_info_v2_t info = {0,}; gdr_memh_t *mh = to_memh(handle); if (gdr_is_mapped(mh->mapping_type)) { gdr_err("mh is mapped already\n"); return EAGAIN; } size_t rounded_size = (size + g->page_size - 1) & g->page_mask; off_t magic_off = (off_t)mh->handle << g->page_shift; void *mmio = mmap(NULL, rounded_size, PROT_READ|PROT_WRITE, MAP_SHARED, g->fd, magic_off); if (mmio == MAP_FAILED) { int __errno = errno; mmio = NULL; gdr_err("error %s(%d) while mapping handle %x, rounded_size=%zu offset=%llx\n", strerror(__errno), __errno, handle, rounded_size, (long long unsigned)magic_off); ret = __errno; goto err; } *ptr_va = mmio; ret = gdr_get_info_v2(g, handle, &info); if (ret) { gdr_err("error %d from get_info, munmapping before exiting\n", ret); munmap(mmio, rounded_size); goto err; } if (!gdr_is_mapped(info.mapping_type)) { // Race could cause this issue. // E.g., gdr_map and cuMemFree are triggered concurrently. // The above mmap is successful but cuMemFree causes unmapping immediately. gdr_err("mh is not mapped\n"); ret = EAGAIN; } mh->mapping_type = info.mapping_type; gdr_dbg("mapping_type=%d\n", mh->mapping_type); err: return ret; } int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { int ret = 0; int retcode = 0; size_t rounded_size; gdr_memh_t *mh = to_memh(handle); rounded_size = (size + g->page_size - 1) & g->page_mask; if (!gdr_is_mapped(mh->mapping_type)) { gdr_err("mh is not mapped yet\n"); return EINVAL; } retcode = munmap(va, rounded_size); if (-1 == retcode) { int __errno = errno; gdr_err("error %s(%d) while unmapping handle %x, rounded_size=%zu\n", strerror(__errno), __errno, handle, rounded_size); ret = __errno; goto err; } mh->mapping_type = GDR_MAPPING_TYPE_NONE; err: return ret; } #ifdef GDRAPI_X86 #include // prepare for AVX2 implementation #ifndef bit_AVX2 /* Extended Features (%eax == 7) */ /* %ebx */ #define bit_AVX2 (1 << 5) #endif #include extern int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes); extern int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes); extern int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes); extern int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes); extern int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes); static inline void wc_store_fence(void) { _mm_sfence(); } #define PREFERS_STORE_UNROLL4 0 #define PREFERS_STORE_UNROLL8 0 #define PREFERS_LOAD_UNROLL4 0 #define PREFERS_LOAD_UNROLL8 0 // GDRAPI_X86 #elif defined(GDRAPI_POWER) static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; } static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; } static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; } static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; } static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; } static inline void wc_store_fence(void) { asm volatile("sync") ; } #define PREFERS_STORE_UNROLL4 1 #define PREFERS_STORE_UNROLL8 0 #define PREFERS_LOAD_UNROLL4 0 #define PREFERS_LOAD_UNROLL8 1 // GDRAPI_POWER #elif defined(GDRAPI_ARM64) static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; } static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes) { return 1; } static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; } static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes) { return 1; } static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; } static inline void wc_store_fence(void) { asm volatile("DMB ishld") ; } #define PREFERS_STORE_UNROLL4 0 #define PREFERS_STORE_UNROLL8 0 #define PREFERS_LOAD_UNROLL4 0 #define PREFERS_LOAD_UNROLL8 0 // GDRAPI_ARM64 #endif static int has_sse = 0; static int has_sse2 = 0; static int has_sse4_1 = 0; static int has_avx = 0; static int has_avx2 = 0; static void gdr_init_cpu_flags(void) { #ifdef GDRAPI_X86 unsigned int info_type = 0x00000001; unsigned int ax, bx, cx, dx; if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) { has_sse4_1 = ((cx & bit_SSE4_1) != 0); has_avx = ((cx & bit_AVX) != 0); has_sse = ((dx & bit_SSE) != 0); has_sse2 = ((dx & bit_SSE2) != 0); gdr_dbg("sse4_1=%d avx=%d sse=%d sse2=%d\n", has_sse4_1, has_avx, has_sse, has_sse2); } #ifdef bit_AVX2 info_type = 0x7; if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) { has_avx2 = bx & bit_AVX2; } #endif // bit_AVX2 #endif // GDRAPI_X86 #ifdef GDRAPI_POWER // detect and enable Altivec/SMX support #endif } // note: more than one implementation may be compiled in static void unroll8_memcpy(void *dst, const void *src, size_t size) { const uint64_t *r = (const uint64_t *)src; uint64_t *w = (uint64_t *)dst; size_t nw = size / sizeof(*r); assert(size % sizeof(*r) == 0); while (nw) { if (0 == (nw & 3)) { uint64_t r0 = r[0]; uint64_t r1 = r[1]; uint64_t r2 = r[2]; uint64_t r3 = r[3]; w[0] = r0; w[1] = r1; w[2] = r2; w[3] = r3; r += 4; w += 4; nw -= 4; } else if (0 == (nw & 1)) { uint64_t r0 = r[0]; uint64_t r1 = r[1]; w[0] = r0; w[1] = r1; r += 2; w += 2; nw -= 2; } else { w[0] = r[0]; ++w; ++r; --nw; } } } static void unroll4_memcpy(void *dst, const void *src, size_t size) { const uint32_t *r = (const uint32_t *)src; uint32_t *w = (uint32_t *)dst; size_t nw = size / sizeof(*r); assert(size % sizeof(*r) == 0); while (nw) { if (0 == (nw & 3)) { uint32_t r0 = r[0]; uint32_t r1 = r[1]; uint32_t r2 = r[2]; uint32_t r3 = r[3]; w[0] = r0; w[1] = r1; w[2] = r2; w[3] = r3; r += 4; w += 4; nw -= 4; } else if (0 == (nw & 1)) { uint32_t r0 = r[0]; uint32_t r1 = r[1]; w[0] = r0; w[1] = r1; r += 2; w += 2; nw -= 2; } else { w[0] = r[0]; ++w; ++r; --nw; } } } static inline int is_aligned(unsigned long value, unsigned powof2) { return ((value & (powof2-1)) == 0); } static inline int ptr_is_aligned(const void *ptr, unsigned powof2) { unsigned long addr = (unsigned long)ptr; return is_aligned(addr, powof2); } static inline void memcpy_to_device_mapping(void *dst, const void *src, size_t size) { size_t remaining_size = size; void *curr_map_d_ptr = dst; const void *curr_h_ptr = src; size_t copy_size = 0; while (remaining_size > 0) { if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) { // We have proper alignment. memcpy can be used here. Although // unlikely, this might break in the future if the implementation // of memcpy changes to generate unaligned access. Still, we choose // memcpy because it provides better performance than our simple // aligned-access workaround. memcpy(curr_map_d_ptr, curr_h_ptr, remaining_size); copy_size = remaining_size; } else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) { // memcpy cannot be used here because its internal // implementation may end up in an unaligned access. WRITE_ONCE(*(uint64_t *)curr_map_d_ptr, *(uint64_t *)curr_h_ptr); copy_size = sizeof(uint64_t); } else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) { WRITE_ONCE(*(uint32_t *)curr_map_d_ptr, *(uint32_t *)curr_h_ptr); copy_size = sizeof(uint32_t); } else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) { WRITE_ONCE(*(uint16_t *)curr_map_d_ptr, *(uint16_t *)curr_h_ptr); copy_size = sizeof(uint16_t); } else { WRITE_ONCE(*(uint8_t *)curr_map_d_ptr, *(uint8_t *)curr_h_ptr); copy_size = sizeof(uint8_t); } remaining_size -= copy_size; curr_map_d_ptr = (void *)((uintptr_t)curr_map_d_ptr + copy_size); curr_h_ptr = (const void *)((uintptr_t)curr_h_ptr + copy_size); } } static inline void memcpy_from_device_mapping(void *dst, const void *src, size_t size) { size_t remaining_size = size; const void *curr_map_d_ptr = src; void *curr_h_ptr = dst; size_t copy_size = 0; while (remaining_size > 0) { if (is_aligned(remaining_size, sizeof(uint64_t)) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t)) && ptr_is_aligned(curr_h_ptr, sizeof(uint64_t))) { // We have proper alignment. memcpy can be used here. Although // unlikely, this might break in the future if the implementation // of memcpy changes to generate unaligned access. Still, we choose // memcpy because it provides better performance than our simple // aligned-access workaround. memcpy(curr_h_ptr, curr_map_d_ptr, remaining_size); copy_size = remaining_size; } else if (remaining_size >= sizeof(uint64_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint64_t))) { // memcpy cannot be used here because its internal // implementation may end up in an unaligned access. *(uint64_t *)curr_h_ptr = READ_ONCE(*(uint64_t *)curr_map_d_ptr); copy_size = sizeof(uint64_t); } else if (remaining_size >= sizeof(uint32_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint32_t))) { *(uint32_t *)curr_h_ptr = READ_ONCE(*(uint32_t *)curr_map_d_ptr); copy_size = sizeof(uint32_t); } else if (remaining_size >= sizeof(uint16_t) && ptr_is_aligned(curr_map_d_ptr, sizeof(uint16_t))) { *(uint16_t *)curr_h_ptr = READ_ONCE(*(uint16_t *)curr_map_d_ptr); copy_size = sizeof(uint16_t); } else { *(uint8_t *)curr_h_ptr = READ_ONCE(*(uint8_t *)curr_map_d_ptr); copy_size = sizeof(uint8_t); } remaining_size -= copy_size; curr_map_d_ptr = (const void *)((uintptr_t)curr_map_d_ptr + copy_size); curr_h_ptr = (void *)((uintptr_t)curr_h_ptr + copy_size); } } static int gdr_copy_to_mapping_internal(void *map_d_ptr, const void *h_ptr, size_t size, gdr_mapping_type_t mapping_type) { const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC); const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE); do { // For very small sizes and aligned pointers, we use simple store. if (size == sizeof(uint8_t)) { WRITE_ONCE(*(uint8_t *)map_d_ptr, *(uint8_t *)h_ptr); goto do_fence; } else if (size == sizeof(uint16_t) && ptr_is_aligned(map_d_ptr, sizeof(uint16_t))) { WRITE_ONCE(*(uint16_t *)map_d_ptr, *(uint16_t *)h_ptr); goto do_fence; } else if (size == sizeof(uint32_t) && ptr_is_aligned(map_d_ptr, sizeof(uint32_t))) { WRITE_ONCE(*(uint32_t *)map_d_ptr, *(uint32_t *)h_ptr); goto do_fence; } else if (size == sizeof(uint64_t) && ptr_is_aligned(map_d_ptr, sizeof(uint64_t))) { WRITE_ONCE(*(uint64_t *)map_d_ptr, *(uint64_t *)h_ptr); goto do_fence; } // pick the most performing implementation compatible with the platform we are running on // NOTE: write fences are included in functions below if (has_avx) { assert(wc_mapping); gdr_dbgc(1, "using AVX implementation of gdr_copy_to_mapping\n"); memcpy_uncached_store_avx(map_d_ptr, h_ptr, size); goto out; } if (has_sse) { assert(wc_mapping); gdr_dbgc(1, "using SSE implementation of gdr_copy_to_mapping\n"); memcpy_uncached_store_sse(map_d_ptr, h_ptr, size); goto out; } // on POWER, compiler/libc memcpy is not optimal for MMIO // 64bit stores are not better than 32bit ones, so we prefer the latter. // NOTE: if preferred but not aligned, a better implementation would still try to // use byte sized stores to align map_d_ptr and h_ptr to next word. // NOTE2: unroll*_memcpy and memcpy do not include fencing. if (wc_mapping && PREFERS_STORE_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) { gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_to_mapping\n"); unroll8_memcpy(map_d_ptr, h_ptr, size); } else if (wc_mapping && PREFERS_STORE_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) { gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_to_mapping\n"); unroll4_memcpy(map_d_ptr, h_ptr, size); } else if (device_mapping) { gdr_dbgc(1, "using device-mapping copy for gdr_copy_to_mapping with device mapping\n"); memcpy_to_device_mapping(map_d_ptr, h_ptr, size); } else { gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_to_mapping\n"); memcpy(map_d_ptr, h_ptr, size); } } while (0); do_fence: if (wc_mapping) { // fencing is needed even for plain memcpy(), due to performance // being hit by delayed flushing of WC buffers wc_store_fence(); } out: return 0; } static int gdr_copy_from_mapping_internal(void *h_ptr, const void *map_d_ptr, size_t size, gdr_mapping_type_t mapping_type) { const int wc_mapping = (mapping_type == GDR_MAPPING_TYPE_WC); const int device_mapping = (mapping_type == GDR_MAPPING_TYPE_DEVICE); do { // pick the most performing implementation compatible with the platform we are running on if (has_sse4_1) { assert(wc_mapping); gdr_dbgc(1, "using SSE4_1 implementation of gdr_copy_from_mapping\n"); memcpy_uncached_load_sse41(h_ptr, map_d_ptr, size); break; } if (has_avx) { assert(wc_mapping); gdr_dbgc(1, "using AVX implementation of gdr_copy_from_mapping\n"); memcpy_cached_store_avx(h_ptr, map_d_ptr, size); break; } if (has_sse) { assert(wc_mapping); gdr_dbgc(1, "using SSE implementation of gdr_copy_from_mapping\n"); memcpy_cached_store_sse(h_ptr, map_d_ptr, size); break; } // on POWER, compiler memcpy is not optimal for MMIO // 64bit loads have 2x the BW of 32bit ones if (wc_mapping && PREFERS_LOAD_UNROLL8 && is_aligned(size, 8) && ptr_is_aligned(map_d_ptr, 8) && ptr_is_aligned(h_ptr, 8)) { gdr_dbgc(1, "using unroll8_memcpy for gdr_copy_from_mapping\n"); unroll8_memcpy(h_ptr, map_d_ptr, size); } else if (wc_mapping && PREFERS_LOAD_UNROLL4 && is_aligned(size, 4) && ptr_is_aligned(map_d_ptr, 4) && ptr_is_aligned(h_ptr, 4)) { gdr_dbgc(1, "using unroll4_memcpy for gdr_copy_from_mapping\n"); unroll4_memcpy(h_ptr, map_d_ptr, size); } else if (device_mapping) { gdr_dbgc(1, "using device-mapping copy for gdr_copy_from_mapping\n"); memcpy_from_device_mapping(h_ptr, map_d_ptr, size); } else { gdr_dbgc(1, "fallback to compiler/libc memcpy implementation of gdr_copy_from_mapping\n"); memcpy(h_ptr, map_d_ptr, size); } // note: fencing is not needed because plain stores are used // if non-temporal/uncached stores were used on x86, a proper fence would be needed instead // if (wc_mapping) // wc_store_fence(); } while (0); return 0; } int gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) { gdr_memh_t *mh = to_memh(handle); if (unlikely(!gdr_is_mapped(mh->mapping_type))) { gdr_err("mh is not mapped yet\n"); return EINVAL; } if (unlikely(size == 0)) return 0; return gdr_copy_to_mapping_internal(map_d_ptr, h_ptr, size, mh->mapping_type); } int gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) { gdr_memh_t *mh = to_memh(handle); if (unlikely(!gdr_is_mapped(mh->mapping_type))) { gdr_err("mh is not mapped yet\n"); return EINVAL; } if (unlikely(size == 0)) return 0; return gdr_copy_from_mapping_internal(h_ptr, map_d_ptr, size, mh->mapping_type); } void gdr_runtime_get_version(int *major, int *minor) { *major = GDR_API_MAJOR_VERSION; *minor = GDR_API_MINOR_VERSION; } int gdr_driver_get_version(gdr_t g, int *major, int *minor) { assert(g != NULL); assert(g->fd > 0); struct GDRDRV_IOC_GET_VERSION_PARAMS params; int retcode = ioctl(g->fd, GDRDRV_IOC_GET_VERSION, ¶ms); if (0 != retcode) { int ret = errno; gdr_err("Error getting the gdrdrv driver version with ioctl error (errno=%d). gdrdrv might be too old.\n", ret); return ret; } *major = params.gdrdrv_version >> MAJOR_VERSION_SHIFT; *minor = params.gdrdrv_version & MINOR_VERSION_MASK; return 0; } // ============================================================================== // Obsoleted API. Provided for compatibility only. // ============================================================================== #ifdef gdr_get_info #undef gdr_get_info #endif typedef struct gdr_info_v1 { uint64_t va; uint64_t mapped_size; uint32_t page_size; // tm_cycles and cycles_per_ms are deprecated and will be removed in future. uint64_t tm_cycles; uint32_t cycles_per_ms; unsigned mapped:1; unsigned wc_mapping:1; } gdr_info_v1_t; int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_v1_t *info) { int ret = 0; int retcode; gdr_memh_t *mh = to_memh(handle); struct GDRDRV_IOC_GET_INFO_PARAMS params; params.handle = mh->handle; retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, ¶ms); if (0 != retcode) { ret = errno; gdr_err("ioctl error (errno=%d)\n", ret); goto out; } else { info->va = params.va; info->mapped_size = params.mapped_size; info->page_size = params.page_size; info->tm_cycles = params.tm_cycles; info->cycles_per_ms = params.tsc_khz; info->mapped = params.mapped; info->wc_mapping = params.wc_mapping; } out: return ret; } /* * Local variables: * c-indent-level: 4 * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */