X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_bufmgr.c;h=17036b53bcdad8aeb97b148a013fb02e1726b469;hb=1617fca6d12e418e02d18733dd0d1964c7ecbda9;hp=2e55363ece0450a5105c54710158dba33197b5c4;hpb=f053ee78ed2415a91d2960da50ea7c2ff9eddaa5;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c index 2e55363ece0..17036b53bcd 100644 --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c @@ -1,35 +1,32 @@ -/************************************************************************** - * +/* * Copyright © 2007 Red Hat Inc. - * Copyright © 2007-2012 Intel Corporation - * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * Copyright © 2007-2017 Intel Corporation + * Copyright © 2006 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * - **************************************************************************/ + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + /* - * Authors: Thomas Hellström - * Keith Whitwell + * Authors: Thomas Hellström + * Keith Whitwell * Eric Anholt * Dave Airlie */ @@ -46,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -56,6 +52,7 @@ #ifndef ETIME #define ETIME ETIMEDOUT #endif +#include "common/gen_clflush.h" #include "common/gen_debug.h" #include "common/gen_device_info.h" #include "libdrm_macros.h" @@ -64,6 +61,7 @@ #include "util/hash_table.h" #include "util/list.h" #include "brw_bufmgr.h" +#include "brw_context.h" #include "string.h" #include "i915_drm.h" @@ -76,6 +74,16 @@ #define VG(x) #endif +/* VALGRIND_FREELIKE_BLOCK unfortunately does not actually undo the earlier + * VALGRIND_MALLOCLIKE_BLOCK but instead leaves vg convinced the memory is + * leaked. All because it does not call VG(cli_free) from its + * VG_USERREQ__FREELIKE_BLOCK handler. 
Instead of treating the memory like + * an allocation, we mark it available for use upon mmapping and remove + * it upon unmapping. + */ +#define VG_DEFINED(ptr, size) VG(VALGRIND_MAKE_MEM_DEFINED(ptr, size)) +#define VG_NOACCESS(ptr, size) VG(VALGRIND_MAKE_MEM_NOACCESS(ptr, size)) + #define memclear(s) memset(&s, 0, sizeof(s)) #define FILE_DEBUG_FLAG DEBUG_BUFMGR @@ -92,13 +100,13 @@ atomic_add_unless(int *v, int add, int unless) struct bo_cache_bucket { struct list_head head; - unsigned long size; + uint64_t size; }; struct brw_bufmgr { int fd; - pthread_mutex_t lock; + mtx_t lock; /** Array of lists of cached gem objects of power-of-two sizes */ struct bo_cache_bucket cache_bucket[14 * 4]; @@ -108,8 +116,9 @@ struct brw_bufmgr { struct hash_table *name_table; struct hash_table *handle_table; - unsigned int has_llc:1; - unsigned int bo_reuse:1; + bool has_llc:1; + bool has_mmap_wc:1; + bool bo_reuse:1; }; static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode, @@ -136,11 +145,10 @@ hash_find_bo(struct hash_table *ht, unsigned int key) return entry ? (struct brw_bo *) entry->data : NULL; } -static unsigned long -bo_tile_size(struct brw_bufmgr *bufmgr, unsigned long size, - uint32_t *tiling_mode) +static uint64_t +bo_tile_size(struct brw_bufmgr *bufmgr, uint64_t size, uint32_t tiling) { - if (*tiling_mode == I915_TILING_NONE) + if (tiling == I915_TILING_NONE) return size; /* 965+ just need multiples of page size for tiling */ @@ -152,19 +160,18 @@ bo_tile_size(struct brw_bufmgr *bufmgr, unsigned long size, * given chip. We use 512 as the minimum to allow for a later tiling * change. */ -static unsigned long -bo_tile_pitch(struct brw_bufmgr *bufmgr, - unsigned long pitch, uint32_t *tiling_mode) +static uint32_t +bo_tile_pitch(struct brw_bufmgr *bufmgr, uint32_t pitch, uint32_t tiling) { unsigned long tile_width; /* If untiled, then just align it so that we can do rendering * to it with the 3D engine. 
*/ - if (*tiling_mode == I915_TILING_NONE) + if (tiling == I915_TILING_NONE) return ALIGN(pitch, 64); - if (*tiling_mode == I915_TILING_X) + if (tiling == I915_TILING_X) tile_width = 512; else tile_width = 128; @@ -174,7 +181,7 @@ bo_tile_pitch(struct brw_bufmgr *bufmgr, } static struct bo_cache_bucket * -bucket_for_size(struct brw_bufmgr *bufmgr, unsigned long size) +bucket_for_size(struct brw_bufmgr *bufmgr, uint64_t size) { int i; @@ -188,12 +195,6 @@ bucket_for_size(struct brw_bufmgr *bufmgr, unsigned long size) return NULL; } -inline void -brw_bo_reference(struct brw_bo *bo) -{ - p_atomic_inc(&bo->refcount); -} - int brw_bo_busy(struct brw_bo *bo) { @@ -208,10 +209,8 @@ brw_bo_busy(struct brw_bo *bo) if (ret == 0) { bo->idle = !busy.busy; return busy.busy; - } else { - return false; } - return (ret == 0 && busy.busy); + return false; } int @@ -245,21 +244,30 @@ brw_bo_cache_purge_bucket(struct brw_bufmgr *bufmgr, static struct brw_bo * bo_alloc_internal(struct brw_bufmgr *bufmgr, const char *name, - unsigned long size, - unsigned long flags, + uint64_t size, + unsigned flags, uint32_t tiling_mode, - unsigned long stride, unsigned int alignment) + uint32_t stride, uint64_t alignment) { struct brw_bo *bo; unsigned int page_size = getpagesize(); int ret; struct bo_cache_bucket *bucket; bool alloc_from_cache; - unsigned long bo_size; - bool for_render = false; + uint64_t bo_size; + bool busy = false; + bool zeroed = false; + + if (flags & BO_ALLOC_BUSY) + busy = true; - if (flags & BO_ALLOC_FOR_RENDER) - for_render = true; + if (flags & BO_ALLOC_ZEROED) + zeroed = true; + + /* BUSY doesn't really jive with ZEROED as we have to wait for it to + * be idle before we can memset. Just disallow that combination. + */ + assert(!(busy && zeroed)); /* Round the allocated size up to a power of two number of pages. 
*/ bucket = bucket_for_size(bufmgr, size); @@ -275,15 +283,17 @@ bo_alloc_internal(struct brw_bufmgr *bufmgr, bo_size = bucket->size; } - pthread_mutex_lock(&bufmgr->lock); + mtx_lock(&bufmgr->lock); /* Get a buffer out of the cache if available */ retry: alloc_from_cache = false; if (bucket != NULL && !list_empty(&bucket->head)) { - if (for_render) { + if (busy && !zeroed) { /* Allocate new render-target BOs from the tail (MRU) * of the list, as it will likely be hot in the GPU - * cache and in the aperture for us. + * cache and in the aperture for us. If the caller + * asked us to zero the buffer, we don't want this + * because we are going to mmap it. */ bo = LIST_ENTRY(struct brw_bo, bucket->head.prev, head); list_del(&bo->head); @@ -316,6 +326,15 @@ retry: bo_free(bo); goto retry; } + + if (zeroed) { + void *map = brw_bo_map(NULL, bo, MAP_WRITE | MAP_RAW); + if (!map) { + bo_free(bo); + goto retry; + } + memset(map, 0, bo_size); + } } } @@ -327,10 +346,14 @@ retry: goto err; bo->size = bo_size; + bo->idle = true; memclear(create); create.size = bo_size; + /* All new BOs we get from the kernel are zeroed, so we don't need to + * worry about that here. + */ ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CREATE, &create); if (ret != 0) { free(bo); @@ -338,7 +361,6 @@ retry: } bo->gem_handle = create.handle; - _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); bo->bufmgr = bufmgr; bo->align = alignment; @@ -349,70 +371,89 @@ retry: if (bo_set_tiling_internal(bo, tiling_mode, stride)) goto err_free; + + /* Calling set_domain() will allocate pages for the BO outside of the + * struct mutex lock in the kernel, which is more efficient than waiting + * to create them during the first execbuf that uses the BO. 
+ */ + struct drm_i915_gem_set_domain sd = { + .handle = bo->gem_handle, + .read_domains = I915_GEM_DOMAIN_CPU, + .write_domain = 0, + }; + + if (drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) + goto err_free; } bo->name = name; p_atomic_set(&bo->refcount, 1); bo->reusable = true; + bo->cache_coherent = bufmgr->has_llc; + bo->index = -1; - pthread_mutex_unlock(&bufmgr->lock); + mtx_unlock(&bufmgr->lock); - DBG("bo_create: buf %d (%s) %ldb\n", bo->gem_handle, bo->name, size); + DBG("bo_create: buf %d (%s) %llub\n", bo->gem_handle, bo->name, + (unsigned long long) size); return bo; err_free: bo_free(bo); err: - pthread_mutex_unlock(&bufmgr->lock); + mtx_unlock(&bufmgr->lock); return NULL; } struct brw_bo * brw_bo_alloc(struct brw_bufmgr *bufmgr, - const char *name, unsigned long size, unsigned int alignment) + const char *name, uint64_t size, uint64_t alignment) { return bo_alloc_internal(bufmgr, name, size, 0, I915_TILING_NONE, 0, 0); } struct brw_bo * brw_bo_alloc_tiled(struct brw_bufmgr *bufmgr, const char *name, - int x, int y, int cpp, uint32_t *tiling_mode, - unsigned long *pitch, unsigned long flags) + uint64_t size, uint32_t tiling_mode, uint32_t pitch, + unsigned flags) { - unsigned long size, stride; - uint32_t tiling; - - do { - unsigned long aligned_y, height_alignment; + return bo_alloc_internal(bufmgr, name, size, flags, tiling_mode, pitch, 0); +} - tiling = *tiling_mode; +struct brw_bo * +brw_bo_alloc_tiled_2d(struct brw_bufmgr *bufmgr, const char *name, + int x, int y, int cpp, uint32_t tiling, + uint32_t *pitch, unsigned flags) +{ + uint64_t size; + uint32_t stride; + unsigned long aligned_y, height_alignment; - /* If we're tiled, our allocations are in 8 or 32-row blocks, - * so failure to align our height means that we won't allocate - * enough pages. 
- * - * If we're untiled, we still have to align to 2 rows high - * because the data port accesses 2x2 blocks even if the - * bottom row isn't to be rendered, so failure to align means - * we could walk off the end of the GTT and fault. This is - * documented on 965, and may be the case on older chipsets - * too so we try to be careful. - */ - aligned_y = y; - height_alignment = 2; - - if (tiling == I915_TILING_X) - height_alignment = 8; - else if (tiling == I915_TILING_Y) - height_alignment = 32; - aligned_y = ALIGN(y, height_alignment); - - stride = x * cpp; - stride = bo_tile_pitch(bufmgr, stride, tiling_mode); - size = stride * aligned_y; - size = bo_tile_size(bufmgr, size, tiling_mode); - } while (*tiling_mode != tiling); + /* If we're tiled, our allocations are in 8 or 32-row blocks, + * so failure to align our height means that we won't allocate + * enough pages. + * + * If we're untiled, we still have to align to 2 rows high + * because the data port accesses 2x2 blocks even if the + * bottom row isn't to be rendered, so failure to align means + * we could walk off the end of the GTT and fault. This is + * documented on 965, and may be the case on older chipsets + * too so we try to be careful. + */ + aligned_y = y; + height_alignment = 2; + + if (tiling == I915_TILING_X) + height_alignment = 8; + else if (tiling == I915_TILING_Y) + height_alignment = 32; + aligned_y = ALIGN(y, height_alignment); + + stride = x * cpp; + stride = bo_tile_pitch(bufmgr, stride, tiling); + size = stride * aligned_y; + size = bo_tile_size(bufmgr, size, tiling); *pitch = stride; if (tiling == I915_TILING_NONE) @@ -442,7 +483,7 @@ brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr, * alternating names for the front/back buffer a linear search * provides a sufficiently fast match. 
*/ - pthread_mutex_lock(&bufmgr->lock); + mtx_lock(&bufmgr->lock); bo = hash_find_bo(bufmgr->name_table, handle); if (bo) { brw_bo_reference(bo); @@ -475,13 +516,13 @@ brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr, p_atomic_set(&bo->refcount, 1); bo->size = open_arg.size; - bo->offset64 = 0; - bo->virtual = NULL; + bo->gtt_offset = 0; bo->bufmgr = bufmgr; bo->gem_handle = open_arg.handle; bo->name = name; bo->global_name = handle; bo->reusable = false; + bo->external = true; _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo); @@ -498,12 +539,12 @@ brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr, DBG("bo_create_from_handle: %d (%s)\n", handle, bo->name); out: - pthread_mutex_unlock(&bufmgr->lock); + mtx_unlock(&bufmgr->lock); return bo; err_unref: bo_free(bo); - pthread_mutex_unlock(&bufmgr->lock); + mtx_unlock(&bufmgr->lock); return NULL; } @@ -512,27 +553,32 @@ bo_free(struct brw_bo *bo) { struct brw_bufmgr *bufmgr = bo->bufmgr; struct drm_gem_close close; - struct hash_entry *entry; int ret; - if (bo->mem_virtual) { - VG(VALGRIND_FREELIKE_BLOCK(bo->mem_virtual, 0)); - drm_munmap(bo->mem_virtual, bo->size); + if (bo->map_cpu) { + VG_NOACCESS(bo->map_cpu, bo->size); + drm_munmap(bo->map_cpu, bo->size); } - if (bo->wc_virtual) { - VG(VALGRIND_FREELIKE_BLOCK(bo->wc_virtual, 0)); - drm_munmap(bo->wc_virtual, bo->size); + if (bo->map_wc) { + VG_NOACCESS(bo->map_wc, bo->size); + drm_munmap(bo->map_wc, bo->size); } - if (bo->gtt_virtual) { - drm_munmap(bo->gtt_virtual, bo->size); + if (bo->map_gtt) { + VG_NOACCESS(bo->map_gtt, bo->size); + drm_munmap(bo->map_gtt, bo->size); } - if (bo->global_name) { - entry = _mesa_hash_table_search(bufmgr->name_table, &bo->global_name); - _mesa_hash_table_remove(bufmgr->name_table, entry); + if (bo->external) { + struct hash_entry *entry; + + if (bo->global_name) { + entry = _mesa_hash_table_search(bufmgr->name_table, &bo->global_name); + 
_mesa_hash_table_remove(bufmgr->name_table, entry); + } + + entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle); + _mesa_hash_table_remove(bufmgr->handle_table, entry); } - entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle); - _mesa_hash_table_remove(bufmgr->handle_table, entry); /* Close this object */ memclear(close); @@ -545,21 +591,6 @@ bo_free(struct brw_bo *bo) free(bo); } -static void -bo_mark_mmaps_incoherent(struct brw_bo *bo) -{ -#if HAVE_VALGRIND - if (bo->mem_virtual) - VALGRIND_MAKE_MEM_NOACCESS(bo->mem_virtual, bo->size); - - if (bo->wc_virtual) - VALGRIND_MAKE_MEM_NOACCESS(bo->wc_virtual, bo->size); - - if (bo->gtt_virtual) - VALGRIND_MAKE_MEM_NOACCESS(bo->gtt_virtual, bo->size); -#endif -} - /** Frees all cached buffers significantly older than @time. */ static void cleanup_bo_cache(struct brw_bufmgr *bufmgr, time_t time) @@ -593,13 +624,6 @@ bo_unreference_final(struct brw_bo *bo, time_t time) DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name); - /* Clear any left-over mappings */ - if (bo->map_count) { - DBG("bo freed with non-zero map-count %d\n", bo->map_count); - bo->map_count = 0; - bo_mark_mmaps_incoherent(bo); - } - bucket = bucket_for_size(bufmgr, bo->size); /* Put the buffer into our internal cache for reuse if we can. 
*/ if (bufmgr->bo_reuse && bo->reusable && bucket != NULL && @@ -607,6 +631,7 @@ bo_unreference_final(struct brw_bo *bo, time_t time) bo->free_time = time; bo->name = NULL; + bo->kflags = 0; list_addtail(&bo->head, &bucket->head); } else { @@ -628,228 +653,314 @@ brw_bo_unreference(struct brw_bo *bo) clock_gettime(CLOCK_MONOTONIC, &time); - pthread_mutex_lock(&bufmgr->lock); + mtx_lock(&bufmgr->lock); if (p_atomic_dec_zero(&bo->refcount)) { bo_unreference_final(bo, time.tv_sec); cleanup_bo_cache(bufmgr, time.tv_sec); } - pthread_mutex_unlock(&bufmgr->lock); + mtx_unlock(&bufmgr->lock); } } static void -set_domain(struct brw_bo *bo, uint32_t read_domains, uint32_t write_domain) +bo_wait_with_stall_warning(struct brw_context *brw, + struct brw_bo *bo, + const char *action) { - struct drm_i915_gem_set_domain sd = { - .handle = bo->gem_handle, - .read_domains = read_domains, - .write_domain = write_domain, - }; + bool busy = brw && brw->perf_debug && !bo->idle; + double elapsed = unlikely(busy) ? 
-get_time() : 0.0; - if (drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) { - DBG("%s:%d: Error setting memory domains %d (%08x %08x): %s.\n", - __FILE__, __LINE__, bo->gem_handle, read_domains, write_domain, - strerror(errno)); + brw_bo_wait_rendering(bo); + + if (unlikely(busy)) { + elapsed += get_time(); + if (elapsed > 1e-5) /* 0.01ms */ + perf_debug("%s a busy \"%s\" BO stalled and took %.03f ms.\n", + action, bo->name, elapsed * 1000); } } -int -brw_bo_map(struct brw_bo *bo, int write_enable) +static void +print_flags(unsigned flags) +{ + if (flags & MAP_READ) + DBG("READ "); + if (flags & MAP_WRITE) + DBG("WRITE "); + if (flags & MAP_ASYNC) + DBG("ASYNC "); + if (flags & MAP_PERSISTENT) + DBG("PERSISTENT "); + if (flags & MAP_COHERENT) + DBG("COHERENT "); + if (flags & MAP_RAW) + DBG("RAW "); + DBG("\n"); +} + +static void * +brw_bo_map_cpu(struct brw_context *brw, struct brw_bo *bo, unsigned flags) { struct brw_bufmgr *bufmgr = bo->bufmgr; - int ret; - pthread_mutex_lock(&bufmgr->lock); + /* We disallow CPU maps for writing to non-coherent buffers, as the + * CPU map can become invalidated when a batch is flushed out, which + * can happen at unpredictable times. You should use WC maps instead. 
+ */ + assert(bo->cache_coherent || !(flags & MAP_WRITE)); - if (!bo->mem_virtual) { + if (!bo->map_cpu) { struct drm_i915_gem_mmap mmap_arg; + void *map; - DBG("bo_map: %d (%s), map_count=%d\n", - bo->gem_handle, bo->name, bo->map_count); + DBG("brw_bo_map_cpu: %d (%s)\n", bo->gem_handle, bo->name); memclear(mmap_arg); mmap_arg.handle = bo->gem_handle; mmap_arg.size = bo->size; - ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg); + int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg); if (ret != 0) { ret = -errno; DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); - pthread_mutex_unlock(&bufmgr->lock); - return ret; + return NULL; + } + map = (void *) (uintptr_t) mmap_arg.addr_ptr; + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_cpu, NULL, map)) { + VG_NOACCESS(map, bo->size); + drm_munmap(map, bo->size); } - bo->map_count++; - VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, mmap_arg.size, 0, 1)); - bo->mem_virtual = (void *) (uintptr_t) mmap_arg.addr_ptr; } - DBG("bo_map: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->mem_virtual); - bo->virtual = bo->mem_virtual; + assert(bo->map_cpu); - set_domain(bo, I915_GEM_DOMAIN_CPU, - write_enable ? I915_GEM_DOMAIN_CPU : 0); + DBG("brw_bo_map_cpu: %d (%s) -> %p, ", bo->gem_handle, bo->name, + bo->map_cpu); + print_flags(flags); - bo_mark_mmaps_incoherent(bo); - VG(VALGRIND_MAKE_MEM_DEFINED(bo->mem_virtual, bo->size)); - pthread_mutex_unlock(&bufmgr->lock); + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(brw, bo, "CPU mapping"); + } - return 0; + if (!bo->cache_coherent && !bo->bufmgr->has_llc) { + /* If we're reusing an existing CPU mapping, the CPU caches may + * contain stale data from the last time we read from that mapping. + * (With the BO cache, it might even be data from a previous buffer!) + * Even if it's a brand new mapping, the kernel may have zeroed the + * buffer via CPU writes. 
+ * + * We need to invalidate those cachelines so that we see the latest + * contents, and so long as we only read from the CPU mmap we do not + * need to write those cachelines back afterwards. + * + * On LLC, the empirical evidence suggests that writes from the GPU + * that bypass the LLC (i.e. for scanout) do *invalidate* the CPU + * cachelines. (Other reads, such as the display engine, bypass the + * LLC entirely requiring us to keep dirty pixels for the scanout + * out of any cache.) + */ + gen_invalidate_range(bo->map_cpu, bo->size); + } + + return bo->map_cpu; } -static int -map_gtt(struct brw_bo *bo) +static void * +brw_bo_map_wc(struct brw_context *brw, struct brw_bo *bo, unsigned flags) { struct brw_bufmgr *bufmgr = bo->bufmgr; - int ret; - /* Get a mapping of the buffer if we haven't before. */ - if (bo->gtt_virtual == NULL) { - struct drm_i915_gem_mmap_gtt mmap_arg; + if (!bufmgr->has_mmap_wc) + return NULL; - DBG("bo_map_gtt: mmap %d (%s), map_count=%d\n", - bo->gem_handle, bo->name, bo->map_count); + if (!bo->map_wc) { + struct drm_i915_gem_mmap mmap_arg; + void *map; + + DBG("brw_bo_map_wc: %d (%s)\n", bo->gem_handle, bo->name); memclear(mmap_arg); mmap_arg.handle = bo->gem_handle; - - /* Get the fake offset back... 
*/ - ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg); + mmap_arg.size = bo->size; + mmap_arg.flags = I915_MMAP_WC; + int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg); if (ret != 0) { ret = -errno; - DBG("%s:%d: Error preparing buffer map %d (%s): %s .\n", + DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); - return ret; + return NULL; } - /* and mmap it */ - bo->gtt_virtual = drm_mmap(0, bo->size, PROT_READ | PROT_WRITE, - MAP_SHARED, bufmgr->fd, mmap_arg.offset); - if (bo->gtt_virtual == MAP_FAILED) { - bo->gtt_virtual = NULL; - ret = -errno; - DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", - __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); - return ret; + map = (void *) (uintptr_t) mmap_arg.addr_ptr; + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) { + VG_NOACCESS(map, bo->size); + drm_munmap(map, bo->size); } } + assert(bo->map_wc); - bo->map_count++; - bo->virtual = bo->gtt_virtual; + DBG("brw_bo_map_wc: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->map_wc); + print_flags(flags); - DBG("bo_map_gtt: %d (%s) -> %p\n", bo->gem_handle, bo->name, - bo->gtt_virtual); + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(brw, bo, "WC mapping"); + } - return 0; + return bo->map_wc; } -int -brw_bo_map_gtt(struct brw_bo *bo) +/** + * Perform an uncached mapping via the GTT. + * + * Write access through the GTT is not quite fully coherent. On low power + * systems especially, like modern Atoms, we can observe reads from RAM before + * the write via GTT has landed. A write memory barrier that flushes the Write + * Combining Buffer (i.e. sfence/mfence) is not sufficient to order the later + * read after the write as the GTT write suffers a small delay through the GTT + * indirection. 
The kernel uses an uncached mmio read to ensure the GTT write + * is ordered with reads (either by the GPU, WB or WC) and unconditionally + * flushes prior to execbuf submission. However, if we are not informing the + * kernel about our GTT writes, it will not flush before earlier access, such + * as when using the cmdparser. Similarly, we need to be careful if we should + * ever issue a CPU read immediately following a GTT write. + * + * Telling the kernel about write access also has one more important + * side-effect. Upon receiving notification about the write, it cancels any + * scanout buffering for FBC/PSR and friends. Later FBC/PSR is then flushed by + * either SW_FINISH or DIRTYFB. The presumption is that we never write to the + * actual scanout via a mmaping, only to a backbuffer and so all the FBC/PSR + * tracking is handled on the buffer exchange instead. + */ +static void * +brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo, unsigned flags) { struct brw_bufmgr *bufmgr = bo->bufmgr; - int ret; - pthread_mutex_lock(&bufmgr->lock); + /* Get a mapping of the buffer if we haven't before. */ + if (bo->map_gtt == NULL) { + struct drm_i915_gem_mmap_gtt mmap_arg; + void *map; - ret = map_gtt(bo); - if (ret) { - pthread_mutex_unlock(&bufmgr->lock); - return ret; + DBG("bo_map_gtt: mmap %d (%s)\n", bo->gem_handle, bo->name); + + memclear(mmap_arg); + mmap_arg.handle = bo->gem_handle; + + /* Get the fake offset back... */ + int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg); + if (ret != 0) { + DBG("%s:%d: Error preparing buffer map %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + /* and mmap it. 
*/ + map = drm_mmap(0, bo->size, PROT_READ | PROT_WRITE, + MAP_SHARED, bufmgr->fd, mmap_arg.offset); + if (map == MAP_FAILED) { + DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + /* We don't need to use VALGRIND_MALLOCLIKE_BLOCK because Valgrind will + * already intercept this mmap call. However, for consistency between + * all the mmap paths, we mark the pointer as defined now and mark it + * as inaccessible afterwards. + */ + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_gtt, NULL, map)) { + VG_NOACCESS(map, bo->size); + drm_munmap(map, bo->size); + } } + assert(bo->map_gtt); - /* Now move it to the GTT domain so that the GPU and CPU - * caches are flushed and the GPU isn't actively using the - * buffer. - * - * The pagefault handler does this domain change for us when - * it has unbound the BO from the GTT, but it's up to us to - * tell it when we're about to use things if we had done - * rendering and it still happens to be bound to the GTT. - */ - set_domain(bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT); + DBG("bo_map_gtt: %d (%s) -> %p, ", bo->gem_handle, bo->name, bo->map_gtt); + print_flags(flags); - bo_mark_mmaps_incoherent(bo); - VG(VALGRIND_MAKE_MEM_DEFINED(bo->gtt_virtual, bo->size)); - pthread_mutex_unlock(&bufmgr->lock); + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(brw, bo, "GTT mapping"); + } - return 0; + return bo->map_gtt; } -/** - * Performs a mapping of the buffer object like the normal GTT - * mapping, but avoids waiting for the GPU to be done reading from or - * rendering to the buffer. 
- * - * This is used in the implementation of GL_ARB_map_buffer_range: The - * user asks to create a buffer, then does a mapping, fills some - * space, runs a drawing command, then asks to map it again without - * synchronizing because it guarantees that it won't write over the - * data that the GPU is busy using (or, more specifically, that if it - * does write over the data, it acknowledges that rendering is - * undefined). - */ - -int -brw_bo_map_unsynchronized(struct brw_bo *bo) +static bool +can_map_cpu(struct brw_bo *bo, unsigned flags) { - struct brw_bufmgr *bufmgr = bo->bufmgr; - int ret; + if (bo->cache_coherent) + return true; - /* If the CPU cache isn't coherent with the GTT, then use a - * regular synchronized mapping. The problem is that we don't - * track where the buffer was last used on the CPU side in - * terms of brw_bo_map vs brw_bo_map_gtt, so - * we would potentially corrupt the buffer even when the user - * does reasonable things. + /* Even if the buffer itself is not cache-coherent (such as a scanout), on + * an LLC platform reads always are coherent (as they are performed via the + * central system agent). It is just the writes that we need to take special + * care to ensure that land in main memory and not stick in the CPU cache. */ - if (!bufmgr->has_llc) - return brw_bo_map_gtt(bo); - - pthread_mutex_lock(&bufmgr->lock); + if (!(flags & MAP_WRITE) && bo->bufmgr->has_llc) + return true; - ret = map_gtt(bo); - if (ret == 0) { - bo_mark_mmaps_incoherent(bo); - VG(VALGRIND_MAKE_MEM_DEFINED(bo->gtt_virtual, bo->size)); - } - - pthread_mutex_unlock(&bufmgr->lock); + /* If PERSISTENT or COHERENT are set, the mmapping needs to remain valid + * across batch flushes where the kernel will change cache domains of the + * bo, invalidating continued access to the CPU mmap on non-LLC device. + * + * Similarly, ASYNC typically means that the buffer will be accessed via + * both the CPU and the GPU simultaneously. 
Batches may be executed that + * use the BO even while it is mapped. While OpenGL technically disallows + * most drawing while non-persistent mappings are active, we may still use + * the GPU for blits or other operations, causing batches to happen at + * inconvenient times. + */ + if (flags & (MAP_PERSISTENT | MAP_COHERENT | MAP_ASYNC)) + return false; - return ret; + return !(flags & MAP_WRITE); } -int -brw_bo_unmap(struct brw_bo *bo) +void * +brw_bo_map(struct brw_context *brw, struct brw_bo *bo, unsigned flags) { - struct brw_bufmgr *bufmgr = bo->bufmgr; - int ret = 0; - - if (bo == NULL) - return 0; + if (bo->tiling_mode != I915_TILING_NONE && !(flags & MAP_RAW)) + return brw_bo_map_gtt(brw, bo, flags); - pthread_mutex_lock(&bufmgr->lock); + void *map; - if (bo->map_count <= 0) { - DBG("attempted to unmap an unmapped bo\n"); - pthread_mutex_unlock(&bufmgr->lock); - /* Preserve the old behaviour of just treating this as a - * no-op rather than reporting the error. - */ - return 0; - } + if (can_map_cpu(bo, flags)) + map = brw_bo_map_cpu(brw, bo, flags); + else + map = brw_bo_map_wc(brw, bo, flags); - if (--bo->map_count == 0) { - bo_mark_mmaps_incoherent(bo); - bo->virtual = NULL; + /* Allow the attempt to fail by falling back to the GTT where necessary. + * + * Not every buffer can be mmaped directly using the CPU (or WC), for + * example buffers that wrap stolen memory or are imported from other + * devices. For those, we have little choice but to use a GTT mmapping. + * However, if we use a slow GTT mmapping for reads where we expected fast + * access, that order of magnitude difference in throughput will be clearly + * expressed by angry users. + * + * We skip MAP_RAW because we want to avoid map_gtt's fence detiling. 
+ */ + if (!map && !(flags & MAP_RAW)) { + if (brw) { + perf_debug("Fallback GTT mapping for %s with access flags %x\n", + bo->name, flags); + } + map = brw_bo_map_gtt(brw, bo, flags); } - pthread_mutex_unlock(&bufmgr->lock); - return ret; + return map; } int -brw_bo_subdata(struct brw_bo *bo, unsigned long offset, - unsigned long size, const void *data) +brw_bo_subdata(struct brw_bo *bo, uint64_t offset, + uint64_t size, const void *data) { struct brw_bufmgr *bufmgr = bo->bufmgr; struct drm_i915_gem_pwrite pwrite; @@ -863,33 +974,9 @@ brw_bo_subdata(struct brw_bo *bo, unsigned long offset, ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite); if (ret != 0) { ret = -errno; - DBG("%s:%d: Error writing data to buffer %d: (%d %d) %s .\n", - __FILE__, __LINE__, bo->gem_handle, (int) offset, - (int) size, strerror(errno)); - } - - return ret; -} - -int -brw_bo_get_subdata(struct brw_bo *bo, unsigned long offset, - unsigned long size, void *data) -{ - struct brw_bufmgr *bufmgr = bo->bufmgr; - struct drm_i915_gem_pread pread; - int ret; - - memclear(pread); - pread.handle = bo->gem_handle; - pread.offset = offset; - pread.size = size; - pread.data_ptr = (uint64_t) (uintptr_t) data; - ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_PREAD, &pread); - if (ret != 0) { - ret = -errno; - DBG("%s:%d: Error reading data from buffer %d: (%d %d) %s .\n", - __FILE__, __LINE__, bo->gem_handle, (int) offset, - (int) size, strerror(errno)); + DBG("%s:%d: Error writing data to buffer %d: " + "(%"PRIu64" %"PRIu64") %s .\n", + __FILE__, __LINE__, bo->gem_handle, offset, size, strerror(errno)); } return ret; @@ -899,7 +986,10 @@ brw_bo_get_subdata(struct brw_bo *bo, unsigned long offset, void brw_bo_wait_rendering(struct brw_bo *bo) { - set_domain(bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT); + /* We require a kernel recent enough for WAIT_IOCTL support. 
+ * See intel_init_bufmgr() + */ + brw_bo_wait(bo, -1); } /** @@ -936,6 +1026,10 @@ brw_bo_wait(struct brw_bo *bo, int64_t timeout_ns) struct drm_i915_gem_wait wait; int ret; + /* If we know it's idle, don't bother with the kernel round trip */ + if (bo->idle && !bo->external) + return 0; + memclear(wait); wait.bo_handle = bo->gem_handle; wait.timeout_ns = timeout_ns; @@ -943,13 +1037,15 @@ brw_bo_wait(struct brw_bo *bo, int64_t timeout_ns) if (ret == -1) return -errno; + bo->idle = true; + return ret; } void brw_bufmgr_destroy(struct brw_bufmgr *bufmgr) { - pthread_mutex_destroy(&bufmgr->lock); + mtx_destroy(&bufmgr->lock); /* Free any cached buffer objects we were going to reuse */ for (int i = 0; i < bufmgr->num_buckets; i++) { @@ -1011,20 +1107,19 @@ brw_bo_get_tiling(struct brw_bo *bo, uint32_t *tiling_mode, } struct brw_bo * -brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr, int prime_fd, - int size) +brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr, int prime_fd) { int ret; uint32_t handle; struct brw_bo *bo; struct drm_i915_gem_get_tiling get_tiling; - pthread_mutex_lock(&bufmgr->lock); + mtx_lock(&bufmgr->lock); ret = drmPrimeFDToHandle(bufmgr->fd, prime_fd, &handle); if (ret) { DBG("create_from_prime: failed to obtain handle from fd: %s\n", strerror(errno)); - pthread_mutex_unlock(&bufmgr->lock); + mtx_unlock(&bufmgr->lock); return NULL; } @@ -1053,8 +1148,6 @@ brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr, int prime_fd, ret = lseek(prime_fd, 0, SEEK_END); if (ret != -1) bo->size = ret; - else - bo->size = size; bo->bufmgr = bufmgr; @@ -1063,6 +1156,7 @@ brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr, int prime_fd, bo->name = "prime"; bo->reusable = false; + bo->external = true; memclear(get_tiling); get_tiling.handle = bo->gem_handle; @@ -1074,12 +1168,12 @@ brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr, int prime_fd, /* XXX stride is unknown */ out: - pthread_mutex_unlock(&bufmgr->lock); + mtx_unlock(&bufmgr->lock); 
return bo; err: bo_free(bo); - pthread_mutex_unlock(&bufmgr->lock); + mtx_unlock(&bufmgr->lock); return NULL; } @@ -1088,6 +1182,15 @@ brw_bo_gem_export_to_prime(struct brw_bo *bo, int *prime_fd) { struct brw_bufmgr *bufmgr = bo->bufmgr; + if (!bo->external) { + mtx_lock(&bufmgr->lock); + if (!bo->external) { + _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); + bo->external = true; + } + mtx_unlock(&bufmgr->lock); + } + if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle, DRM_CLOEXEC, prime_fd) != 0) return -errno; @@ -1110,14 +1213,18 @@ brw_bo_flink(struct brw_bo *bo, uint32_t *name) if (drmIoctl(bufmgr->fd, DRM_IOCTL_GEM_FLINK, &flink)) return -errno; - pthread_mutex_lock(&bufmgr->lock); + mtx_lock(&bufmgr->lock); + if (!bo->external) { + _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); + bo->external = true; + } if (!bo->global_name) { bo->global_name = flink.name; - bo->reusable = false; - _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo); } - pthread_mutex_unlock(&bufmgr->lock); + mtx_unlock(&bufmgr->lock); + + bo->reusable = false; } *name = bo->global_name; @@ -1152,7 +1259,7 @@ add_bucket(struct brw_bufmgr *bufmgr, int size) static void init_cache_buckets(struct brw_bufmgr *bufmgr) { - unsigned long size, cache_max_size = 64 * 1024 * 1024; + uint64_t size, cache_max_size = 64 * 1024 * 1024; /* OK, so power of two buckets was too wasteful of memory. 
* Give 3 other sizes between each power of two, to hopefully @@ -1192,6 +1299,25 @@ brw_create_hw_context(struct brw_bufmgr *bufmgr) return create.ctx_id; } +int +brw_hw_context_set_priority(struct brw_bufmgr *bufmgr, + uint32_t ctx_id, + int priority) +{ + struct drm_i915_gem_context_param p = { + .ctx_id = ctx_id, + .param = I915_CONTEXT_PARAM_PRIORITY, + .value = priority, + }; + int err; + + err = 0; + if (drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p)) + err = -errno; + + return err; +} + void brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id) { @@ -1219,109 +1345,19 @@ brw_reg_read(struct brw_bufmgr *bufmgr, uint32_t offset, uint64_t *result) return ret; } -void * -brw_bo_map__gtt(struct brw_bo *bo) -{ - struct brw_bufmgr *bufmgr = bo->bufmgr; - - if (bo->gtt_virtual) - return bo->gtt_virtual; - - pthread_mutex_lock(&bufmgr->lock); - if (bo->gtt_virtual == NULL) { - struct drm_i915_gem_mmap_gtt mmap_arg; - void *ptr; - - DBG("bo_map_gtt: mmap %d (%s), map_count=%d\n", - bo->gem_handle, bo->name, bo->map_count); - - memclear(mmap_arg); - mmap_arg.handle = bo->gem_handle; - - /* Get the fake offset back... 
*/ - ptr = MAP_FAILED; - if (drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg) == 0) { - /* and mmap it */ - ptr = drm_mmap(0, bo->size, PROT_READ | PROT_WRITE, - MAP_SHARED, bufmgr->fd, mmap_arg.offset); - } - if (ptr == MAP_FAILED) { - --bo->map_count; - ptr = NULL; - } - - bo->gtt_virtual = ptr; - } - pthread_mutex_unlock(&bufmgr->lock); - - return bo->gtt_virtual; -} - -void * -brw_bo_map__cpu(struct brw_bo *bo) -{ - struct brw_bufmgr *bufmgr = bo->bufmgr; - - if (bo->mem_virtual) - return bo->mem_virtual; - - pthread_mutex_lock(&bufmgr->lock); - if (!bo->mem_virtual) { - struct drm_i915_gem_mmap mmap_arg; - - DBG("bo_map: %d (%s), map_count=%d\n", - bo->gem_handle, bo->name, bo->map_count); - - memclear(mmap_arg); - mmap_arg.handle = bo->gem_handle; - mmap_arg.size = bo->size; - if (drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) { - DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", - __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); - } else { - bo->map_count++; - VG(VALGRIND_MALLOCLIKE_BLOCK - (mmap_arg.addr_ptr, mmap_arg.size, 0, 1)); - bo->mem_virtual = (void *) (uintptr_t) mmap_arg.addr_ptr; - } - } - pthread_mutex_unlock(&bufmgr->lock); - - return bo->mem_virtual; -} - -void * -brw_bo_map__wc(struct brw_bo *bo) +static int +gem_param(int fd, int name) { - struct brw_bufmgr *bufmgr = bo->bufmgr; - - if (bo->wc_virtual) - return bo->wc_virtual; - - pthread_mutex_lock(&bufmgr->lock); - if (!bo->wc_virtual) { - struct drm_i915_gem_mmap mmap_arg; + drm_i915_getparam_t gp; + int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */ - DBG("bo_map: %d (%s), map_count=%d\n", - bo->gem_handle, bo->name, bo->map_count); - - memclear(mmap_arg); - mmap_arg.handle = bo->gem_handle; - mmap_arg.size = bo->size; - mmap_arg.flags = I915_MMAP_WC; - if (drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) { - DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", - __FILE__, __LINE__, bo->gem_handle, bo->name, 
strerror(errno)); - } else { - bo->map_count++; - VG(VALGRIND_MALLOCLIKE_BLOCK - (mmap_arg.addr_ptr, mmap_arg.size, 0, 1)); - bo->wc_virtual = (void *) (uintptr_t) mmap_arg.addr_ptr; - } - } - pthread_mutex_unlock(&bufmgr->lock); + memset(&gp, 0, sizeof(gp)); + gp.param = name; + gp.value = &v; + if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp)) + return -1; - return bo->wc_virtual; + return v; } /** @@ -1331,7 +1367,7 @@ brw_bo_map__wc(struct brw_bo *bo) * \param fd File descriptor of the opened DRM device. */ struct brw_bufmgr * -brw_bufmgr_init(struct gen_device_info *devinfo, int fd, int batch_size) +brw_bufmgr_init(struct gen_device_info *devinfo, int fd) { struct brw_bufmgr *bufmgr; @@ -1350,12 +1386,13 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd, int batch_size) */ bufmgr->fd = fd; - if (pthread_mutex_init(&bufmgr->lock, NULL) != 0) { + if (mtx_init(&bufmgr->lock, mtx_plain) != 0) { free(bufmgr); return NULL; } bufmgr->has_llc = devinfo->has_llc; + bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0; init_cache_buckets(bufmgr);