From 46af0ecc1d1f060786a1c2dfede1f936b407fbf6 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 25 Nov 2019 21:55:51 -0600 Subject: [PATCH] anv: Use PIPE_CONTROL flushes to implement the gen8 VF cache WA Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/anv_device.c | 14 ++- src/intel/vulkan/anv_genX.h | 8 ++ src/intel/vulkan/anv_private.h | 26 +++++ src/intel/vulkan/genX_blorp_exec.c | 33 +++--- src/intel/vulkan/genX_cmd_buffer.c | 180 +++++++++++++++++++++++++++++ src/intel/vulkan/genX_gpu_memcpy.c | 4 + 6 files changed, 245 insertions(+), 20 deletions(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 2c48003b276..be4d23356e3 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -141,8 +141,12 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd) } } + /* We only allow 48-bit addresses with softpin because knowing the actual + * address is required for the vertex cache flush workaround. + */ device->supports_48bit_addresses = (device->info.gen >= 8) && - gtt_size > (4ULL << 30 /* GiB */); + device->has_softpin && + gtt_size > (4ULL << 30 /* GiB */); uint64_t heap_size = anv_compute_heap_size(fd, gtt_size); @@ -471,10 +475,6 @@ anv_physical_device_init(struct anv_physical_device *device, goto fail; } - result = anv_physical_device_init_heaps(device, fd); - if (result != VK_SUCCESS) - goto fail; - device->has_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN); device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC); device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE); @@ -484,6 +484,10 @@ anv_physical_device_init(struct anv_physical_device *device, anv_gem_supports_syncobj_wait(fd); device->has_context_priority = anv_gem_has_context_priority(fd); + result = anv_physical_device_init_heaps(device, fd); + if (result != VK_SUCCESS) + goto fail; + device->use_softpin = device->has_softpin && device->supports_48bit_addresses; diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index 0274fe8b3a8..8c2a0e40099 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -44,6 +44,14 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer); void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer); +void genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size); +void genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used); + void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, unsigned width, unsigned height, unsigned scale); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index fd6f0fdb104..2abbb866b2f 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2503,6 +2503,27 @@ struct anv_attachment_state { struct anv_image_view * image_view; }; +/** State tracking for vertex buffer flushes + * + * On Gen8-9, the VF cache only considers the bottom 32 bits of memory + * addresses. If you happen to have two vertex buffers which get placed + * exactly 4 GiB apart and use them in back-to-back draw calls, you can get + * collisions. In order to solve this problem, we track vertex address ranges + * which are live in the cache and invalidate the cache if one ever exceeds 32 + * bits. 
+ */ +struct anv_vb_cache_range { + /* Virtual address at which the live vertex buffer cache range starts for + * this vertex buffer index. + */ + uint64_t start; + + /* Virtual address of the byte after where vertex buffer cache range ends. + * This is exclusive such that end - start is the size of the range. + */ + uint64_t end; +}; + /** State tracking for particular pipeline bind point * * This struct is the base struct for anv_cmd_graphics_state and @@ -2531,6 +2552,11 @@ struct anv_cmd_graphics_state { anv_cmd_dirty_mask_t dirty; uint32_t vb_dirty; + struct anv_vb_cache_range ib_bound_range; + struct anv_vb_cache_range ib_dirty_range; + struct anv_vb_cache_range vb_bound_ranges[33]; + struct anv_vb_cache_range vb_dirty_ranges[33]; + struct anv_dynamic_state dynamic; struct { diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 79e18d95282..302acb54461 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -139,19 +139,6 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size, struct blorp_address *addr) { struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; - - /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: - * - * "The VF cache needs to be invalidated before binding and then using - * Vertex Buffers that overlap with any previously bound Vertex Buffer - * (at a 64B granularity) since the last invalidation. A VF cache - * invalidate is performed by setting the "VF Cache Invalidation Enable" - * bit in PIPE_CONTROL." - * - * This restriction first appears in the Skylake PRM but the internal docs - * also list it as being an issue on Broadwell. In order to avoid this - * problem, we align all vertex buffer allocations to 64 bytes. - */ struct anv_state vb_state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64); @@ -170,9 +157,25 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch, uint32_t *sizes, unsigned num_vbs) { - /* anv forces all vertex buffers into the low 4GB so there are never any - * transitions that require a VF invalidation. + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + for (unsigned i = 0; i < num_vbs; i++) { + struct anv_address anv_addr = { + .bo = addrs[i].buffer, + .offset = addrs[i].offset, + }; + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, + i, anv_addr, sizes[i]); + } + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Technically, we should call this *after* 3DPRIMITIVE but it doesn't + * really matter for blorp because we never call apply_pipe_flushes after + * this point. */ + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL, + (1 << num_vbs) - 1); } #if GEN_GEN >= 8 diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 677377ea302..c764011eb5f 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1392,6 +1392,10 @@ genX(BeginCommandBuffer)( * executing anything. The chances are fairly high that they will use * blorp at least once per primary command buffer so it shouldn't be * wasted. + * + * There is also a workaround on gen8 which requires us to invalidate the + * VF cache occasionally. It's easier if we can assume we start with a + * fresh cache (See also genX(cmd_buffer_set_binding_for_gen8_vb_flush).) 
*/ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; @@ -1598,6 +1602,14 @@ genX(CmdExecuteCommands)( anv_cmd_buffer_add_secondary(primary, secondary); } + /* The secondary isn't counted in our VF cache tracking so we need to + * invalidate the whole thing. + */ + if (GEN_GEN >= 8 && GEN_GEN <= 9) { + primary->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } + /* The secondary may have selected a different pipeline (3D or compute) and * may have changed the current L3$ configuration. Reset our tracking * variables to invalid values to ensure that we re-emit these in the case @@ -1836,6 +1848,18 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT; } + if ((GEN_GEN >= 8 && GEN_GEN <= 9) && + (bits & ANV_PIPE_CS_STALL_BIT) && + (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) { + /* If we are doing a VF cache invalidate AND a CS stall (it must be + * both) then we can reset our vertex cache tracking. + */ + memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0, + sizeof(cmd_buffer->state.gfx.vb_dirty_ranges)); + memset(&cmd_buffer->state.gfx.ib_dirty_range, 0, + sizeof(cmd_buffer->state.gfx.ib_dirty_range)); + } + if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { #if GEN_GEN >= 12 @@ -2830,6 +2854,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) #endif }; +#if GEN_GEN >= 8 && GEN_GEN <= 9 + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, vb, + state.BufferStartingAddress, + state.BufferSize); +#endif + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); i++; } @@ -2967,6 +2997,9 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, .EndAddress = anv_address_add(addr, size), #endif }); + + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, + index, addr, size); } static void @@ -3014,6 +3047,25 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); } +static void +update_dirty_vbs_for_gen8_vb_flush(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type) +{ + struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + uint64_t vb_used = pipeline->vb_used; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + vb_used |= 1ull << ANV_SVGS_VB_INDEX; + if (vs_prog_data->uses_drawid) + vb_used |= 1ull << ANV_DRAWID_VB_INDEX; + + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, + access_type == RANDOM, + vb_used); +} + void genX(CmdDraw)( VkCommandBuffer commandBuffer, uint32_t vertexCount, @@ -3059,6 +3111,8 @@ void genX(CmdDraw)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = 0; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); } void genX(CmdDrawIndexed)( @@ -3107,6 +3161,8 @@ void genX(CmdDrawIndexed)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = vertexOffset; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); } /* Auto-Draw / Indirect Registers */ @@ -3179,6 +3235,8 @@ void genX(CmdDrawIndirectByteCountEXT)( prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = pipeline->topology; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); #endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */ } @@ -3263,6 +3321,8 @@ void genX(CmdDrawIndirect)( 
prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + offset += stride; } } @@ -3311,6 +3371,8 @@ void genX(CmdDrawIndexedIndirect)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + offset += stride; } } @@ -3465,6 +3527,8 @@ void genX(CmdDrawIndirectCountKHR)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + offset += stride; } } @@ -3530,6 +3594,8 @@ void genX(CmdDrawIndexedIndirectCountKHR)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + offset += stride; } } @@ -4115,6 +4181,120 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) } } +/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: + * + * "The VF cache needs to be invalidated before binding and then using + * Vertex Buffers that overlap with any previously bound Vertex Buffer + * (at a 64B granularity) since the last invalidation. A VF cache + * invalidate is performed by setting the "VF Cache Invalidation Enable" + * bit in PIPE_CONTROL." + * + * This is implemented by carefully tracking all vertex and index buffer + * bindings and flushing if the cache ever ends up with a range in the cache + * that would exceed 4 GiB. This is implemented in three parts: + * + * 1. genX(cmd_buffer_set_binding_for_gen8_vb_flush)() which must be called + * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the + * tracking code of the new binding. If this new binding would cause + * the cache to have a too-large range on the next draw call, a pipeline + * stall and VF cache invalidate are added to pending_pipeline_bits. + * + * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to + * empty whenever we emit a VF invalidate. + * + * 3. genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)() must be called + * after every 3DPRIMITIVE and copies the bound range into the dirty + * range for each used buffer. This has to be a separate step because + * we don't always re-bind all buffers and so 1. can't know which + * buffers are actually bound. 
+ */ +void +genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size) +{ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->instance->physicalDevice.use_softpin) + return; + + struct anv_vb_cache_range *bound, *dirty; + if (vb_index == -1) { + bound = &cmd_buffer->state.gfx.ib_bound_range; + dirty = &cmd_buffer->state.gfx.ib_dirty_range; + } else { + assert(vb_index >= 0); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index]; + } + + if (vb_size == 0) { + bound->start = 0; + bound->end = 0; + return; + } + + assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED)); + bound->start = gen_48b_address(anv_address_physical(vb_address)); + bound->end = bound->start + vb_size; + assert(bound->end > bound->start); /* No overflow */ + + /* Align everything to a cache line */ + bound->start &= ~(64ull - 1ull); + bound->end = align_u64(bound->end, 64); + + /* Compute the dirty range */ + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + + /* If our range is larger than 32 bits, we have to flush */ + assert(bound->end - bound->start <= (1ull << 32)); + if (dirty->end - dirty->start > (1ull << 32)) { + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } +} + +void +genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used) +{ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->instance->physicalDevice.use_softpin) + return; + + if (access_type == RANDOM) { + /* We have an index buffer */ + struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range; + struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } + + uint64_t mask = vb_used; + while (mask) { + int i = u_bit_scan64(&mask); + assert(i >= 0); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + + struct anv_vb_cache_range *bound, *dirty; + bound = &cmd_buffer->state.gfx.vb_bound_ranges[i]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i]; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } +} + /** * Update the pixel hashing modes that determine the balancing of PS threads * across subslices and slices. 
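
For reference, the merge-and-check arithmetic that the three-step scheme described in the comment above genX(cmd_buffer_set_binding_for_gen8_vb_flush) relies on can be sketched in isolation. The snippet below is a simplified stand-alone model and is not part of the patch: cache_range and bind_needs_flush are made-up stand-ins for anv_vb_cache_range and the driver helpers, and only the range-growing and 32-bit overflow test mirror the code above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for anv_vb_cache_range: a [start, end) byte range
 * that may currently be live in the VF cache.
 */
struct cache_range {
   uint64_t start;
   uint64_t end;
};

/* Merge a new binding into the dirty range and report whether the combined
 * range now spans more than 32 bits of address space, i.e. whether the
 * driver would queue a CS stall + VF cache invalidate.
 */
static bool
bind_needs_flush(struct cache_range *dirty, uint64_t address, uint32_t size)
{
   if (size == 0)
      return false;

   /* Align the new range to 64B cache lines, the granularity the PRM quote
    * above calls out.
    */
   uint64_t start = address & ~63ull;
   uint64_t end = (address + size + 63) & ~63ull;

   /* Grow the dirty range; an empty range is represented as start == end. */
   if (dirty->start == dirty->end) {
      dirty->start = start;
      dirty->end = end;
   } else {
      if (start < dirty->start)
         dirty->start = start;
      if (end > dirty->end)
         dirty->end = end;
   }

   return dirty->end - dirty->start > (1ull << 32);
}

int
main(void)
{
   struct cache_range dirty = { 0, 0 };

   /* Two buffers about 1 MiB apart: the live range stays well under 4 GiB. */
   printf("flush: %d\n", bind_needs_flush(&dirty, 0x100000000ull, 4096));
   printf("flush: %d\n", bind_needs_flush(&dirty, 0x100100000ull, 4096));

   /* A third buffer slightly more than 4 GiB past the first: the merged
    * range exceeds 32 bits, so the driver would invalidate the VF cache and
    * reset its tracking before the next 3DPRIMITIVE.
    */
   printf("flush: %d\n", bind_needs_flush(&dirty, 0x200200000ull, 4096));
   return 0;
}

The real driver additionally keeps a separate bound range per binding and re-merges the ranges of every buffer a draw actually used after the 3DPRIMITIVE (step 3), but the overflow test is the same.
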
diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c index 5af7085393e..28de5def12d 100644 --- a/src/intel/vulkan/genX_gpu_memcpy.c +++ b/src/intel/vulkan/genX_gpu_memcpy.c @@ -78,6 +78,7 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, genX(cmd_buffer_config_l3)(cmd_buffer, cfg); } + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, 32, src, size); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); genX(flush_pipeline_select_3d)(cmd_buffer); @@ -229,5 +230,8 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, prim.BaseVertexLocation = 0; } + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL, + 1ull << 32); + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; } -- 2.30.2
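
As background for the workaround: per the anv_vb_cache_range comment in anv_private.h above, the Gen8-9 VF cache considers only the low 32 bits of a vertex address, so two buffers placed exactly 4 GiB apart alias to the same cache tag. The snippet below is a stand-alone illustration of that aliasing, not part of the patch; the addresses are made up rather than real allocations.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
   /* Hypothetical 48-bit virtual addresses of two vertex buffers that a
    * softpin allocator could legally hand out.
    */
   const uint64_t buf_a = 0x0000000140000000ull;
   const uint64_t buf_b = buf_a + (1ull << 32); /* exactly 4 GiB higher */

   /* The Gen8-9 VF cache effectively keys entries by the low 32 bits only. */
   printf("tag(buf_a) = 0x%08" PRIx32 "\n", (uint32_t)buf_a);
   printf("tag(buf_b) = 0x%08" PRIx32 "\n", (uint32_t)buf_b);

   /* Both lines print 0x40000000: back-to-back draws that use buf_a and
    * then buf_b could hit stale cache entries unless a VF cache invalidate
    * is emitted in between, which is what the tracking in this patch
    * arranges.
    */
   return 0;
}
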