From 3a83c176eab8c513eb723554a240af1a7308d701 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 20 May 2016 11:49:12 -0700 Subject: [PATCH] anv/cmd_buffer: Only emit PIPE_CONTROL on-demand This is in contrast to emitting it directly in vkCmdPipelineBarrier. This has a couple of advantages. First, it means that no matter how many vkCmdPipelineBarrier calls the application strings together it gets one or two PIPE_CONTROLs. Second, it allow us to better track when we need to do stalls because we can flag when a flush has happened and we need a stall. Signed-off-by: Jason Ekstrand Reviewed-by: Jordan Justen --- src/intel/vulkan/anv_cmd_buffer.c | 1 + src/intel/vulkan/anv_genX.h | 2 + src/intel/vulkan/anv_private.h | 40 +++++++ src/intel/vulkan/gen7_cmd_buffer.c | 2 + src/intel/vulkan/gen8_cmd_buffer.c | 2 + src/intel/vulkan/genX_cmd_buffer.c | 163 ++++++++++++++++------------- 6 files changed, 140 insertions(+), 70 deletions(-) diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index db7793ee462..25ab9532ae1 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -127,6 +127,7 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer) state->dirty = 0; state->vb_dirty = 0; + state->pending_pipe_bits = 0; state->descriptors_dirty = 0; state->push_constants_dirty = 0; state->pipeline = NULL; diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index a5ec27da2bd..cf5a232c218 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -39,6 +39,8 @@ genX(cmd_buffer_alloc_null_surface_state)(struct anv_cmd_buffer *cmd_buffer, void genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer, struct anv_subpass *subpass); +void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer); + void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index f6ce31da1c4..b3774a88cb2 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1066,6 +1066,45 @@ enum anv_cmd_dirty_bits { }; typedef uint32_t anv_cmd_dirty_mask_t; +enum anv_pipe_bits { + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT = (1 << 0), + ANV_PIPE_STALL_AT_SCOREBOARD_BIT = (1 << 1), + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT = (1 << 2), + ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT = (1 << 3), + ANV_PIPE_VF_CACHE_INVALIDATE_BIT = (1 << 4), + ANV_PIPE_DATA_CACHE_FLUSH_BIT = (1 << 5), + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT = (1 << 10), + ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT = (1 << 11), + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT = (1 << 12), + ANV_PIPE_DEPTH_STALL_BIT = (1 << 13), + ANV_PIPE_CS_STALL_BIT = (1 << 20), + + /* This bit does not exist directly in PIPE_CONTROL. Instead it means that + * a flush has happened but not a CS stall. The next time we do any sort + * of invalidation we need to insert a CS stall at that time. Otherwise, + * we would have to CS stall on every flush which could be bad. + */ + ANV_PIPE_NEEDS_CS_STALL_BIT = (1 << 21), +}; + +#define ANV_PIPE_FLUSH_BITS ( \ + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \ + ANV_PIPE_DATA_CACHE_FLUSH_BIT | \ + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) + +#define ANV_PIPE_STALL_BITS ( \ + ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \ + ANV_PIPE_DEPTH_STALL_BIT | \ + ANV_PIPE_CS_STALL_BIT) + +#define ANV_PIPE_INVALIDATE_BITS ( \ + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_VF_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_DATA_CACHE_FLUSH_BIT | \ + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT) + struct anv_vertex_binding { struct anv_buffer * buffer; VkDeviceSize offset; @@ -1164,6 +1203,7 @@ struct anv_cmd_state { uint32_t vb_dirty; anv_cmd_dirty_mask_t dirty; anv_cmd_dirty_mask_t compute_dirty; + enum anv_pipe_bits pending_pipe_bits; uint32_t num_workgroups_offset; struct anv_bo *num_workgroups_bo; VkShaderStageFlags descriptors_dirty; diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index 7b55133fede..331275e5a88 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -307,6 +307,8 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) } cmd_buffer->state.compute_dirty = 0; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } void diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c index 065cf9e7638..547fedd1eab 100644 --- a/src/intel/vulkan/gen8_cmd_buffer.c +++ b/src/intel/vulkan/gen8_cmd_buffer.c @@ -391,6 +391,8 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) } cmd_buffer->state.compute_dirty = 0; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } void genX(CmdSetEvent)( diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 9f61a10c3ef..64172ca377b 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -136,6 +136,82 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) } } +void +genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) +{ + enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; + + /* Flushes are pipelined while invalidations are handled immediately. + * Therefore, if we're flushing anything then we need to schedule a stall + * before any invalidations can happen. + */ + if (bits & ANV_PIPE_FLUSH_BITS) + bits |= ANV_PIPE_NEEDS_CS_STALL_BIT; + + /* If we're going to do an invalidate and we have a pending CS stall that + * has yet to be resolved, we do the CS stall now. + */ + if ((bits & ANV_PIPE_INVALIDATE_BITS) && + (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) { + bits |= ANV_PIPE_CS_STALL_BIT; + bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT; + } + + if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; + pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT; + pipe.RenderTargetCacheFlushEnable = + bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + + pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT; + pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT; + pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + + /* + * According to the Broadwell documentation, any PIPE_CONTROL with the + * "Command Streamer Stall" bit set must also have another bit set, + * with five different options: + * + * - Render Target Cache Flush + * - Depth Cache Flush + * - Stall at Pixel Scoreboard + * - Post-Sync Operation + * - Depth Stall + * - DC Flush Enable + * + * I chose "Stall at Pixel Scoreboard" since that's what we use in + * mesa and it seems to work fine. The choice is fairly arbitrary. + */ + if ((bits & ANV_PIPE_CS_STALL_BIT) && + !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT))) + pipe.StallAtPixelScoreboard = true; + } + + bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT); + } + + if (bits & ANV_PIPE_INVALIDATE_BITS) { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.StateCacheInvalidationEnable = + bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT; + pipe.ConstantCacheInvalidationEnable = + bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT; + pipe.VFCacheInvalidationEnable = + bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + pipe.TextureCacheInvalidationEnable = + bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + pipe.InstructionCacheInvalidateEnable = + bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT; + } + + bits &= ~ANV_PIPE_INVALIDATE_BITS; + } + + cmd_buffer->state.pending_pipe_bits = bits; +} + void genX(CmdPipelineBarrier)( VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, @@ -149,7 +225,7 @@ void genX(CmdPipelineBarrier)( const VkImageMemoryBarrier* pImageMemoryBarriers) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - uint32_t b, *dw; + uint32_t b; /* XXX: Right now, we're really dumb and just flush whatever categories * the app asks for. One of these days we may make this a bit better @@ -173,105 +249,50 @@ void genX(CmdPipelineBarrier)( dst_flags |= pImageMemoryBarriers[i].dstAccessMask; } - /* Mask out the Source access flags we care about */ - const uint32_t src_mask = - VK_ACCESS_SHADER_WRITE_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | - VK_ACCESS_TRANSFER_WRITE_BIT; - - src_flags = src_flags & src_mask; - - /* Mask out the destination access flags we care about */ - const uint32_t dst_mask = - VK_ACCESS_INDIRECT_COMMAND_READ_BIT | - VK_ACCESS_INDEX_READ_BIT | - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | - VK_ACCESS_UNIFORM_READ_BIT | - VK_ACCESS_SHADER_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_TRANSFER_READ_BIT; - - dst_flags = dst_flags & dst_mask; - - /* The src flags represent how things were used previously. This is - * what we use for doing flushes. - */ - struct GENX(PIPE_CONTROL) flush_cmd = { - GENX(PIPE_CONTROL_header), - .PostSyncOperation = NoWrite, - }; + enum anv_pipe_bits pipe_bits = 0; for_each_bit(b, src_flags) { switch ((VkAccessFlagBits)(1 << b)) { case VK_ACCESS_SHADER_WRITE_BIT: - flush_cmd.DCFlushEnable = true; + pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT; break; case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: - flush_cmd.RenderTargetCacheFlushEnable = true; + pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; break; case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: - flush_cmd.DepthCacheFlushEnable = true; + pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; break; case VK_ACCESS_TRANSFER_WRITE_BIT: - flush_cmd.RenderTargetCacheFlushEnable = true; - flush_cmd.DepthCacheFlushEnable = true; + pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; break; default: - unreachable("should've masked this out by now"); + break; /* Nothing to do */ } } - /* If we end up doing two PIPE_CONTROLs, the first, flusing one also has to - * stall and wait for the flushing to finish, so we don't re-dirty the - * caches with in-flight rendering after the second PIPE_CONTROL - * invalidates. - */ - - if (dst_flags) - flush_cmd.CommandStreamerStallEnable = true; - - if (src_flags && dst_flags) { - dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length)); - GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &flush_cmd); - } - - /* The dst flags represent how things will be used in the future. This - * is what we use for doing cache invalidations. - */ - struct GENX(PIPE_CONTROL) invalidate_cmd = { - GENX(PIPE_CONTROL_header), - .PostSyncOperation = NoWrite, - }; - for_each_bit(b, dst_flags) { switch ((VkAccessFlagBits)(1 << b)) { case VK_ACCESS_INDIRECT_COMMAND_READ_BIT: case VK_ACCESS_INDEX_READ_BIT: case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT: - invalidate_cmd.VFCacheInvalidationEnable = true; + pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; break; case VK_ACCESS_UNIFORM_READ_BIT: - invalidate_cmd.ConstantCacheInvalidationEnable = true; - /* fallthrough */ - case VK_ACCESS_SHADER_READ_BIT: - invalidate_cmd.TextureCacheInvalidationEnable = true; + pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT; + pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; break; + case VK_ACCESS_SHADER_READ_BIT: case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT: - invalidate_cmd.TextureCacheInvalidationEnable = true; - break; case VK_ACCESS_TRANSFER_READ_BIT: - invalidate_cmd.TextureCacheInvalidationEnable = true; + pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; break; default: - unreachable("should've masked this out by now"); + break; /* Nothing to do */ } } - if (dst_flags) { - dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length)); - GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &invalidate_cmd); - } + cmd_buffer->state.pending_pipe_bits |= pipe_bits; } static void @@ -511,6 +532,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) gen7_cmd_buffer_emit_scissor(cmd_buffer); genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } static void -- 2.30.2