anv/cmd_buffer: Only emit PIPE_CONTROL on-demand
authorJason Ekstrand <jason.ekstrand@intel.com>
Fri, 20 May 2016 18:49:12 +0000 (11:49 -0700)
committerJason Ekstrand <jason.ekstrand@intel.com>
Fri, 27 May 2016 22:18:09 +0000 (15:18 -0700)
This is in contrast to emitting it directly in vkCmdPipelineBarrier.  This
has a couple of advantages.  First, it means that no matter how many
vkCmdPipelineBarrier calls the application strings together it gets one or
two PIPE_CONTROLs.  Second, it allow us to better track when we need to do
stalls because we can flag when a flush has happened and we need a stall.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
src/intel/vulkan/anv_cmd_buffer.c
src/intel/vulkan/anv_genX.h
src/intel/vulkan/anv_private.h
src/intel/vulkan/gen7_cmd_buffer.c
src/intel/vulkan/gen8_cmd_buffer.c
src/intel/vulkan/genX_cmd_buffer.c

index db7793ee462d6e24bcdc61edcd68c0dd86b3fbf4..25ab9532ae17adf97e79b10f3bb232637b9ea985 100644 (file)
@@ -127,6 +127,7 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
 
    state->dirty = 0;
    state->vb_dirty = 0;
+   state->pending_pipe_bits = 0;
    state->descriptors_dirty = 0;
    state->push_constants_dirty = 0;
    state->pipeline = NULL;
index a5ec27da2bdbdb0ad903fe2c7fdfdf798bc6324a..cf5a232c218d02a687b47feb5bbeec54c71cd080 100644 (file)
@@ -39,6 +39,8 @@ genX(cmd_buffer_alloc_null_surface_state)(struct anv_cmd_buffer *cmd_buffer,
 void genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
                                   struct anv_subpass *subpass);
 
+void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
+
 void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
 void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
 
index f6ce31da1c4643858c70ac2838ed18b43e0766b5..b3774a88cb2aa9b70477a8836bd6a3e95d89a7d4 100644 (file)
@@ -1066,6 +1066,45 @@ enum anv_cmd_dirty_bits {
 };
 typedef uint32_t anv_cmd_dirty_mask_t;
 
+enum anv_pipe_bits {
+   ANV_PIPE_DEPTH_CACHE_FLUSH_BIT            = (1 << 0),
+   ANV_PIPE_STALL_AT_SCOREBOARD_BIT          = (1 << 1),
+   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT       = (1 << 2),
+   ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT    = (1 << 3),
+   ANV_PIPE_VF_CACHE_INVALIDATE_BIT          = (1 << 4),
+   ANV_PIPE_DATA_CACHE_FLUSH_BIT             = (1 << 5),
+   ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT     = (1 << 10),
+   ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT = (1 << 11),
+   ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT    = (1 << 12),
+   ANV_PIPE_DEPTH_STALL_BIT                  = (1 << 13),
+   ANV_PIPE_CS_STALL_BIT                     = (1 << 20),
+
+   /* This bit does not exist directly in PIPE_CONTROL.  Instead it means that
+    * a flush has happened but not a CS stall.  The next time we do any sort
+    * of invalidation we need to insert a CS stall at that time.  Otherwise,
+    * we would have to CS stall on every flush which could be bad.
+    */
+   ANV_PIPE_NEEDS_CS_STALL_BIT               = (1 << 21),
+};
+
+#define ANV_PIPE_FLUSH_BITS ( \
+   ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \
+   ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
+   ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
+
+#define ANV_PIPE_STALL_BITS ( \
+   ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \
+   ANV_PIPE_DEPTH_STALL_BIT | \
+   ANV_PIPE_CS_STALL_BIT)
+
+#define ANV_PIPE_INVALIDATE_BITS ( \
+   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT | \
+   ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT | \
+   ANV_PIPE_VF_CACHE_INVALIDATE_BIT | \
+   ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
+   ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | \
+   ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT)
+
 struct anv_vertex_binding {
    struct anv_buffer *                          buffer;
    VkDeviceSize                                 offset;
@@ -1164,6 +1203,7 @@ struct anv_cmd_state {
    uint32_t                                     vb_dirty;
    anv_cmd_dirty_mask_t                         dirty;
    anv_cmd_dirty_mask_t                         compute_dirty;
+   enum anv_pipe_bits                           pending_pipe_bits;
    uint32_t                                     num_workgroups_offset;
    struct anv_bo                                *num_workgroups_bo;
    VkShaderStageFlags                           descriptors_dirty;
index 7b55133fedea12b961d498d207091972f75f6b3d..331275e5a8850b11ffdd778aa29683fa1fb04668 100644 (file)
@@ -307,6 +307,8 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
    }
 
    cmd_buffer->state.compute_dirty = 0;
+
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 }
 
 void
index 065cf9e76384cc49b5cebd67e1b4ebdb32faaef1..547fedd1eab59912af16a6de99c8f0f9f56dab73 100644 (file)
@@ -391,6 +391,8 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
    }
 
    cmd_buffer->state.compute_dirty = 0;
+
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 }
 
 void genX(CmdSetEvent)(
index 9f61a10c3efa4019b357671da94cc0a833df3d7e..64172ca377bdfb6c31ab184d515c544e21916f1d 100644 (file)
@@ -136,6 +136,82 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
    }
 }
 
+void
+genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
+{
+   enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
+
+   /* Flushes are pipelined while invalidations are handled immediately.
+    * Therefore, if we're flushing anything then we need to schedule a stall
+    * before any invalidations can happen.
+    */
+   if (bits & ANV_PIPE_FLUSH_BITS)
+      bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
+
+   /* If we're going to do an invalidate and we have a pending CS stall that
+    * has yet to be resolved, we do the CS stall now.
+    */
+   if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
+       (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
+      bits |= ANV_PIPE_CS_STALL_BIT;
+      bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
+   }
+
+   if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
+         pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+         pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+         pipe.RenderTargetCacheFlushEnable =
+            bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+
+         pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
+         pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
+         pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
+
+         /*
+          * According to the Broadwell documentation, any PIPE_CONTROL with the
+          * "Command Streamer Stall" bit set must also have another bit set,
+          * with five different options:
+          *
+          *  - Render Target Cache Flush
+          *  - Depth Cache Flush
+          *  - Stall at Pixel Scoreboard
+          *  - Post-Sync Operation
+          *  - Depth Stall
+          *  - DC Flush Enable
+          *
+          * I chose "Stall at Pixel Scoreboard" since that's what we use in
+          * mesa and it seems to work fine. The choice is fairly arbitrary.
+          */
+         if ((bits & ANV_PIPE_CS_STALL_BIT) &&
+             !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
+                       ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
+            pipe.StallAtPixelScoreboard = true;
+      }
+
+      bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
+   }
+
+   if (bits & ANV_PIPE_INVALIDATE_BITS) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
+         pipe.StateCacheInvalidationEnable =
+            bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
+         pipe.ConstantCacheInvalidationEnable =
+            bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
+         pipe.VFCacheInvalidationEnable =
+            bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+         pipe.TextureCacheInvalidationEnable =
+            bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
+         pipe.InstructionCacheInvalidateEnable =
+            bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
+      }
+
+      bits &= ~ANV_PIPE_INVALIDATE_BITS;
+   }
+
+   cmd_buffer->state.pending_pipe_bits = bits;
+}
+
 void genX(CmdPipelineBarrier)(
     VkCommandBuffer                             commandBuffer,
     VkPipelineStageFlags                        srcStageMask,
@@ -149,7 +225,7 @@ void genX(CmdPipelineBarrier)(
     const VkImageMemoryBarrier*                 pImageMemoryBarriers)
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   uint32_t b, *dw;
+   uint32_t b;
 
    /* XXX: Right now, we're really dumb and just flush whatever categories
     * the app asks for.  One of these days we may make this a bit better
@@ -173,105 +249,50 @@ void genX(CmdPipelineBarrier)(
       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
    }
 
-   /* Mask out the Source access flags we care about */
-   const uint32_t src_mask =
-      VK_ACCESS_SHADER_WRITE_BIT |
-      VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
-      VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
-      VK_ACCESS_TRANSFER_WRITE_BIT;
-
-   src_flags = src_flags & src_mask;
-
-   /* Mask out the destination access flags we care about */
-   const uint32_t dst_mask =
-      VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
-      VK_ACCESS_INDEX_READ_BIT |
-      VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
-      VK_ACCESS_UNIFORM_READ_BIT |
-      VK_ACCESS_SHADER_READ_BIT |
-      VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
-      VK_ACCESS_TRANSFER_READ_BIT;
-
-   dst_flags = dst_flags & dst_mask;
-
-   /* The src flags represent how things were used previously.  This is
-    * what we use for doing flushes.
-    */
-   struct GENX(PIPE_CONTROL) flush_cmd = {
-      GENX(PIPE_CONTROL_header),
-      .PostSyncOperation = NoWrite,
-   };
+   enum anv_pipe_bits pipe_bits = 0;
 
    for_each_bit(b, src_flags) {
       switch ((VkAccessFlagBits)(1 << b)) {
       case VK_ACCESS_SHADER_WRITE_BIT:
-         flush_cmd.DCFlushEnable = true;
+         pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
          break;
       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
-         flush_cmd.RenderTargetCacheFlushEnable = true;
+         pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
          break;
       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
-         flush_cmd.DepthCacheFlushEnable = true;
+         pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
          break;
       case VK_ACCESS_TRANSFER_WRITE_BIT:
-         flush_cmd.RenderTargetCacheFlushEnable = true;
-         flush_cmd.DepthCacheFlushEnable = true;
+         pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+         pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
          break;
       default:
-         unreachable("should've masked this out by now");
+         break; /* Nothing to do */
       }
    }
 
-   /* If we end up doing two PIPE_CONTROLs, the first, flusing one also has to
-    * stall and wait for the flushing to finish, so we don't re-dirty the
-    * caches with in-flight rendering after the second PIPE_CONTROL
-    * invalidates.
-    */
-
-   if (dst_flags)
-      flush_cmd.CommandStreamerStallEnable = true;
-
-   if (src_flags && dst_flags) {
-      dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
-      GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &flush_cmd);
-   }
-
-   /* The dst flags represent how things will be used in the future.  This
-    * is what we use for doing cache invalidations.
-    */
-   struct GENX(PIPE_CONTROL) invalidate_cmd = {
-      GENX(PIPE_CONTROL_header),
-      .PostSyncOperation = NoWrite,
-   };
-
    for_each_bit(b, dst_flags) {
       switch ((VkAccessFlagBits)(1 << b)) {
       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
       case VK_ACCESS_INDEX_READ_BIT:
       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
-         invalidate_cmd.VFCacheInvalidationEnable = true;
+         pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
          break;
       case VK_ACCESS_UNIFORM_READ_BIT:
-         invalidate_cmd.ConstantCacheInvalidationEnable = true;
-         /* fallthrough */
-      case VK_ACCESS_SHADER_READ_BIT:
-         invalidate_cmd.TextureCacheInvalidationEnable = true;
+         pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
+         pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
          break;
+      case VK_ACCESS_SHADER_READ_BIT:
       case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
-         invalidate_cmd.TextureCacheInvalidationEnable = true;
-         break;
       case VK_ACCESS_TRANSFER_READ_BIT:
-         invalidate_cmd.TextureCacheInvalidationEnable = true;
+         pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
          break;
       default:
-         unreachable("should've masked this out by now");
+         break; /* Nothing to do */
       }
    }
 
-   if (dst_flags) {
-      dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
-      GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &invalidate_cmd);
-   }
+   cmd_buffer->state.pending_pipe_bits |= pipe_bits;
 }
 
 static void
@@ -511,6 +532,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
       gen7_cmd_buffer_emit_scissor(cmd_buffer);
 
    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
+
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 }
 
 static void