anv: Add support for the PMA fix on Broadwell
author Jason Ekstrand <jason.ekstrand@intel.com>
Wed, 7 Dec 2016 01:52:14 +0000 (17:52 -0800)
committer Jason Ekstrand <jason.ekstrand@intel.com>
Tue, 14 Feb 2017 22:18:55 +0000 (14:18 -0800)
This helps Dota 2 on Broadwell by 8-9%.  I also hacked up the driver and
used the Sascha "shadowmapping" demo to get some results.  Setting
uses_kill to true dropped the framerate on the demo by 25-30%.  Enabling
the PMA fix brought it back up to around 90% of the original framerate.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Nanley Chery <nanley.g.chery@intel.com>
src/intel/vulkan/TODO
src/intel/vulkan/anv_cmd_buffer.c
src/intel/vulkan/anv_genX.h
src/intel/vulkan/anv_private.h
src/intel/vulkan/gen7_cmd_buffer.c
src/intel/vulkan/gen8_cmd_buffer.c
src/intel/vulkan/genX_blorp_exec.c
src/intel/vulkan/genX_cmd_buffer.c
src/intel/vulkan/genX_pipeline.c

index 38acc0dd5b6cac76f012906cd4971b66ff04aea3..f8b73a113154974b99e7255e7d1f89afcdccbe13 100644 (file)
@@ -12,5 +12,4 @@ Performance:
  - Compressed multisample support
  - Pushing pieces of UBOs?
  - Enable guardband clipping
- - pma stall workaround
  - Use soft-pin to avoid relocations
index 5886fa63e69248b960047c765152660b676e4926..8c08f8d48e6ec79dc86c233676ccb07e9e647d2e 100644 (file)
@@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
    state->restart_index = UINT32_MAX;
    state->dynamic = default_dynamic_state;
    state->need_query_wa = true;
+   state->pma_fix_enabled = false;
+   state->hiz_enabled = false;
 
    if (state->attachments != NULL) {
       vk_free(&cmd_buffer->pool->alloc, state->attachments);
index d04fe38a51724c20bf3887a9131aacfe56fbbdcb..67147b0e92b74818bec03b4b768a9dd6132d17b0 100644 (file)
@@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer);
 
+void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
+                                     bool enable);
+
 void
 genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
                      const struct gen_l3_config *l3_config,
index 89ae8183799a733f5d4d6f43c77f8c7fddaa546a..449aa0f4ab652b0806c34015976bd2abebce0e08 100644 (file)
@@ -1168,6 +1168,20 @@ struct anv_cmd_state {
    struct anv_dynamic_state                     dynamic;
    bool                                         need_query_wa;
 
+   /**
+    * Whether or not the gen8 PMA fix is enabled.  We ensure that, at the top
+    * of any command buffer it is disabled by disabling it in EndCommandBuffer
+    * and before invoking the secondary in ExecuteCommands.
+    */
+   bool                                         pma_fix_enabled;
+
+   /**
+    * Whether or not we know for certain that HiZ is enabled for the current
+    * subpass.  If, for whatever reason, we are unsure as to whether HiZ is
+    * enabled or not, this will be false.
+    */
+   bool                                         hiz_enabled;
+
    /**
     * Array length is anv_cmd_state::pass::attachment_count. Array content is
     * valid only when recording a render pass instance.
@@ -1471,8 +1485,11 @@ struct anv_pipeline {
 
    uint32_t                                     cs_right_mask;
 
+   bool                                         writes_depth;
+   bool                                         depth_test_enable;
    bool                                         writes_stencil;
    bool                                         depth_clamp_enable;
+   bool                                         kill_pixel;
 
    struct {
       uint32_t                                  sf[7];
index 013ed8718a4c3f3f75948230cfd18b7706a6c62e..c1a25e8ce2608ecba3a32a8a0b159d14ad2f1ccf 100644 (file)
@@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
    cmd_buffer->state.dirty = 0;
 }
 
+void
+genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
+                                bool enable)
+{
+   /* The NP PMA fix doesn't exist on gen7, so this is an intentionally empty
+    * stub: both arguments are ignored and nothing is emitted into the batch.
+    * Presumably it exists so that code shared between gens can call
+    * genX(cmd_buffer_enable_pma_fix) unconditionally.
+    */
+}
+
 void genX(CmdSetEvent)(
     VkCommandBuffer                             commandBuffer,
     VkEvent                                     event,
index 8c8de622eb8634e98a983a0bf636085c697f744d..0628f3a0dd1539164db5cd67382c9efe68c33145 100644 (file)
@@ -154,6 +154,133 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
 
 #endif
 
+void
+genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
+{ /* Turn the gen8 NP PMA fix on or off by loading CACHE_MODE_1 from the
+   * batch, with a PIPE_CONTROL emitted on either side of the register write.
+   * The requested state is cached in cmd_buffer->state.pma_fix_enabled so
+   * that redundant calls emit nothing.  When GEN_GEN != 8 the whole body is
+   * compiled out and the function does nothing.
+   *
+   * NOTE(review): the two *Mask fields presumably select which CACHE_MODE_1
+   * bits the write is allowed to update -- confirm against the register
+   * definition.
+   */
+#if GEN_GEN == 8
+   if (cmd_buffer->state.pma_fix_enabled == enable)
+      return; /* already in the requested state */
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      pc.DepthCacheFlushEnable = true;
+      pc.CommandStreamerStallEnable = true;
+      pc.RenderTargetCacheFlushEnable = true;
+   }
+
+   uint32_t cache_mode;
+   anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
+                   .NPPMAFixEnable = enable,
+                   .NPEarlyZFailsDisable = enable,
+                   .NPPMAFixEnableMask = true,
+                   .NPEarlyZFailsDisableMask = true);
+   anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+      lri.RegisterOffset   = GENX(CACHE_MODE_1_num);
+      lri.DataDWord        = cache_mode;
+   }
+
+   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
+    * Flush bits is often necessary.  We do it regardless because it's easier.
+    * The render cache flush is also necessary if stencil writes are enabled;
+    * it is likewise emitted unconditionally here.
+    */
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      pc.DepthStallEnable = true;
+      pc.DepthCacheFlushEnable = true;
+      pc.RenderTargetCacheFlushEnable = true;
+   }
+
+   cmd_buffer->state.pma_fix_enabled = enable; /* feeds the early-out above */
+#endif /* GEN_GEN == 8 */
+}
+
+static inline bool
+want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
+{ /* Decide whether the gen8 NP PMA fix should be enabled for the pipeline
+   * currently bound to cmd_buffer.  Returns true only when every term of
+   * the PRM expression quoted below that this driver can vary is satisfied;
+   * each early "return false" corresponds to one term of that expression.
+   * Reads cmd_buffer->state.hiz_enabled and cmd_buffer->state.pipeline and
+   * has no side effects.
+   */
+   assert(GEN_GEN == 8);
+
+   /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
+    *
+    *    SW must set this bit in order to enable this fix when following
+    *    expression is TRUE.
+    *
+    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
+    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
+    *    (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
+    *    (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
+    *    !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
+    *    (3DSTATE_PS_EXTRA::PixelShaderValid) &&
+    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+    *      3DSTATE_WM_HZ_OP::StencilBufferClear) &&
+    *    (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
+    *    (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+    *       3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+    *       3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+    *       3DSTATE_PS_BLEND::AlphaTestEnable ||
+    *       3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
+    *      3DSTATE_WM::ForceKillPix != ForceOff &&
+    *      ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
+    *        3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
+    *       (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
+    *        3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
+    *        3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
+    *     (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
+    */
+
+   /* These are always true:
+    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
+    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
+    */
+
+   /* We only enable the PMA fix if we know for certain that HiZ is enabled.
+    * If we don't know whether HiZ is enabled or not, we disable the PMA fix
+    * and there is no harm.
+    *
+    * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
+    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
+    */
+   if (!cmd_buffer->state.hiz_enabled)
+      return false;
+
+   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
+      return false;
+
+   /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
+   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+   if (wm_prog_data->early_fragment_tests)
+      return false;
+
+   /* We never use anv_pipeline for HiZ ops so this is trivially true:
+    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
+    */
+
+   /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
+   if (!pipeline->depth_test_enable)
+      return false;
+
+   /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+    *    3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+    *    3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+    *    3DSTATE_PS_BLEND::AlphaTestEnable ||
+    *    3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
+    *   3DSTATE_WM::ForceKillPix != ForceOff &&
+    *   ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
+    *     3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
+    *    (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
+    *     3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
+    *     3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
+    *  (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
+    *
+    * The first group of terms is folded into pipeline->kill_pixel and the
+    * write-enable terms into pipeline->writes_depth / writes_stencil.
+    *
+    * NOTE(review): 3DSTATE_WM::ForceKillPix and the 3DSTATE_DEPTH_BUFFER /
+    * 3DSTATE_STENCIL_BUFFER enables are not tested explicitly here;
+    * presumably the pipeline-level flags are considered sufficient -- confirm.
+    */
+   return (pipeline->kill_pixel && (pipeline->writes_depth ||
+                                    pipeline->writes_stencil)) ||
+          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
+}
+
 void
 genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
 {
@@ -211,6 +338,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
    }
 
    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_RENDER_TARGETS |
                                   ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
                                   ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) {
       uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
@@ -234,6 +362,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
 
       anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
                            pipeline->gen8.wm_depth_stencil);
+
+      genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
+                                      want_depth_pma_fix(cmd_buffer));
    }
 #else
    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
index 663e6c92819f6da01ccd3c2822cfec42236d0c95..6f0b063897320b687a560688026814dd84eec26a 100644 (file)
@@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch,
 
    genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
 
+   /* BLORP doesn't do anything fancy with depth such as discards, so we want
+    * the PMA fix off.  Also, off is always the safe option.
+    */
+   genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
+
    blorp_exec(batch, params);
 
    cmd_buffer->state.vb_dirty = ~0;
index 14338b22ecea3364aa21dfe10ec2b1196d6bfe43..40a72f4d1416226e3ae994e110ced1dc46791ba4 100644 (file)
@@ -637,6 +637,11 @@ genX(EndCommandBuffer)(
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 
+   /* We want every command buffer to start with the PMA fix in a known state,
+    * so we disable it at the end of the command buffer.
+    */
+   genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
+
    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 
    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
@@ -654,6 +659,11 @@ genX(CmdExecuteCommands)(
 
    assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
+   /* The secondary command buffers will assume that the PMA fix is disabled
+    * when they begin executing.  Make sure this is true.
+    */
+   genX(cmd_buffer_enable_pma_fix)(primary, false);
+
    for (uint32_t i = 0; i < commandBufferCount; i++) {
       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
 
@@ -2227,7 +2237,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
    const bool has_stencil =
       image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
 
-   /* FIXME: Implement the PMA stall W/A */
+   cmd_buffer->state.hiz_enabled = has_hiz;
+
    /* FIXME: Width and Height are wrong */
 
    genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
@@ -2465,6 +2476,8 @@ void genX(CmdEndRenderPass)(
 
    anv_cmd_buffer_resolve_subpass(cmd_buffer);
 
+   cmd_buffer->state.hiz_enabled = false;
+
 #ifndef NDEBUG
    anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
 #endif
index f641661db1b56935071061dc34784af1c193740d..d0dbe13f393386728cd66cec2388e66c10d862f6 100644 (file)
@@ -455,6 +455,10 @@ emit_rs_state(struct anv_pipeline *pipeline,
     */
 #if GEN_GEN >= 8
    raster.DXMultisampleRasterizationEnable = true;
+   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
+    * computations.  If we ever set this bit to a different value, they will
+    * need to be updated accordingly.
+    */
    raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
    raster.ForceMultisampling = false;
 #else
@@ -664,6 +668,8 @@ emit_ds_state(struct anv_pipeline *pipeline,
        * to make sure it's initialized to something useful.
        */
       pipeline->writes_stencil = false;
+      pipeline->writes_depth = false;
+      pipeline->depth_test_enable = false;
       memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
       return;
    }
@@ -722,6 +728,9 @@ emit_ds_state(struct anv_pipeline *pipeline,
    if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL)
       depth_stencil.DepthBufferWriteEnable = false;
 
+   pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable;
+   pipeline->depth_test_enable = depth_stencil.DepthTestEnable;
+
 #if GEN_GEN <= 7
    GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
 #else
@@ -1443,6 +1452,38 @@ emit_3dstate_vf_topology(struct anv_pipeline *pipeline)
 }
 #endif
 
+static void
+compute_kill_pixel(struct anv_pipeline *pipeline,
+                   const VkPipelineMultisampleStateCreateInfo *ms_info,
+                   const struct anv_subpass *subpass)
+{ /* Compute and store pipeline->kill_pixel, the "may discard pixels" term
+   * used when deciding whether to enable the gen8 PMA fix.  ms_info may be
+   * NULL, in which case alpha-to-coverage is treated as disabled.
+   */
+   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
+      pipeline->kill_pixel = false;
+      return; /* no fragment stage: report no kill */
+   }
+
+   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+
+   /* This computes the KillPixel portion of the computation for whether or
+    * not we want to enable the PMA fix on gen8.  It's given by this chunk of
+    * the giant formula:
+    *
+    *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+    *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+    *     3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+    *     3DSTATE_PS_BLEND::AlphaTestEnable ||
+    *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
+    *
+    * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
+    * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
+    * of an alpha test.
+    *
+    * NOTE(review): subpass->has_ds_self_dep is OR'ed in below although it is
+    * not part of the quoted formula; presumably a depth/stencil
+    * self-dependency is conservatively treated as killing pixels -- confirm.
+    */
+   pipeline->kill_pixel =
+      subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
+      wm_prog_data->uses_omask ||
+      (ms_info && ms_info->alphaToCoverageEnable);
+}
+
 static VkResult
 genX(graphics_pipeline_create)(
     VkDevice                                    _device,
@@ -1480,6 +1521,7 @@ genX(graphics_pipeline_create)(
    emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
    emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
                            pCreateInfo->pMultisampleState);
+   compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass);
 
    emit_urb_setup(pipeline);