i965: Implement the PMA stall fix.
authorKenneth Graunke <kenneth@whitecape.org>
Wed, 22 Oct 2014 15:58:58 +0000 (08:58 -0700)
committerKenneth Graunke <kenneth@whitecape.org>
Tue, 4 Nov 2014 19:38:01 +0000 (11:38 -0800)
Certain non-promoted depth cases typically incur stalls.  In very
specific cases, we can enable a workaround which improves performance.

Improves performance in GLBenchmark 2.7 TRex by 1.17762% +/- 0.448765%
(n=75) at 1280x720 on Broadwell GT3.

Haswell has this feature as well, but we can't currently write registers
from userspace batches (and we'd incur additional software batch
scanning overhead as well), so we haven't enabled it.  Broadwell allows
us to write CACHE_MODE_1.  Backporters beware: the formula and flushing
incantation differs between Haswell and Broadwell.

v2: Move pma_stall_bits from brw->state to brw itself (requested by
    Kristian Høgsberg).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_state.h
src/mesa/drivers/dri/i965/brw_state_upload.c
src/mesa/drivers/dri/i965/gen8_depth_state.c

index eb37e750f6db9762d65ec7ba8a36882c92c59a25..656cbe8ce84d20350c73739ffa83775cf3aec859 100644 (file)
@@ -1092,6 +1092,9 @@ struct brw_context
    /* Whether the last depth/stencil packets were both NULL. */
    bool no_depth_or_stencil;
 
+   /* The last PMA stall bits programmed. */
+   uint32_t pma_stall_bits;
+
    struct {
       /** Does the current draw use the index buffer? */
       bool indexed;
index 2efe56e4c73fd3dd61e0969bb3723b5a7f2ea074..209fab10c85a992ef6a7a4fd558613b8da25dedf 100644 (file)
@@ -137,6 +137,7 @@ extern const struct brw_tracked_state gen8_disable_stages;
 extern const struct brw_tracked_state gen8_gs_state;
 extern const struct brw_tracked_state gen8_index_buffer;
 extern const struct brw_tracked_state gen8_multisample_state;
+extern const struct brw_tracked_state gen8_pma_fix;
 extern const struct brw_tracked_state gen8_ps_blend;
 extern const struct brw_tracked_state gen8_ps_extra;
 extern const struct brw_tracked_state gen8_ps_state;
index a69131958489ebaaa2c1cf166372aa3b41b49220..1c53e5b7d7ddbf8babbb0f83437589f7195cf4b0 100644 (file)
@@ -333,6 +333,7 @@ static const struct brw_tracked_state *gen8_atoms[] =
    &gen8_vertices,
 
    &haswell_cut_index,
+   &gen8_pma_fix,
 };
 
 static void
@@ -390,6 +391,11 @@ void brw_init_state( struct brw_context *brw )
    brw->state.dirty.mesa = ~0;
    brw->state.dirty.brw = ~0ull;
 
+   /* ~0 is a nonsensical value which won't match anything we program, so
+    * the programming will take effect on the first time around.
+    */
+   brw->pma_stall_bits = ~0;
+
    /* Make sure that brw->state.dirty.brw has enough bits to hold all possible
     * dirty flags.
     */
index a24032c343cf09eeddaa6d33d7860e409a70b4b6..e716141e62b892f9e8a3684e1add25d6c738dfed 100644 (file)
@@ -28,6 +28,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_wm.h"
 
 /**
  * Helper function to emit depth related command packets.
@@ -211,6 +212,172 @@ gen8_emit_depth_stencil_hiz(struct brw_context *brw,
                       hiz, width, height, depth, lod, min_array_element);
 }
 
+/**
+ * Should we set the PMA FIX ENABLE bit?
+ *
+ * To avoid unnecessary depth related stalls, we need to set this bit.
+ * However, there is a very complicated formula which governs when it
+ * is legal to do so.  This function computes that.
+ *
+ * See the documenation for the CACHE_MODE_1 register, bit 11.
+ */
+static bool
+pma_fix_enable(const struct brw_context *brw)
+{
+   const struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   const struct gl_fragment_program *fp = brw->fragment_program;
+   /* _NEW_BUFFERS */
+   struct intel_renderbuffer *depth_irb =
+      intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
+
+   /* 3DSTATE_WM::ForceThreadDispatch is never used. */
+   const bool wm_force_thread_dispatch = false;
+
+   /* 3DSTATE_RASTER::ForceSampleCount is never used. */
+   const bool raster_force_sample_count_nonzero = false;
+
+   /* _NEW_BUFFERS:
+    * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
+    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
+    */
+   const bool hiz_enabled = depth_irb && intel_renderbuffer_has_hiz(depth_irb);
+
+   /* 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2).
+    * We always leave this set to EDSC_NORMAL (0).
+    */
+   const bool edsc_not_preps = true;
+
+   /* 3DSTATE_PS_EXTRA::PixelShaderValid is always true. */
+   const bool pixel_shader_valid = true;
+
+   /* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+    *   3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+    *   3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+    *   3DSTATE_WM_HZ_OP::StencilBufferClear)
+    *
+    * HiZ operations are done outside of the normal state upload, so they're
+    * definitely not happening now.
+    */
+   const bool in_hiz_op = false;
+
+   /* _NEW_DEPTH:
+    * DEPTH_STENCIL_STATE::DepthTestEnable
+    */
+   const bool depth_test_enabled = depth_irb && ctx->Depth.Test;
+
+   /* _NEW_DEPTH:
+    * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
+    * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE.
+    */
+   const bool depth_writes_enabled = ctx->Depth.Mask;
+
+   /* _NEW_STENCIL:
+    * !DEPTH_STENCIL_STATE::Stencil Buffer Write Enable ||
+    * !3DSTATE_DEPTH_BUFFER::Stencil Buffer Enable ||
+    * !3DSTATE_STENCIL_BUFFER::Stencil Buffer Enable
+    */
+   const bool stencil_writes_enabled = ctx->Stencil._WriteEnabled;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM:
+    * 3DSTATE_PS_EXTRA::Pixel Shader Computed Depth Mode == PSCDEPTH_OFF
+    */
+   const bool ps_computes_depth =
+      (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
+      fp->FragDepthLayout != FRAG_DEPTH_LAYOUT_UNCHANGED;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM: 3DSTATE_PS_EXTRA::PixelShaderKillsPixels
+    * CACHE_NEW_WM_PROG:        3DSTATE_PS_EXTRA::oMask Present to RenderTarget
+    * _NEW_MULTISAMPLE:         3DSTATE_PS_BLEND::AlphaToCoverageEnable
+    * _NEW_COLOR:               3DSTATE_PS_BLEND::AlphaTestEnable
+    *
+    * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false.
+    * 3DSTATE_WM::ForceKillPix != ForceOff is always true.
+    */
+   const bool kill_pixel =
+      fp->UsesKill ||
+      brw->wm.prog_data->uses_omask ||
+      (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage) ||
+      ctx->Color.AlphaEnabled;
+
+   /* The big formula in CACHE_MODE_1::NP PMA FIX ENABLE. */
+   return !wm_force_thread_dispatch &&
+          !raster_force_sample_count_nonzero &&
+          hiz_enabled &&
+          edsc_not_preps &&
+          pixel_shader_valid &&
+          !in_hiz_op &&
+          depth_test_enabled &&
+          (ps_computes_depth ||
+           (kill_pixel && (depth_writes_enabled || stencil_writes_enabled)));
+}
+
+static void
+write_pma_stall_bits(struct brw_context *brw, uint32_t pma_stall_bits)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* If we haven't actually changed the value, bail now to avoid unnecessary
+    * pipeline stalls and register writes.
+    */
+   if (brw->pma_stall_bits == pma_stall_bits)
+      return;
+
+   brw->pma_stall_bits = pma_stall_bits;
+
+   /* According to the PIPE_CONTROL documentation, software should emit a
+    * PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set prior
+    * to the LRI.  If stencil buffer writes are enabled, then a Render Cache
+    * Flush is also necessary.
+    */
+   const uint32_t render_cache_flush =
+      ctx->Stencil._WriteEnabled ? PIPE_CONTROL_WRITE_FLUSH : 0;
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_CS_STALL |
+                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                               render_cache_flush);
+
+   /* CACHE_MODE_1 is a non-privileged register. */
+   BEGIN_BATCH(3);
+   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
+   OUT_BATCH(GEN7_CACHE_MODE_1);
+   OUT_BATCH(GEN8_HIZ_PMA_MASK_BITS | pma_stall_bits);
+   ADVANCE_BATCH();
+
+   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
+    * Flush bits is often necessary.  We do it regardless because it's easier.
+    * The render cache flush is also necessary if stencil writes are enabled.
+    */
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_DEPTH_STALL |
+                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                               render_cache_flush);
+
+}
+
+static void
+gen8_emit_pma_stall_workaround(struct brw_context *brw)
+{
+   uint32_t bits = 0;
+   if (pma_fix_enable(brw))
+      bits |= GEN8_HIZ_NP_PMA_FIX_ENABLE | GEN8_HIZ_NP_EARLY_Z_FAILS_DISABLE;
+
+   write_pma_stall_bits(brw, bits);
+}
+
+const struct brw_tracked_state gen8_pma_fix = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS |
+              _NEW_COLOR |
+              _NEW_DEPTH |
+              _NEW_MULTISAMPLE |
+              _NEW_STENCIL,
+      .brw = BRW_NEW_FRAGMENT_PROGRAM,
+      .cache = CACHE_NEW_WM_PROG,
+   },
+   .emit = gen8_emit_pma_stall_workaround
+};
+
 /**
  * Emit packets to perform a depth/HiZ resolve or fast depth/stencil clear.
  *
@@ -224,6 +391,9 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
    if (op == GEN6_HIZ_OP_NONE)
       return;
 
+   /* Disable the PMA stall fix since we're about to do a HiZ operation. */
+   write_pma_stall_bits(brw, 0);
+
    assert(mt->first_level == 0);
    assert(mt->logical_depth0 >= 1);