iris: Implement the Broadwell NP Z PMA Stall Fix
author: Kenneth Graunke <kenneth@whitecape.org>
Wed, 25 Sep 2019 07:31:07 +0000 (00:31 -0700)
committer: Kenneth Graunke <kenneth@whitecape.org>
Wed, 9 Oct 2019 04:53:12 +0000 (21:53 -0700)
This should help avoid stalls in the pixel mask array in certain
non-promoted depth cases.  It especially helps for Z16, as each bit
in the PMA corresponds to two pixels when using Z16, as opposed to
the usual one pixel.

Improves performance in GFXBench5 TRex by 22% (n=1).

src/gallium/drivers/iris/iris_blorp.c
src/gallium/drivers/iris/iris_context.h
src/gallium/drivers/iris/iris_genx_protos.h
src/gallium/drivers/iris/iris_program.c
src/gallium/drivers/iris/iris_state.c

index fa46dafcda2fc2b376d2d554394213eabb399b43..d6b99c59ca7b6fbb9ac25d699e7e7beae2bb7c62 100644 (file)
@@ -307,6 +307,10 @@ iris_blorp_exec(struct blorp_batch *blorp_batch,
 
    iris_require_command_space(batch, 1400);
 
+#if GEN_GEN == 8
+   genX(update_pma_fix)(ice, batch, false);
+#endif
+
    const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
    if (ice->state.current_hash_scale != scale) {
       genX(emit_hashing_mode)(ice, batch, params->x1 - params->x0,
index 85ffd1fece41b22347787352b2e9de7ca6831e0f..74a66f4a5cf18f95ad4609ac40f85965bc91068d 100644 (file)
@@ -134,6 +134,7 @@ enum {
 #define IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES  (1ull << 55)
 #define IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES (1ull << 56)
 #define IRIS_DIRTY_VF_STATISTICS            (1ull << 57)
+#define IRIS_DIRTY_PMA_FIX                  (1ull << 58)
 
 #define IRIS_ALL_DIRTY_FOR_COMPUTE (IRIS_DIRTY_CS | \
                                     IRIS_DIRTY_SAMPLER_STATES_CS | \
index 16da78d7e9f1d8ade700b7ac9f52fcc666ae5d14..84d4b4b324c957a91050b3b1608a014d877a7a19 100644 (file)
@@ -37,6 +37,9 @@ void genX(emit_hashing_mode)(struct iris_context *ice,
                              struct iris_batch *batch,
                              unsigned width, unsigned height,
                              unsigned scale);
+void genX(update_pma_fix)(struct iris_context *ice,
+                          struct iris_batch *batch,
+                          bool enable);
 
 /* iris_blorp.c */
 void genX(init_blorp)(struct iris_context *ice);
index ae701ec984d73ce55d09c5f75f00476c21d56923..886cdff56b665452b754bcbc1b45ab0bac3c292d 100644 (file)
@@ -2387,6 +2387,8 @@ static void
 iris_bind_fs_state(struct pipe_context *ctx, void *state)
 {
    struct iris_context *ice = (struct iris_context *) ctx;
+   struct iris_screen *screen = (struct iris_screen *) ctx->screen;
+   const struct gen_device_info *devinfo = &screen->devinfo;
    struct iris_uncompiled_shader *old_ish =
       ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
    struct iris_uncompiled_shader *new_ish = state;
@@ -2401,6 +2403,9 @@ iris_bind_fs_state(struct pipe_context *ctx, void *state)
        (new_ish->nir->info.outputs_written & color_bits))
       ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
 
+   if (devinfo->gen == 8)
+      ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
+
    bind_shader_state((void *) ctx, state, MESA_SHADER_FRAGMENT);
 }
 
index 6f0aa18233964b62f98313f5ebd73516d9fb9576..ac6a5dd5fd1d31a54fa4e4b80feedc3161333bf5 100644 (file)
@@ -1055,6 +1055,10 @@ struct iris_genx_state {
 
    uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)];
 
+#if GEN_GEN == 8
+   bool pma_fix_enabled;
+#endif
+
 #if GEN_GEN == 9
    /* Is object level preemption enabled? */
    bool object_preemption;
@@ -1242,6 +1246,9 @@ iris_bind_blend_state(struct pipe_context *ctx, void *state)
    ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
    ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
    ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_BLEND];
+
+   if (GEN_GEN == 8)
+      ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
 }
 
 /**
@@ -1276,6 +1283,9 @@ struct iris_depth_stencil_alpha_state {
    /** Outbound to resolve and cache set tracking. */
    bool depth_writes_enabled;
    bool stencil_writes_enabled;
+
+   /** Outbound to Gen8-9 PMA stall equations */
+   bool depth_test_enabled;
 };
 
 /**
@@ -1295,6 +1305,7 @@ iris_create_zsa_state(struct pipe_context *ctx,
 
    cso->alpha = state->alpha;
    cso->depth_writes_enabled = state->depth.writemask;
+   cso->depth_test_enabled = state->depth.enabled;
    cso->stencil_writes_enabled =
       state->stencil[0].writemask != 0 ||
       (two_sided_stencil && state->stencil[1].writemask != 0);
@@ -1364,6 +1375,181 @@ iris_bind_zsa_state(struct pipe_context *ctx, void *state)
    ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
    ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
    ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA];
+
+   if (GEN_GEN == 8)
+      ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
+}
+
+#if GEN_GEN == 8
+static bool
+want_pma_fix(struct iris_context *ice)
+{
+   UNUSED struct iris_screen *screen = (void *) ice->ctx.screen;
+   UNUSED const struct gen_device_info *devinfo = &screen->devinfo;
+   const struct brw_wm_prog_data *wm_prog_data = (void *)
+      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
+   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+   const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
+   const struct iris_blend_state *cso_blend = ice->state.cso_blend;
+
+   /* Decide whether the Gen8 depth PMA fix should be enabled for the
+    * currently bound framebuffer, fragment shader, blend state and
+    * depth/stencil/alpha state.  Returns true when the Z_PMA_OPT equation
+    * spelled out further down holds for that state, and false otherwise.
+    *
+    * In very specific combinations of state, we can instruct Gen8-9 hardware
+    * to avoid stalling at the pixel mask array.  The state equations are
+    * documented in these places:
+    *
+    * - Gen8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
+    * - Gen9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
+    *
+    * Both equations share some common elements:
+    *
+    *    no_hiz_op =
+    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+    *         3DSTATE_WM_HZ_OP::StencilBufferClear)
+    *
+    *    killpixels =
+    *       3DSTATE_WM::ForceKillPix != ForceOff &&
+    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
+    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
+    *
+    *    (Technically the stencil PMA treats ForceKillPix differently,
+    *     but I think this is a documentation oversight, and we don't
+    *     ever use it in this way, so it doesn't matter).
+    *
+    *    common_pma_fix =
+    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
+    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
+    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
+    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
+    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
+    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
+    *       no_hiz_op
+    *
+    * These are always true:
+    *
+    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
+    *    3DSTATE_PS_EXTRA::PixelShaderValid
+    *
+    * Also, we never use the normal drawing path for HiZ ops; these are true:
+    *
+    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
+    *
+    * This happens sometimes:
+    *
+    *    3DSTATE_WM::ForceThreadDispatch != 1
+    *
+    * However, we choose to ignore it as it either agrees with the signal
+    * (dispatch was already enabled, so nothing out of the ordinary), or
+    * there are no framebuffer attachments (so no depth or HiZ anyway,
+    * meaning the PMA signal will already be disabled).
+    */
+
+   if (!cso_fb->zsbuf)
+      return false;
+
+   struct iris_resource *zres, *sres;
+   iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres);
+
+   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
+    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
+    *
+    * HiZ is checked for the miplevel the framebuffer attachment refers to.
+    * NOTE(review): presumably zres comes back NULL for a stencil-only
+    * attachment -- confirm against iris_get_depth_stencil_resources().
+    */
+   if (!zres || !iris_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
+      return false;
+
+   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
+   if (wm_prog_data->early_fragment_tests)
+      return false;
+
+   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
+    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
+    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
+    *
+    * NOTE(review): ForceKillPix and ChromaKeyKillEnable are not consulted
+    * below; presumably the driver never programs them -- confirm.
+    */
+   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
+                     cso_blend->alpha_to_coverage || cso_zsa->alpha.enabled;
+
+   /* The Gen8 depth PMA equation becomes:
+    *
+    *    depth_writes =
+    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
+    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
+    *
+    *    stencil_writes =
+    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
+    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
+    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
+    *
+    *    Z_PMA_OPT =
+    *       common_pma_fix &&
+    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
+    *       ((killpixels && (depth_writes || stencil_writes)) ||
+    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
+    *
+    * In the code below, depth_writes is taken from the bound ZSA state
+    * alone, and the buffer-side stencil terms are represented by a stencil
+    * resource (sres) being present.
+    * NOTE(review): this assumes the 3DSTATE_DEPTH_BUFFER /
+    * 3DSTATE_STENCIL_BUFFER enables follow from those conditions -- confirm.
+    */
+   if (!cso_zsa->depth_test_enabled)
+      return false;
+
+   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
+          (killpixels && (cso_zsa->depth_writes_enabled ||
+                          (sres && cso_zsa->stencil_writes_enabled)));
+}
+#endif
+
+/**
+ * Switch the Gen8 NP PMA fix on or off by programming CACHE_MODE_1.
+ *
+ * The requested state is compared against the value cached in
+ * genx->pma_fix_enabled; if they match, nothing is emitted.  Otherwise
+ * the cache is updated and a register write, bracketed by the pipe
+ * control flushes described below, is emitted into the batch.
+ *
+ * On generations other than Gen8 the body compiles to nothing.
+ */
+void
+genX(update_pma_fix)(struct iris_context *ice,
+                     struct iris_batch *batch,
+                     bool enable)
+{
+#if GEN_GEN == 8
+   struct iris_genx_state *genx = ice->state.genx;
+
+   /* Avoid the (expensive) flush + LRI sequence when nothing changes. */
+   if (genx->pma_fix_enabled == enable)
+      return;
+
+   genx->pma_fix_enabled = enable;
+
+   /* According to the Broadwell PIPE_CONTROL documentation, software should
+    * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
+    * prior to the LRI.  If stencil buffer writes are enabled, then a Render
+    * Cache Flush is also necessary.
+    *
+    * The Gen9 docs say to use a depth stall rather than a command streamer
+    * stall.  However, the hardware seems to violently disagree.  A full
+    * command streamer stall seems to be needed in both cases.
+    */
+   iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
+                                PIPE_CONTROL_CS_STALL |
+                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+   /* CACHE_MODE_1 is a masked register: the *Mask bits select which fields
+    * this write actually updates, so only the two PMA-related bits change.
+    */
+   uint32_t reg_val;
+   iris_pack_state(GENX(CACHE_MODE_1), &reg_val, reg) {
+      reg.NPPMAFixEnable = enable;
+      reg.NPEarlyZFailsDisable = enable;
+      reg.NPPMAFixEnableMask = true;
+      reg.NPEarlyZFailsDisableMask = true;
+   }
+   iris_emit_lri(batch, CACHE_MODE_1, reg_val);
+
+   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
+    * Flush bits is often necessary.  We do it regardless because it's easier.
+    * The render cache flush is also necessary if stencil writes are enabled.
+    *
+    * Again, the Gen9 docs give a different set of flushes but the Broadwell
+    * flushes seem to work just as well.
+    */
+   iris_emit_pipe_control_flush(batch, "PMA fix change (2/2)",
+                                PIPE_CONTROL_DEPTH_STALL |
+                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                PIPE_CONTROL_RENDER_TARGET_FLUSH);
+#endif
 }
 
 /**
@@ -2816,6 +3002,9 @@ iris_set_framebuffer_state(struct pipe_context *ctx,
 
    ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_FRAMEBUFFER];
 
+   if (GEN_GEN == 8)
+      ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
+
 #if GEN_GEN == 11
    // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
    // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
@@ -5642,10 +5831,15 @@ iris_upload_dirty_render_state(struct iris_context *ice,
       }
    }
 
+#if GEN_GEN == 8
+   if (dirty & IRIS_DIRTY_PMA_FIX) {
+      bool enable = want_pma_fix(ice);
+      genX(update_pma_fix)(ice, batch, enable);
+   }
+#endif
+
    if (ice->state.current_hash_scale != 1)
       genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);
-
-   /* TODO: Gen8 PMA fix */
 }
 
 static void