intel/blorp: Always emit URB config on Gen7+
[mesa.git] / src / intel / blorp / blorp_genX_exec.h
index f3a96fbd58cc302d1c51265576cca440635ae925..d271bb4248c705f9548aff01d84c6e95299207a8 100644 (file)
@@ -27,6 +27,7 @@
 #include "blorp_priv.h"
 #include "dev/gen_device_info.h"
 #include "common/gen_sample_positions.h"
+#include "common/gen_l3_config.h"
 #include "genxml/gen_macros.h"
 
 /**
@@ -59,11 +60,14 @@ blorp_alloc_dynamic_state(struct blorp_batch *batch,
 static void *
 blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                           struct blorp_address *addr);
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+                                           const struct blorp_address *addrs,
+                                           uint32_t *sizes,
+                                           unsigned num_vbs);
 
-#if GEN_GEN >= 8
-static struct blorp_address
+UNUSED static struct blorp_address
 blorp_get_workaround_page(struct blorp_batch *batch);
-#endif
 
 static void
 blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
@@ -78,14 +82,23 @@ static void
 blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                     struct blorp_address address, uint32_t delta);
 
-#if GEN_GEN >= 7 && GEN_GEN <= 10
+static uint64_t
+blorp_get_surface_address(struct blorp_batch *batch,
+                          struct blorp_address address);
+
+#if GEN_GEN >= 7 && GEN_GEN < 10
 static struct blorp_address
 blorp_get_surface_base_address(struct blorp_batch *batch);
 #endif
 
+#if GEN_GEN >= 7
+static const struct gen_l3_config *
+blorp_get_l3_config(struct blorp_batch *batch);
+# else
 static void
 blorp_emit_urb_config(struct blorp_batch *batch,
                       unsigned vs_entry_size, unsigned sf_entry_size);
+#endif
 
 static void
 blorp_emit_pipeline(struct blorp_batch *batch,
@@ -122,12 +135,13 @@ _blorp_combine_address(struct blorp_batch *batch, void *location,
         _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
         _dst = NULL)
 
-#define blorp_emitn(batch, cmd, n) ({                       \
+#define blorp_emitn(batch, cmd, n, ...) ({                  \
       uint32_t *_dw = blorp_emit_dwords(batch, n);          \
       if (_dw) {                                            \
          struct cmd template = {                            \
             _blorp_cmd_header(cmd),                         \
             .DWordLength = n - _blorp_cmd_length_bias(cmd), \
+            __VA_ARGS__                                     \
          };                                                 \
          _blorp_cmd_pack(cmd)(batch, _dw, &template);       \
       }                                                     \
@@ -197,9 +211,52 @@ emit_urb_config(struct blorp_batch *batch,
    const unsigned sf_entry_size =
       params->sf_prog_data ? params->sf_prog_data->urb_entry_size : 0;
 
+#if GEN_GEN >= 7
+   assert(sf_entry_size == 0);
+   const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 };
+
+   unsigned entries[4], start[4];
+   gen_get_urb_config(batch->blorp->compiler->devinfo,
+                      blorp_get_l3_config(batch),
+                      false, false, entry_size, entries, start);
+
+#if GEN_GEN == 7 && !GEN_IS_HASWELL
+   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
+    *
+    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
+    *    needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
+    *    3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
+    *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL
+    *    needs to be sent before any combination of VS associated 3DSTATE."
+    */
+   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
+      pc.DepthStallEnable  = true;
+      pc.PostSyncOperation = WriteImmediateData;
+      pc.Address           = blorp_get_workaround_page(batch);
+   }
+#endif
+
+   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+      blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) {
+         urb._3DCommandSubOpcode      += i;
+         urb.VSURBStartingAddress      = start[i];
+         urb.VSURBEntryAllocationSize  = entry_size[i] - 1;
+         urb.VSNumberofURBEntries      = entries[i];
+      }
+   }
+#else /* GEN_GEN < 7 */
    blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
+#endif
 }
 
+#if GEN_GEN >= 7
+static void
+blorp_emit_memcpy(struct blorp_batch *batch,
+                  struct blorp_address dst,
+                  struct blorp_address src,
+                  uint32_t size);
+#endif
+
 static void
 blorp_emit_vertex_data(struct blorp_batch *batch,
                        const struct blorp_params *params,
@@ -260,6 +317,31 @@ blorp_emit_input_varying_data(struct blorp_batch *batch,
    }
 
    blorp_flush_range(batch, data, *size);
+
+   if (params->dst_clear_color_as_input) {
+#if GEN_GEN >= 7
+      /* In this case, the clear color isn't known statically and instead
+       * comes in through an indirect which we have to copy into the vertex
+       * buffer before we execute the 3DPRIMITIVE.  We already copied the
+       * value of params->wm_inputs.clear_color into the vertex buffer in the
+       * loop above.  Now we emit code to stomp it from the GPU with the
+       * actual clear color value.
+       */
+      assert(num_varyings == 1);
+
+      /* The clear color is the first thing after the header */
+      struct blorp_address clear_color_input_addr = *addr;
+      clear_color_input_addr.offset += 16;
+
+      const unsigned clear_color_size =
+         GEN_GEN < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
+      blorp_emit_memcpy(batch, clear_color_input_addr,
+                        params->dst.clear_color_addr,
+                        clear_color_size);
+#else
+      unreachable("MCS partial resolve is not a thing on SNB and earlier");
+#endif
+   }
 }
 
 static void
@@ -274,7 +356,7 @@ blorp_fill_vertex_buffer_state(struct blorp_batch *batch,
    vb[idx].BufferPitch = stride;
 
 #if GEN_GEN >= 6
-   vb[idx].VertexBufferMOCS = addr.mocs;
+   vb[idx].MOCS = addr.mocs;
 #endif
 
 #if GEN_GEN >= 7
@@ -298,24 +380,19 @@ blorp_emit_vertex_buffers(struct blorp_batch *batch,
                           const struct blorp_params *params)
 {
    struct GENX(VERTEX_BUFFER_STATE) vb[3];
+   uint32_t num_vbs = 2;
    memset(vb, 0, sizeof(vb));
 
-   struct blorp_address addr;
-   uint32_t size;
-   blorp_emit_vertex_data(batch, params, &addr, &size);
-   blorp_fill_vertex_buffer_state(batch, vb, 0, addr, size, 3 * sizeof(float));
+   struct blorp_address addrs[2] = {};
+   uint32_t sizes[2];
+   blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]);
+   blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], sizes[0],
+                                  3 * sizeof(float));
 
-   blorp_emit_input_varying_data(batch, params, &addr, &size);
-   blorp_fill_vertex_buffer_state(batch, vb, 1, addr, size, 0);
+   blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]);
+   blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], sizes[1], 0);
 
-   uint32_t num_vbs = 2;
-   if (params->dst_clear_color_as_input) {
-      const unsigned clear_color_size =
-         GEN_GEN < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
-      blorp_fill_vertex_buffer_state(batch, vb, num_vbs++,
-                                     params->dst.clear_color_addr,
-                                     clear_color_size, 0);
-   }
+   blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs);
 
    const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
@@ -449,49 +526,21 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
    };
    slot++;
 
-   if (params->dst_clear_color_as_input) {
-      /* If the caller wants the destination indirect clear color, redirect
-       * to vertex buffer 2 where we stored it earlier.  The only users of
-       * an indirect clear color source have that as their only vertex
-       * attribute.
-       */
-      assert(num_varyings == 1);
+   for (unsigned i = 0; i < num_varyings; ++i) {
       ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
-         .VertexBufferIndex = 2,
+         .VertexBufferIndex = 1,
          .Valid = true,
-         .SourceElementOffset = 0,
-         .Component0Control = VFCOMP_STORE_SRC,
-#if GEN_GEN >= 9
          .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+         .SourceElementOffset = 16 + i * 4 * sizeof(float),
+         .Component0Control = VFCOMP_STORE_SRC,
          .Component1Control = VFCOMP_STORE_SRC,
          .Component2Control = VFCOMP_STORE_SRC,
          .Component3Control = VFCOMP_STORE_SRC,
-#else
-         /* Clear colors on gen7-8 are for bits out of one dword */
-         .SourceElementFormat = ISL_FORMAT_R32_FLOAT,
-         .Component1Control = VFCOMP_STORE_0,
-         .Component2Control = VFCOMP_STORE_0,
-         .Component3Control = VFCOMP_STORE_0,
+#if GEN_GEN <= 5
+         .DestinationElementOffset = slot * 4,
 #endif
       };
       slot++;
-   } else {
-      for (unsigned i = 0; i < num_varyings; ++i) {
-         ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
-            .VertexBufferIndex = 1,
-            .Valid = true,
-            .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
-            .SourceElementOffset = 16 + i * 4 * sizeof(float),
-            .Component0Control = VFCOMP_STORE_SRC,
-            .Component1Control = VFCOMP_STORE_SRC,
-            .Component2Control = VFCOMP_STORE_SRC,
-            .Component3Control = VFCOMP_STORE_SRC,
-#if GEN_GEN <= 5
-            .DestinationElementOffset = slot * 4,
-#endif
-         };
-         slot++;
-      }
    }
 
    const unsigned num_dwords =
@@ -505,6 +554,10 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
       dw += GENX(VERTEX_ELEMENT_STATE_length);
    }
 
+   blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
+      vf.StatisticsEnable = false;
+   }
+
 #if GEN_GEN >= 8
    /* Overwrite Render Target Array Index (2nd dword) in the VUE header with
     * primitive instance identifier. This is used for layered clears.
@@ -657,7 +710,11 @@ blorp_emit_sf_config(struct blorp_batch *batch,
 
 #if GEN_GEN >= 8
 
-   blorp_emit(batch, GENX(3DSTATE_SF), sf);
+   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
+#if GEN_GEN >= 12
+      sf.DerefBlockSize = PerPolyDerefMode;
+#endif
+   }
 
    blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
       raster.CullMode = CULLMODE_NONE;
@@ -758,18 +815,42 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          ps.BindingTableEntryCount = 1;
       }
 
-      if (prog_data) {
-         ps.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
-         ps.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
+      /* SAMPLER_STATE prefetching is broken on Gen11 - WA_1606682166 */
+      if (GEN_GEN == 11)
+         ps.SamplerCount = 0;
 
+      if (prog_data) {
          ps._8PixelDispatchEnable = prog_data->dispatch_8;
          ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+         /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+          *
+          *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+          *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+          *
+          * Since 16x MSAA is first introduced on SKL, we don't need to apply
+          * the workaround on any older hardware.
+          */
+         if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+             params->num_samples == 16) {
+            assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+            ps._32PixelDispatchEnable = false;
+         }
 
-         ps.KernelStartPointer0 = params->wm_prog_kernel;
-         ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
       }
 
       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
@@ -787,6 +868,12 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       switch (params->fast_clear_op) {
       case ISL_AUX_OP_NONE:
          break;
+#if GEN_GEN >= 10
+      case ISL_AUX_OP_AMBIGUATE:
+         ps.RenderTargetFastClearEnable = true;
+         ps.RenderTargetResolveType = FAST_CLEAR_0;
+         break;
+#endif
 #if GEN_GEN >= 9
       case ISL_AUX_OP_PARTIAL_RESOLVE:
          ps.RenderTargetResolveType = RESOLVE_PARTIAL;
@@ -863,17 +950,23 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 #endif
 
       if (prog_data) {
+         ps._8PixelDispatchEnable = prog_data->dispatch_8;
+         ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
          ps.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
          ps.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
 
-         ps.KernelStartPointer0 = params->wm_prog_kernel;
-         ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
-
-         ps._8PixelDispatchEnable = prog_data->dispatch_8;
-         ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
 
          ps.AttributeEnable = prog_data->num_varying_inputs > 0;
       } else {
@@ -925,17 +1018,23 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       if (prog_data) {
          wm.ThreadDispatchEnable = true;
 
+         wm._8PixelDispatchEnable = prog_data->dispatch_8;
+         wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm._32PixelDispatchEnable = prog_data->dispatch_32;
+
          wm.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
          wm.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);
 
-         wm.KernelStartPointer0 = params->wm_prog_kernel;
-         wm.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
-
-         wm._8PixelDispatchEnable = prog_data->dispatch_8;
-         wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+         wm.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+         wm.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 2);
 
          wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
       }
@@ -1012,7 +1111,7 @@ blorp_emit_blend_state(struct blorp_batch *batch,
 
 static uint32_t
 blorp_emit_color_calc_state(struct blorp_batch *batch,
-                            MAYBE_UNUSED const struct blorp_params *params)
+                            UNUSED const struct blorp_params *params)
 {
    uint32_t offset;
    blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
@@ -1049,11 +1148,6 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch,
       ds.DepthBufferWriteEnable = true;
 
       switch (params->hiz_op) {
-      case ISL_AUX_OP_NONE:
-         ds.DepthTestEnable = true;
-         ds.DepthTestFunction = COMPAREFUNCTION_ALWAYS;
-         break;
-
       /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
        *   - 7.5.3.1 Depth Buffer Clear
        *   - 7.5.3.2 Depth Buffer Resolve
@@ -1064,6 +1158,7 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch,
          ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
          break;
 
+      case ISL_AUX_OP_NONE:
       case ISL_AUX_OP_FAST_CLEAR:
       case ISL_AUX_OP_AMBIGUATE:
          ds.DepthTestEnable = false;
@@ -1194,6 +1289,12 @@ blorp_emit_pipeline(struct blorp_batch *batch,
    (void)depth_stencil_state_offset;
 #endif
 
+#if GEN_GEN >= 12
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
+      /* Update empty push constants for all stages (bitmask = 11111b) */
+      pc.ShaderUpdateEnable = 0x1f;
+   }
+#else
    blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
 #if GEN_GEN >= 7
    blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
@@ -1201,6 +1302,7 @@ blorp_emit_pipeline(struct blorp_batch *batch,
 #endif
    blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
    blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
+#endif
 
    if (params->src.enabled)
       blorp_emit_sampler_state(batch);
@@ -1244,7 +1346,7 @@ blorp_emit_pipeline(struct blorp_batch *batch,
 
 #endif /* GEN_GEN >= 6 */
 
-#if GEN_GEN >= 7 && GEN_GEN <= 10
+#if GEN_GEN >= 7
 static void
 blorp_emit_memcpy(struct blorp_batch *batch,
                   struct blorp_address dst,
@@ -1283,6 +1385,7 @@ blorp_emit_memcpy(struct blorp_batch *batch,
 static void
 blorp_emit_surface_state(struct blorp_batch *batch,
                          const struct brw_blorp_surface_info *surface,
+                         enum isl_aux_op aux_op,
                          void *state, uint32_t state_offset,
                          const bool color_write_disables[4],
                          bool is_render_target)
@@ -1296,10 +1399,13 @@ blorp_emit_surface_state(struct blorp_batch *batch,
       surf.dim = ISL_SURF_DIM_2D;
    }
 
-   /* Blorp doesn't support HiZ in any of the blit or slow-clear paths */
+   if (isl_aux_usage_has_hiz(surface->aux_usage)) {
+      /* BLORP doesn't render with depth so we can't use HiZ */
+      assert(!is_render_target);
+      /* We can't reinterpret HiZ */
+      assert(surface->surf.format == surface->view.format);
+   }
    enum isl_aux_usage aux_usage = surface->aux_usage;
-   if (aux_usage == ISL_AUX_USAGE_HIZ)
-      aux_usage = ISL_AUX_USAGE_NONE;
 
    isl_channel_mask_t write_disable_mask = 0;
    if (is_render_target && GEN_GEN <= 5) {
@@ -1313,11 +1419,22 @@ blorp_emit_surface_state(struct blorp_batch *batch,
          write_disable_mask |= ISL_CHANNEL_ALPHA_BIT;
    }
 
+   const bool use_clear_address =
+      GEN_GEN >= 10 && (surface->clear_color_addr.buffer != NULL);
+
    isl_surf_fill_state(batch->blorp->isl_dev, state,
                        .surf = &surf, .view = &surface->view,
                        .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
+                       .address =
+                          blorp_get_surface_address(batch, surface->addr),
+                       .aux_address = aux_usage == ISL_AUX_USAGE_NONE ? 0 :
+                          blorp_get_surface_address(batch, surface->aux_addr),
+                       .clear_address = !use_clear_address ? 0 :
+                          blorp_get_surface_address(batch,
+                                                    surface->clear_color_addr),
                        .mocs = surface->addr.mocs,
                        .clear_color = surface->clear_color,
+                       .use_clear_address = use_clear_address,
                        .write_disables = write_disable_mask);
 
    blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
@@ -1334,20 +1451,29 @@ blorp_emit_surface_state(struct blorp_batch *batch,
                           surface->aux_addr, *aux_addr);
    }
 
-   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
-
-   if (surface->clear_color_addr.buffer) {
-#if GEN_GEN > 10
-      unreachable("Implement indirect clear support on gen11+");
-#elif GEN_GEN >= 7 && GEN_GEN <= 10
-      struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
-      dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
-      blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
-                        isl_dev->ss.clear_value_size);
+   if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) {
+#if GEN_GEN >= 10
+      assert((surface->clear_color_addr.offset & 0x3f) == 0);
+      uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
+      blorp_surface_reloc(batch, state_offset +
+                          isl_dev->ss.clear_color_state_offset,
+                          surface->clear_color_addr, *clear_addr);
+#elif GEN_GEN >= 7
+      /* Fast clears just whack the AUX surface and don't actually use the
+       * clear color for anything.  We can avoid the MI memcpy in that case.
+       */
+      if (aux_op != ISL_AUX_OP_FAST_CLEAR) {
+         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
+         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
+         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
+                           isl_dev->ss.clear_value_size);
+      }
 #else
       unreachable("Fast clears are only supported on gen7+");
 #endif
    }
+
+   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
 }
 
 static void
@@ -1392,7 +1518,7 @@ blorp_emit_surface_states(struct blorp_batch *batch,
    uint32_t bind_offset = 0, surface_offsets[2];
    void *surface_maps[2];
 
-   MAYBE_UNUSED bool has_indirect_clear_color = false;
+   UNUSED bool has_indirect_clear_color = false;
    if (params->use_pre_baked_binding_table) {
       bind_offset = params->pre_baked_binding_table_offset;
    } else {
@@ -1403,6 +1529,7 @@ blorp_emit_surface_states(struct blorp_batch *batch,
 
       if (params->dst.enabled) {
          blorp_emit_surface_state(batch, &params->dst,
+                                  params->fast_clear_op,
                                   surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                   surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
                                   params->color_write_disable, true);
@@ -1418,6 +1545,7 @@ blorp_emit_surface_states(struct blorp_batch *batch,
 
       if (params->src.enabled) {
          blorp_emit_surface_state(batch, &params->src,
+                                  params->fast_clear_op,
                                   surface_maps[BLORP_TEXTURE_BT_INDEX],
                                   surface_offsets[BLORP_TEXTURE_BT_INDEX],
                                   NULL, false);
@@ -1439,6 +1567,9 @@ blorp_emit_surface_states(struct blorp_batch *batch,
        */
       blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
          pipe.StateCacheInvalidationEnable = true;
+#if GEN_GEN >= 12
+         pipe.TileCacheFlushEnable = true;
+#endif
       }
    }
 #endif
@@ -1492,7 +1623,7 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch,
                           params->depth.addr, 0);
 
       info.hiz_usage = params->depth.aux_usage;
-      if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
+      if (isl_aux_usage_has_hiz(info.hiz_usage)) {
          info.hiz_surf = &params->depth.aux_surf;
 
          struct blorp_address hiz_address = params->depth.aux_addr;
@@ -1520,6 +1651,7 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch,
    if (params->stencil.enabled) {
       info.stencil_surf = &params->stencil.surf;
 
+      info.stencil_aux_usage = params->stencil.aux_usage;
       struct blorp_address stencil_address = params->stencil.addr;
 #if GEN_GEN == 6
       /* Sandy bridge hardware does not technically support mipmapped stencil.
@@ -1540,6 +1672,20 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch,
    }
 
    isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);
+
+#if GEN_GEN >= 12
+   /* GEN:BUG:1408224581
+    *
+    * Workaround (Gen12LP A-step only): an additional PIPE_CONTROL with a
+    * post-sync "store dword" operation is required.  The w/a is to emit
+    * that extra PIPE_CONTROL after the stencil state whenever the surface
+    * state bits of this state are changing.
+    */
+   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
+      pc.PostSyncOperation = WriteImmediateData;
+      pc.Address = blorp_get_workaround_page(batch);
+   }
+#endif
 }
 
 #if GEN_GEN >= 8
@@ -1555,11 +1701,18 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
     */
    assert(params->depth.enabled || params->stencil.enabled);
 
-   /* The stencil buffer should only be enabled if a fast clear operation is
-    * requested.
+   /* On GEN >= 12, the stencil buffer should only be enabled if a fast clear
+    * or full resolve operation is requested.  On all other GENs, it should
+    * only be enabled if a fast clear operation is requested.
     */
-   if (params->stencil.enabled)
+   if (params->stencil.enabled) {
+#if GEN_GEN >= 12
+      assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR ||
+             params->hiz_op == ISL_AUX_OP_FULL_RESOLVE);
+#else
       assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);
+#endif
+   }
 
    /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
     *
@@ -1572,6 +1725,29 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
     */
    blorp_emit_3dstate_multisample(batch, params);
 
+   /* From the BDW PRM Volume 7, Depth Buffer Clear:
+    *
+    *    The clear value must be between the min and max depth values
+    *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
+    *    D32_FLOAT, then +/-DENORM values are also allowed.
+    *
+    * Set the bounds to match our hardware limits, [0.0, 1.0].
+    */
+   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
+      assert(params->depth.clear_color.f32[0] >= 0.0f);
+      assert(params->depth.clear_color.f32[0] <= 1.0f);
+      blorp_emit_cc_viewport(batch);
+   }
+
+   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
+    * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
+    * even when WM_HZ_OP is active.  However, WM thread dispatch is normally
+    * disabled for HiZ ops and it appears that force-enabling it can lead to
+    * GPU hangs on at least Skylake.  Since we don't know the current state of
+    * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
+    */
+   blorp_emit(batch, GENX(3DSTATE_WM), wm);
+
    /* If we can't alter the depth stencil config and multiple layers are
     * involved, the HiZ op will fail. This is because the op requires that a
     * new config is emitted for each additional layer.
@@ -1592,7 +1768,13 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
          break;
       case ISL_AUX_OP_FULL_RESOLVE:
          assert(params->full_surface_hiz_op);
-         hzp.DepthBufferResolveEnable = true;
+         hzp.DepthBufferResolveEnable = params->depth.enabled;
+#if GEN_GEN >= 12
+         if (params->stencil.enabled) {
+            assert(params->stencil.aux_usage == ISL_AUX_USAGE_CCS_E);
+            hzp.StencilBufferResolveEnable = true;
+         }
+#endif
          break;
       case ISL_AUX_OP_AMBIGUATE:
          assert(params->full_surface_hiz_op);
@@ -1630,6 +1812,128 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
 }
 #endif
 
+static void
+blorp_update_clear_color(struct blorp_batch *batch,
+                         const struct brw_blorp_surface_info *info,
+                         enum isl_aux_op op)
+{
+   if (info->clear_color_addr.buffer && op == ISL_AUX_OP_FAST_CLEAR) {
+#if GEN_GEN == 11
+      blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
+         pipe.CommandStreamerStallEnable = true;
+      }
+
+      /* Each MI_ATOMIC carries 2 QWORDs (4 dwords) of inline data */
+      const unsigned inlinedata_dw = 2 * 2;
+      const unsigned num_dwords = GENX(MI_ATOMIC_length) + inlinedata_dw;
+
+      struct blorp_address clear_addr = info->clear_color_addr;
+      uint32_t *dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords,
+                                 .DataSize = MI_ATOMIC_QWORD,
+                                 .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B,
+                                 .InlineData = true,
+                                 .MemoryAddress = clear_addr);
+      /* dw starts at dword 1; fill dwords 3 and 5 and zero dwords 4 and 6 */
+      dw[2] = info->clear_color.u32[0];
+      dw[3] = 0;
+      dw[4] = info->clear_color.u32[1];
+      dw[5] = 0;
+
+      clear_addr.offset += 8;
+      dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords,
+                                 .DataSize = MI_ATOMIC_QWORD,
+                                 .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B,
+                                 .CSSTALL = true,
+                                 .ReturnDataControl = true,
+                                 .InlineData = true,
+                                 .MemoryAddress = clear_addr);
+      /* Same layout as above, now for clear color channels 2 and 3 */
+      dw[2] = info->clear_color.u32[2];
+      dw[3] = 0;
+      dw[4] = info->clear_color.u32[3];
+      dw[5] = 0;
+
+      blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
+         pipe.StateCacheInvalidationEnable = true;
+         pipe.TextureCacheInvalidationEnable = true;
+      }
+#elif GEN_GEN >= 9
+
+      /* According to GEN:BUG:2201730850, in the Clear Color Programming Note
+       * under the Red channel, "Software shall write the converted Depth
+       * Clear to this dword." The only depth formats listed under the red
+       * channel are IEEE_FP and UNORM24_X8. These two requirements are
+       * incompatible with the UNORM16 depth format, so just ignore that case
+       * and simply perform the conversion for all depth formats.
+       */
+      union isl_color_value fixed_color = info->clear_color;
+      if (GEN_GEN == 12 && isl_surf_usage_is_depth(info->surf.usage)) {
+         isl_color_value_pack(&info->clear_color, info->surf.format,
+                              fixed_color.u32);
+      }
+
+      for (int i = 0; i < 4; i++) {
+         blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+            sdi.Address = info->clear_color_addr;
+            sdi.Address.offset += i * 4;
+            sdi.ImmediateData = fixed_color.u32[i];
+#if GEN_GEN >= 12
+            if (i == 3)
+               sdi.ForceWriteCompletionCheck = true;
+#endif /* GEN_GEN >= 12 */
+         }
+      }
+
+      /* The RENDER_SURFACE_STATE::ClearColor field documentation states that
+       * software should write the converted depth value 16B after the clear
+       * address:
+       *
+       *    3D Sampler will always fetch clear depth from the location
+       *    16-bytes above this address, where the clear depth, converted to
+       *    native surface format by software, will be stored.
+       */
+#if GEN_GEN >= 12
+      if (isl_surf_usage_is_depth(info->surf.usage)) {
+         blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+            sdi.Address = info->clear_color_addr;
+            sdi.Address.offset += 4 * 4;
+            sdi.ImmediateData = fixed_color.u32[0];
+            sdi.ForceWriteCompletionCheck = true;
+         }
+      }
+#endif /* GEN_GEN >= 12 */
+
+#elif GEN_GEN >= 7
+      blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+         sdi.Address = info->clear_color_addr;
+         sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
+                             ISL_CHANNEL_SELECT_GREEN << 22 |
+                             ISL_CHANNEL_SELECT_BLUE  << 19 |
+                             ISL_CHANNEL_SELECT_ALPHA << 16;
+         if (isl_format_has_int_channel(info->view.format)) {
+            for (unsigned i = 0; i < 4; i++) {
+               assert(info->clear_color.u32[i] == 0 ||
+                      info->clear_color.u32[i] == 1);
+            }
+            sdi.ImmediateData |= (info->clear_color.u32[0] != 0) << 31;
+            sdi.ImmediateData |= (info->clear_color.u32[1] != 0) << 30;
+            sdi.ImmediateData |= (info->clear_color.u32[2] != 0) << 29;
+            sdi.ImmediateData |= (info->clear_color.u32[3] != 0) << 28;
+         } else {
+            for (unsigned i = 0; i < 4; i++) {
+               assert(info->clear_color.f32[i] == 0.0f ||
+                      info->clear_color.f32[i] == 1.0f);
+            }
+            sdi.ImmediateData |= (info->clear_color.f32[0] != 0.0f) << 31;
+            sdi.ImmediateData |= (info->clear_color.f32[1] != 0.0f) << 30;
+            sdi.ImmediateData |= (info->clear_color.f32[2] != 0.0f) << 29;
+            sdi.ImmediateData |= (info->clear_color.f32[3] != 0.0f) << 28;
+         }
+      }
+#endif /* GEN_GEN == 11 / >= 9 / >= 7 */
+   }
+}
+
 /**
  * \brief Execute a blit or render pass operation.
  *
@@ -1642,6 +1946,11 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
 static void
 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 {
+   if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
+      blorp_update_clear_color(batch, &params->dst, params->fast_clear_op);
+      blorp_update_clear_color(batch, &params->depth, params->hiz_op);
+   }
+
 #if GEN_GEN >= 8
    if (params->hiz_op != ISL_AUX_OP_NONE) {
       blorp_emit_gen8_hiz_op(batch, params);