intel/blorp: emit VF caching workaround before 3DSTATE_VERTEX_BUFFERS
[mesa.git] / src / intel / blorp / blorp_genX_exec.h
index 5f9a8ab4a510a74229ed7818fa3c92ef698542e4..9010b03fb671ad6217a0d5b7b3c65fd5cec30b87 100644 (file)
@@ -25,7 +25,7 @@
 #define BLORP_GENX_EXEC_H
 
 #include "blorp_priv.h"
-#include "common/gen_device_info.h"
+#include "dev/gen_device_info.h"
 #include "common/gen_sample_positions.h"
 #include "genxml/gen_macros.h"
 
@@ -59,6 +59,10 @@ blorp_alloc_dynamic_state(struct blorp_batch *batch,
 static void *
 blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                           struct blorp_address *addr);
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+                                           const struct blorp_address *addrs,
+                                           unsigned num_vbs);
 
 #if GEN_GEN >= 8
 static struct blorp_address
@@ -78,6 +82,11 @@ static void
 blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                     struct blorp_address address, uint32_t delta);
 
+#if GEN_GEN >= 7 && GEN_GEN < 10
+static struct blorp_address
+blorp_get_surface_base_address(struct blorp_batch *batch);
+#endif
+
 static void
 blorp_emit_urb_config(struct blorp_batch *batch,
                       unsigned vs_entry_size, unsigned sf_entry_size);
@@ -195,6 +204,14 @@ emit_urb_config(struct blorp_batch *batch,
    blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
 }
 
+#if GEN_GEN >= 7
+static void
+blorp_emit_memcpy(struct blorp_batch *batch,
+                  struct blorp_address dst,
+                  struct blorp_address src,
+                  uint32_t size);
+#endif
+
 static void
 blorp_emit_vertex_data(struct blorp_batch *batch,
                        const struct blorp_params *params,
@@ -255,63 +272,89 @@ blorp_emit_input_varying_data(struct blorp_batch *batch,
    }
 
    blorp_flush_range(batch, data, *size);
+
+   if (params->dst_clear_color_as_input) {
+#if GEN_GEN >= 7
+      /* In this case, the clear color isn't known statically and instead
+       * comes in through an indirect which we have to copy into the vertex
+       * buffer before we execute the 3DPRIMITIVE.  We already copied the
+       * value of params->wm_inputs.clear_color into the vertex buffer in the
+       * loop above.  Now we emit code to stomp it from the GPU with the
+       * actual clear color value.
+       */
+      assert(num_varyings == 1);
+
+      /* The clear color is the first thing after the header */
+      struct blorp_address clear_color_input_addr = *addr;
+      clear_color_input_addr.offset += 16;
+
+      const unsigned clear_color_size =
+         GEN_GEN < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
+      blorp_emit_memcpy(batch, clear_color_input_addr,
+                        params->dst.clear_color_addr,
+                        clear_color_size);
+#else
+      unreachable("MCS partial resolve is not a thing on SNB and earlier");
+#endif
+   }
 }
 
 static void
-blorp_emit_vertex_buffers(struct blorp_batch *batch,
-                          const struct blorp_params *params)
+blorp_fill_vertex_buffer_state(struct blorp_batch *batch,
+                               struct GENX(VERTEX_BUFFER_STATE) *vb,
+                               unsigned idx,
+                               struct blorp_address addr, uint32_t size,
+                               uint32_t stride)
 {
-   struct GENX(VERTEX_BUFFER_STATE) vb[2];
-   memset(vb, 0, sizeof(vb));
+   vb[idx].VertexBufferIndex = idx;
+   vb[idx].BufferStartingAddress = addr;
+   vb[idx].BufferPitch = stride;
 
-   uint32_t size;
-   blorp_emit_vertex_data(batch, params, &vb[0].BufferStartingAddress, &size);
-   vb[0].VertexBufferIndex = 0;
-   vb[0].BufferPitch = 3 * sizeof(float);
 #if GEN_GEN >= 6
-   vb[0].VertexBufferMOCS = batch->blorp->mocs.vb;
-#endif
-#if GEN_GEN >= 7
-   vb[0].AddressModifyEnable = true;
-#endif
-#if GEN_GEN >= 8
-   vb[0].BufferSize = size;
-#elif GEN_GEN >= 5
-   vb[0].BufferAccessType = VERTEXDATA;
-   vb[0].EndAddress = vb[0].BufferStartingAddress;
-   vb[0].EndAddress.offset += size - 1;
-#elif GEN_GEN == 4
-   vb[0].BufferAccessType = VERTEXDATA;
-   vb[0].MaxIndex = 2;
+   vb[idx].MOCS = addr.mocs;
 #endif
 
-   blorp_emit_input_varying_data(batch, params,
-                                 &vb[1].BufferStartingAddress, &size);
-   vb[1].VertexBufferIndex = 1;
-   vb[1].BufferPitch = 0;
-#if GEN_GEN >= 6
-   vb[1].VertexBufferMOCS = batch->blorp->mocs.vb;
-#endif
 #if GEN_GEN >= 7
-   vb[1].AddressModifyEnable = true;
+   vb[idx].AddressModifyEnable = true;
 #endif
+
 #if GEN_GEN >= 8
-   vb[1].BufferSize = size;
+   vb[idx].BufferSize = size;
 #elif GEN_GEN >= 5
-   vb[1].BufferAccessType = INSTANCEDATA;
-   vb[1].EndAddress = vb[1].BufferStartingAddress;
-   vb[1].EndAddress.offset += size - 1;
+   vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
+   vb[idx].EndAddress = vb[idx].BufferStartingAddress;
+   vb[idx].EndAddress.offset += size - 1;
 #elif GEN_GEN == 4
-   vb[1].BufferAccessType = INSTANCEDATA;
-   vb[1].MaxIndex = 0;
+   vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
+   vb[idx].MaxIndex = stride > 0 ? size / stride : 0;
 #endif
+}
 
-   const unsigned num_dwords = 1 + GENX(VERTEX_BUFFER_STATE_length) * 2;
+static void
+blorp_emit_vertex_buffers(struct blorp_batch *batch,
+                          const struct blorp_params *params)
+{
+   struct GENX(VERTEX_BUFFER_STATE) vb[3];
+   uint32_t num_vbs = 2;
+   memset(vb, 0, sizeof(vb));
+
+   struct blorp_address addrs[2] = {};
+   uint32_t size;
+   blorp_emit_vertex_data(batch, params, &addrs[0], &size);
+   blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], size,
+                                  3 * sizeof(float));
+
+   blorp_emit_input_varying_data(batch, params, &addrs[1], &size);
+   blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], size, 0);
+
+   blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, num_vbs);
+
+   const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
    if (!dw)
       return;
 
-   for (unsigned i = 0; i < 2; i++) {
+   for (unsigned i = 0; i < num_vbs; i++) {
       GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
       dw += GENX(VERTEX_BUFFER_STATE_length);
    }
@@ -380,7 +423,7 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
    ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
       .VertexBufferIndex = 1,
       .Valid = true,
-      .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
+      .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
       .SourceElementOffset = 0,
       .Component0Control = VFCOMP_STORE_SRC,
 
@@ -412,7 +455,7 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
    ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
       .VertexBufferIndex = 0,
       .Valid = true,
-      .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32_FLOAT,
+      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
       .SourceElementOffset = 0,
       .Component0Control = VFCOMP_STORE_SRC,
       .Component1Control = VFCOMP_STORE_SRC,
@@ -426,7 +469,7 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
    ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
       .VertexBufferIndex = 0,
       .Valid = true,
-      .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32_FLOAT,
+      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
       .SourceElementOffset = 0,
       .Component0Control = VFCOMP_STORE_SRC,
       .Component1Control = VFCOMP_STORE_SRC,
@@ -442,7 +485,7 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
       ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
          .VertexBufferIndex = 1,
          .Valid = true,
-         .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
+         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
          .SourceElementOffset = 16 + i * 4 * sizeof(float),
          .Component0Control = VFCOMP_STORE_SRC,
          .Component1Control = VFCOMP_STORE_SRC,
@@ -491,8 +534,7 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
 
 /* 3DSTATE_VIEWPORT_STATE_POINTERS */
 static uint32_t
-blorp_emit_cc_viewport(struct blorp_batch *batch,
-                       const struct blorp_params *params)
+blorp_emit_cc_viewport(struct blorp_batch *batch)
 {
    uint32_t cc_vp_offset;
    blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
@@ -515,8 +557,7 @@ blorp_emit_cc_viewport(struct blorp_batch *batch,
 }
 
 static uint32_t
-blorp_emit_sampler_state(struct blorp_batch *batch,
-                         const struct blorp_params *params)
+blorp_emit_sampler_state(struct blorp_batch *batch)
 {
    uint32_t offset;
    blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
@@ -567,6 +608,8 @@ blorp_emit_vs_config(struct blorp_batch *batch,
                      const struct blorp_params *params)
 {
    struct brw_vs_prog_data *vs_prog_data = params->vs_prog_data;
+   assert(!vs_prog_data || GEN_GEN < 11 ||
+          vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
 
    blorp_emit(batch, GENX(3DSTATE_VS), vs) {
       if (vs_prog_data) {
@@ -719,25 +762,53 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          ps.BindingTableEntryCount = 1;
       }
 
-      if (prog_data) {
-         ps.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
-         ps.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
+     /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to
+      * disable prefetching of binding tables on A0 and B0 steppings.
+      * TODO: Revisit this WA on C0 stepping.
+      */
+      if (GEN_GEN == 11)
+         ps.BindingTableEntryCount = 0;
 
+      if (prog_data) {
          ps._8PixelDispatchEnable = prog_data->dispatch_8;
          ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+         /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+          *
+          *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+          *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+          *
+          * Since 16x MSAA is first introduced on SKL, we don't need to apply
+          * the workaround on any older hardware.
+          */
+         if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+             params->num_samples == 16) {
+            assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+            ps._32PixelDispatchEnable = false;
+         }
 
-         ps.KernelStartPointer0 = params->wm_prog_kernel;
-         ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
       }
 
-      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
-       * it implicitly scales for different GT levels (which have some # of
-       * PSDs).
+      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
+       * for pre Gen11 and 128 for gen11+; On gen11+ If a programmed value is
+       * k, it implies 2(k+1) threads. It implicitly scales for different GT
+       * levels (which have some # of PSDs).
        *
-       * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
+       * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
        */
       if (GEN_GEN >= 9)
          ps.MaximumNumberofThreadsPerPSD = 64 - 1;
@@ -745,21 +816,21 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          ps.MaximumNumberofThreadsPerPSD = 64 - 2;
 
       switch (params->fast_clear_op) {
-      case BLORP_FAST_CLEAR_OP_NONE:
+      case ISL_AUX_OP_NONE:
          break;
 #if GEN_GEN >= 9
-      case BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL:
+      case ISL_AUX_OP_PARTIAL_RESOLVE:
          ps.RenderTargetResolveType = RESOLVE_PARTIAL;
          break;
-      case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+      case ISL_AUX_OP_FULL_RESOLVE:
          ps.RenderTargetResolveType = RESOLVE_FULL;
          break;
 #else
-      case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+      case ISL_AUX_OP_FULL_RESOLVE:
          ps.RenderTargetResolveEnable = true;
          break;
 #endif
-      case BLORP_FAST_CLEAR_OP_CLEAR:
+      case ISL_AUX_OP_FAST_CLEAR:
          ps.RenderTargetFastClearEnable = true;
          break;
       default:
@@ -782,16 +853,16 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 
    blorp_emit(batch, GENX(3DSTATE_WM), wm) {
       switch (params->hiz_op) {
-      case BLORP_HIZ_OP_DEPTH_CLEAR:
+      case ISL_AUX_OP_FAST_CLEAR:
          wm.DepthBufferClear = true;
          break;
-      case BLORP_HIZ_OP_DEPTH_RESOLVE:
+      case ISL_AUX_OP_FULL_RESOLVE:
          wm.DepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_HIZ_RESOLVE:
+      case ISL_AUX_OP_AMBIGUATE:
          wm.HierarchicalDepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_NONE:
+      case ISL_AUX_OP_NONE:
          break;
       default:
          unreachable("not reached");
@@ -823,17 +894,23 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 #endif
 
       if (prog_data) {
+         ps._8PixelDispatchEnable = prog_data->dispatch_8;
+         ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
          ps.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
          ps.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
-
-         ps.KernelStartPointer0 = params->wm_prog_kernel;
-         ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
 
-         ps._8PixelDispatchEnable = prog_data->dispatch_8;
-         ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
 
          ps.AttributeEnable = prog_data->num_varying_inputs > 0;
       } else {
@@ -847,12 +924,12 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          ps.SamplerCount = 1; /* Up to 4 samplers */
 
       switch (params->fast_clear_op) {
-      case BLORP_FAST_CLEAR_OP_NONE:
+      case ISL_AUX_OP_NONE:
          break;
-      case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+      case ISL_AUX_OP_FULL_RESOLVE:
          ps.RenderTargetResolveEnable = true;
          break;
-      case BLORP_FAST_CLEAR_OP_CLEAR:
+      case ISL_AUX_OP_FAST_CLEAR:
          ps.RenderTargetFastClearEnable = true;
          break;
       default:
@@ -867,16 +944,16 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          batch->blorp->isl_dev->info->max_wm_threads - 1;
 
       switch (params->hiz_op) {
-      case BLORP_HIZ_OP_DEPTH_CLEAR:
+      case ISL_AUX_OP_FAST_CLEAR:
          wm.DepthBufferClear = true;
          break;
-      case BLORP_HIZ_OP_DEPTH_RESOLVE:
+      case ISL_AUX_OP_FULL_RESOLVE:
          wm.DepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_HIZ_RESOLVE:
+      case ISL_AUX_OP_AMBIGUATE:
          wm.HierarchicalDepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_NONE:
+      case ISL_AUX_OP_NONE:
          break;
       default:
          unreachable("not reached");
@@ -885,17 +962,23 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       if (prog_data) {
          wm.ThreadDispatchEnable = true;
 
+         wm._8PixelDispatchEnable = prog_data->dispatch_8;
+         wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm._32PixelDispatchEnable = prog_data->dispatch_32;
+
          wm.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
          wm.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
-
-         wm.KernelStartPointer0 = params->wm_prog_kernel;
-         wm.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);
 
-         wm._8PixelDispatchEnable = prog_data->dispatch_8;
-         wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+         wm.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+         wm.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 2);
 
          wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
       }
@@ -972,7 +1055,7 @@ blorp_emit_blend_state(struct blorp_batch *batch,
 
 static uint32_t
 blorp_emit_color_calc_state(struct blorp_batch *batch,
-                            const struct blorp_params *params)
+                            MAYBE_UNUSED const struct blorp_params *params)
 {
    uint32_t offset;
    blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
@@ -1009,7 +1092,7 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch,
       ds.DepthBufferWriteEnable = true;
 
       switch (params->hiz_op) {
-      case BLORP_HIZ_OP_NONE:
+      case ISL_AUX_OP_NONE:
          ds.DepthTestEnable = true;
          ds.DepthTestFunction = COMPAREFUNCTION_ALWAYS;
          break;
@@ -1019,15 +1102,17 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch,
        *   - 7.5.3.2 Depth Buffer Resolve
        *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
        */
-      case BLORP_HIZ_OP_DEPTH_RESOLVE:
+      case ISL_AUX_OP_FULL_RESOLVE:
          ds.DepthTestEnable = true;
          ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
          break;
 
-      case BLORP_HIZ_OP_DEPTH_CLEAR:
-      case BLORP_HIZ_OP_HIZ_RESOLVE:
+      case ISL_AUX_OP_FAST_CLEAR:
+      case ISL_AUX_OP_AMBIGUATE:
          ds.DepthTestEnable = false;
          break;
+      case ISL_AUX_OP_PARTIAL_RESOLVE:
+         unreachable("Invalid HIZ op");
       }
    }
 
@@ -1161,7 +1246,7 @@ blorp_emit_pipeline(struct blorp_batch *batch,
    blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
 
    if (params->src.enabled)
-      blorp_emit_sampler_state(batch, params);
+      blorp_emit_sampler_state(batch);
 
    blorp_emit_3dstate_multisample(batch, params);
 
@@ -1195,16 +1280,53 @@ blorp_emit_pipeline(struct blorp_batch *batch,
    blorp_emit_sf_config(batch, params);
    blorp_emit_ps_config(batch, params);
 
-   blorp_emit_cc_viewport(batch, params);
+   blorp_emit_cc_viewport(batch);
 }
 
 /******** This is the end of the pipeline setup code ********/
 
 #endif /* GEN_GEN >= 6 */
 
+#if GEN_GEN >= 7
+static void
+blorp_emit_memcpy(struct blorp_batch *batch,
+                  struct blorp_address dst,
+                  struct blorp_address src,
+                  uint32_t size)
+{
+   assert(size % 4 == 0);
+
+   for (unsigned dw = 0; dw < size; dw += 4) {
+#if GEN_GEN >= 8
+      blorp_emit(batch, GENX(MI_COPY_MEM_MEM), cp) {
+         cp.DestinationMemoryAddress = dst;
+         cp.SourceMemoryAddress = src;
+      }
+#else
+      /* IVB does not have a general purpose register for command streamer
+       * commands. Therefore, we use an alternate temporary register.
+       */
+#define BLORP_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
+      blorp_emit(batch, GENX(MI_LOAD_REGISTER_MEM), load) {
+         load.RegisterAddress = BLORP_TEMP_REG;
+         load.MemoryAddress = src;
+      }
+      blorp_emit(batch, GENX(MI_STORE_REGISTER_MEM), store) {
+         store.RegisterAddress = BLORP_TEMP_REG;
+         store.MemoryAddress = dst;
+      }
+#undef BLORP_TEMP_REG
+#endif
+      dst.offset += 4;
+      src.offset += 4;
+   }
+}
+#endif
+
 static void
 blorp_emit_surface_state(struct blorp_batch *batch,
                          const struct brw_blorp_surface_info *surface,
+                         enum isl_aux_op op,
                          void *state, uint32_t state_offset,
                          const bool color_write_disables[4],
                          bool is_render_target)
@@ -1235,13 +1357,15 @@ blorp_emit_surface_state(struct blorp_batch *batch,
          write_disable_mask |= ISL_CHANNEL_ALPHA_BIT;
    }
 
-   const uint32_t mocs =
-      is_render_target ? batch->blorp->mocs.rb : batch->blorp->mocs.tex;
+   const bool use_clear_address =
+      GEN_GEN >= 10 && (surface->clear_color_addr.buffer != NULL);
 
    isl_surf_fill_state(batch->blorp->isl_dev, state,
                        .surf = &surf, .view = &surface->view,
                        .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
-                       .mocs = mocs, .clear_color = surface->clear_color,
+                       .mocs = surface->addr.mocs,
+                       .clear_color = surface->clear_color,
+                       .use_clear_address = use_clear_address,
                        .write_disables = write_disable_mask);
 
    blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
@@ -1258,6 +1382,25 @@ blorp_emit_surface_state(struct blorp_batch *batch,
                           surface->aux_addr, *aux_addr);
    }
 
+   if (surface->clear_color_addr.buffer) {
+#if GEN_GEN >= 10
+      assert((surface->clear_color_addr.offset & 0x3f) == 0);
+      uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
+      blorp_surface_reloc(batch, state_offset +
+                          isl_dev->ss.clear_color_state_offset,
+                          surface->clear_color_addr, *clear_addr);
+#elif GEN_GEN >= 7
+      if (op == ISL_AUX_OP_FULL_RESOLVE || op == ISL_AUX_OP_PARTIAL_RESOLVE) {
+         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
+         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
+         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
+                           isl_dev->ss.clear_value_size);
+      }
+#else
+      unreachable("Fast clears are only supported on gen7+");
+#endif
+   }
+
    blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
 }
 
@@ -1268,7 +1411,7 @@ blorp_emit_null_surface_state(struct blorp_batch *batch,
 {
    struct GENX(RENDER_SURFACE_STATE) ss = {
       .SurfaceType = SURFTYPE_NULL,
-      .SurfaceFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R8G8B8A8_UNORM,
+      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
       .Width = surface->surf.logical_level0_px.width - 1,
       .Height = surface->surf.logical_level0_px.height - 1,
       .MIPCountLOD = surface->view.base_level,
@@ -1300,9 +1443,10 @@ blorp_emit_surface_states(struct blorp_batch *batch,
                           const struct blorp_params *params)
 {
    const struct isl_device *isl_dev = batch->blorp->isl_dev;
-   uint32_t bind_offset, surface_offsets[2];
+   uint32_t bind_offset = 0, surface_offsets[2];
    void *surface_maps[2];
 
+   MAYBE_UNUSED bool has_indirect_clear_color = false;
    if (params->use_pre_baked_binding_table) {
       bind_offset = params->pre_baked_binding_table_offset;
    } else {
@@ -1313,9 +1457,12 @@ blorp_emit_surface_states(struct blorp_batch *batch,
 
       if (params->dst.enabled) {
          blorp_emit_surface_state(batch, &params->dst,
+                                  params->fast_clear_op,
                                   surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                   surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
                                   params->color_write_disable, true);
+         if (params->dst.clear_color_addr.buffer != NULL)
+            has_indirect_clear_color = true;
       } else {
          assert(params->depth.enabled || params->stencil.enabled);
          const struct brw_blorp_surface_info *surface =
@@ -1326,12 +1473,32 @@ blorp_emit_surface_states(struct blorp_batch *batch,
 
       if (params->src.enabled) {
          blorp_emit_surface_state(batch, &params->src,
+                                  params->fast_clear_op,
                                   surface_maps[BLORP_TEXTURE_BT_INDEX],
                                   surface_offsets[BLORP_TEXTURE_BT_INDEX],
                                   NULL, false);
+         if (params->src.clear_color_addr.buffer != NULL)
+            has_indirect_clear_color = true;
       }
    }
 
+#if GEN_GEN >= 7
+   if (has_indirect_clear_color) {
+      /* Updating a surface state object may require that the state cache be
+       * invalidated. From the SKL PRM, Shared Functions -> State -> State
+       * Caching:
+       *
+       *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
+       *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
+       *    modified [...], the L1 state cache must be invalidated to ensure
+       *    the new surface or sampler state is fetched from system memory.
+       */
+      blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
+         pipe.StateCacheInvalidationEnable = true;
+      }
+   }
+#endif
+
 #if GEN_GEN >= 7
    blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
    blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
@@ -1363,18 +1530,14 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch,
    if (dw == NULL)
       return;
 
-   struct isl_depth_stencil_hiz_emit_info info = {
-#if GEN_GEN >= 7
-      .mocs = 1, /* GEN7_MOCS_L3 */
-#else
-      .mocs = 0,
-#endif
-   };
+   struct isl_depth_stencil_hiz_emit_info info = { };
 
    if (params->depth.enabled) {
       info.view = &params->depth.view;
+      info.mocs = params->depth.addr.mocs;
    } else if (params->stencil.enabled) {
       info.view = &params->stencil.view;
+      info.mocs = params->stencil.addr.mocs;
    }
 
    if (params->depth.enabled) {
@@ -1452,7 +1615,41 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
     * requested.
     */
    if (params->stencil.enabled)
-      assert(params->hiz_op == BLORP_HIZ_OP_DEPTH_CLEAR);
+      assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);
+
+   /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
+    *
+    * 3DSTATE_MULTISAMPLE packet must be used prior to this packet to change
+    * the Number of Multisamples. This packet must not be used to change
+    * Number of Multisamples in a rendering sequence.
+    *
+    * Since HIZ may be the first thing in a batch buffer, play safe and always
+    * emit 3DSTATE_MULTISAMPLE.
+    */
+   blorp_emit_3dstate_multisample(batch, params);
+
+   /* From the BDW PRM Volume 7, Depth Buffer Clear:
+    *
+    *    The clear value must be between the min and max depth values
+    *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
+    *    D32_FLOAT, then +/-DENORM values are also allowed.
+    *
+    * Set the bounds to match our hardware limits, [0.0, 1.0].
+    */
+   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
+      assert(params->depth.clear_color.f32[0] >= 0.0f);
+      assert(params->depth.clear_color.f32[0] <= 1.0f);
+      blorp_emit_cc_viewport(batch);
+   }
+
+   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
+    * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
+    * even when WM_HZ_OP is active.  However, WM thread dispatch is normally
+    * disabled for HiZ ops and it appears that force-enabling it can lead to
+    * GPU hangs on at least Skylake.  Since we don't know the current state of
+    * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
+    */
+   blorp_emit(batch, GENX(3DSTATE_WM), wm);
 
    /* If we can't alter the depth stencil config and multiple layers are
     * involved, the HiZ op will fail. This is because the op requires that a
@@ -1466,21 +1663,22 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
 
    blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
       switch (params->hiz_op) {
-      case BLORP_HIZ_OP_DEPTH_CLEAR:
+      case ISL_AUX_OP_FAST_CLEAR:
          hzp.StencilBufferClearEnable = params->stencil.enabled;
          hzp.DepthBufferClearEnable = params->depth.enabled;
          hzp.StencilClearValue = params->stencil_ref;
          hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
          break;
-      case BLORP_HIZ_OP_DEPTH_RESOLVE:
+      case ISL_AUX_OP_FULL_RESOLVE:
          assert(params->full_surface_hiz_op);
          hzp.DepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_HIZ_RESOLVE:
+      case ISL_AUX_OP_AMBIGUATE:
          assert(params->full_surface_hiz_op);
          hzp.HierarchicalDepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_NONE:
+      case ISL_AUX_OP_PARTIAL_RESOLVE:
+      case ISL_AUX_OP_NONE:
          unreachable("Invalid HIZ op");
       }
 
@@ -1511,6 +1709,51 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
 }
 #endif
 
+static void
+blorp_update_clear_color(struct blorp_batch *batch,
+                         const struct brw_blorp_surface_info *info,
+                         enum isl_aux_op op)
+{
+   if (info->clear_color_addr.buffer && op == ISL_AUX_OP_FAST_CLEAR) {
+#if GEN_GEN >= 9
+      for (int i = 0; i < 4; i++) {
+         blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+            sdi.Address = info->clear_color_addr;
+            sdi.Address.offset += i * 4;
+            sdi.ImmediateData = info->clear_color.u32[i];
+         }
+      }
+#elif GEN_GEN >= 7
+      blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+         sdi.Address = info->clear_color_addr;
+         sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
+                             ISL_CHANNEL_SELECT_GREEN << 22 |
+                             ISL_CHANNEL_SELECT_BLUE  << 19 |
+                             ISL_CHANNEL_SELECT_ALPHA << 16;
+         if (isl_format_has_int_channel(info->view.format)) {
+            for (unsigned i = 0; i < 4; i++) {
+               assert(info->clear_color.u32[i] == 0 ||
+                      info->clear_color.u32[i] == 1);
+            }
+            sdi.ImmediateData |= (info->clear_color.u32[0] != 0) << 31;
+            sdi.ImmediateData |= (info->clear_color.u32[1] != 0) << 30;
+            sdi.ImmediateData |= (info->clear_color.u32[2] != 0) << 29;
+            sdi.ImmediateData |= (info->clear_color.u32[3] != 0) << 28;
+         } else {
+            for (unsigned i = 0; i < 4; i++) {
+               assert(info->clear_color.f32[i] == 0.0f ||
+                      info->clear_color.f32[i] == 1.0f);
+            }
+            sdi.ImmediateData |= (info->clear_color.f32[0] != 0.0f) << 31;
+            sdi.ImmediateData |= (info->clear_color.f32[1] != 0.0f) << 30;
+            sdi.ImmediateData |= (info->clear_color.f32[2] != 0.0f) << 29;
+            sdi.ImmediateData |= (info->clear_color.f32[3] != 0.0f) << 28;
+         }
+      }
+#endif
+   }
+}
+
 /**
  * \brief Execute a blit or render pass operation.
  *
@@ -1523,8 +1766,13 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
 static void
 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 {
+   if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
+      blorp_update_clear_color(batch, &params->dst, params->fast_clear_op);
+      blorp_update_clear_color(batch, &params->depth, params->hiz_op);
+   }
+
 #if GEN_GEN >= 8
-   if (params->hiz_op != BLORP_HIZ_OP_NONE) {
+   if (params->hiz_op != ISL_AUX_OP_NONE) {
       blorp_emit_gen8_hiz_op(batch, params);
       return;
    }