intel/blorp: emit VF caching workaround before 3DSTATE_VERTEX_BUFFERS
[mesa.git] / src / intel / blorp / blorp_genX_exec.h
index 62f16a3ad7a79705ca374a35126f3cd75ce71bc9..9010b03fb671ad6217a0d5b7b3c65fd5cec30b87 100644 (file)
  * IN THE SOFTWARE.
  */
 
+#ifndef BLORP_GENX_EXEC_H
+#define BLORP_GENX_EXEC_H
+
 #include "blorp_priv.h"
-#include "common/gen_device_info.h"
+#include "dev/gen_device_info.h"
 #include "common/gen_sample_positions.h"
-#include "intel_aub.h"
+#include "genxml/gen_macros.h"
 
 /**
  * This file provides the blorp pipeline setup and execution functionality.
@@ -50,29 +53,49 @@ blorp_emit_reloc(struct blorp_batch *batch,
 
 static void *
 blorp_alloc_dynamic_state(struct blorp_batch *batch,
-                          enum aub_state_struct_type type,
                           uint32_t size,
                           uint32_t alignment,
                           uint32_t *offset);
 static void *
 blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                           struct blorp_address *addr);
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+                                           const struct blorp_address *addrs,
+                                           unsigned num_vbs);
+
+#if GEN_GEN >= 8
+static struct blorp_address
+blorp_get_workaround_page(struct blorp_batch *batch);
+#endif
 
 static void
 blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
                           unsigned state_size, unsigned state_alignment,
                           uint32_t *bt_offset, uint32_t *surface_offsets,
                           void **surface_maps);
+
+static void
+blorp_flush_range(struct blorp_batch *batch, void *start, size_t size);
+
 static void
 blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                     struct blorp_address address, uint32_t delta);
 
+#if GEN_GEN >= 7 && GEN_GEN < 10
+static struct blorp_address
+blorp_get_surface_base_address(struct blorp_batch *batch);
+#endif
+
 static void
-blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size);
+blorp_emit_urb_config(struct blorp_batch *batch,
+                      unsigned vs_entry_size, unsigned sf_entry_size);
 
-/***** BEGIN blorp_exec implementation ******/
+static void
+blorp_emit_pipeline(struct blorp_batch *batch,
+                    const struct blorp_params *params);
 
-#include "genxml/gen_macros.h"
+/***** BEGIN blorp_exec implementation ******/
 
 static uint64_t
 _blorp_combine_address(struct blorp_batch *batch, void *location,
@@ -103,16 +126,30 @@ _blorp_combine_address(struct blorp_batch *batch, void *location,
         _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
         _dst = NULL)
 
-#define blorp_emitn(batch, cmd, n) ({                    \
-      uint32_t *_dw = blorp_emit_dwords(batch, n);       \
-      struct cmd template = {                            \
-         _blorp_cmd_header(cmd),                         \
-         .DWordLength = n - _blorp_cmd_length_bias(cmd), \
-      };                                                 \
-      _blorp_cmd_pack(cmd)(batch, _dw, &template);       \
-      _dw + 1; /* Array starts at dw[1] */               \
+#define blorp_emitn(batch, cmd, n) ({                       \
+      uint32_t *_dw = blorp_emit_dwords(batch, n);          \
+      if (_dw) {                                            \
+         struct cmd template = {                            \
+            _blorp_cmd_header(cmd),                         \
+            .DWordLength = n - _blorp_cmd_length_bias(cmd), \
+         };                                                 \
+         _blorp_cmd_pack(cmd)(batch, _dw, &template);       \
+      }                                                     \
+      _dw ? _dw + 1 : NULL; /* Array starts at dw[1] */     \
    })
 
+#define STRUCT_ZERO(S) ({ struct S t; memset(&t, 0, sizeof(t)); t; })
+
+#define blorp_emit_dynamic(batch, state, name, align, offset)      \
+   for (struct state name = STRUCT_ZERO(state),                         \
+        *_dst = blorp_alloc_dynamic_state(batch,                   \
+                                          _blorp_cmd_length(state) * 4, \
+                                          align, offset);               \
+        __builtin_expect(_dst != NULL, 1);                              \
+        _blorp_cmd_pack(state)(batch, (void *)_dst, &name),             \
+        blorp_flush_range(batch, _dst, _blorp_cmd_length(state) * 4),   \
+        _dst = NULL)
+
 /* 3DSTATE_URB
  * 3DSTATE_URB_VS
  * 3DSTATE_URB_HS
@@ -161,9 +198,20 @@ emit_urb_config(struct blorp_batch *batch,
    /* The URB size is expressed in units of 64 bytes (512 bits) */
    const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);
 
-   blorp_emit_urb_config(batch, vs_entry_size);
+   const unsigned sf_entry_size =
+      params->sf_prog_data ? params->sf_prog_data->urb_entry_size : 0;
+
+   blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
 }
 
+#if GEN_GEN >= 7
+static void
+blorp_emit_memcpy(struct blorp_batch *batch,
+                  struct blorp_address dst,
+                  struct blorp_address src,
+                  uint32_t size);
+#endif
+
 static void
 blorp_emit_vertex_data(struct blorp_batch *batch,
                        const struct blorp_params *params,
@@ -171,14 +219,15 @@ blorp_emit_vertex_data(struct blorp_batch *batch,
                        uint32_t *size)
 {
    const float vertices[] = {
-      /* v0 */ (float)params->x1, (float)params->y1,
-      /* v1 */ (float)params->x0, (float)params->y1,
-      /* v2 */ (float)params->x0, (float)params->y0,
+      /* v0 */ (float)params->x1, (float)params->y1, params->z,
+      /* v1 */ (float)params->x0, (float)params->y1, params->z,
+      /* v2 */ (float)params->x0, (float)params->y0, params->z,
    };
 
    void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
    memcpy(data, vertices, sizeof(vertices));
    *size = sizeof(vertices);
+   blorp_flush_range(batch, data, *size);
 }
 
 static void
@@ -190,78 +239,122 @@ blorp_emit_input_varying_data(struct blorp_batch *batch,
    const unsigned vec4_size_in_bytes = 4 * sizeof(float);
    const unsigned max_num_varyings =
       DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
-   const unsigned num_varyings = params->wm_prog_data->num_varying_inputs;
+   const unsigned num_varyings =
+      params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
 
-   *size = num_varyings * vec4_size_in_bytes;
+   *size = 16 + num_varyings * vec4_size_in_bytes;
 
-   const float *const inputs_src = (const float *)&params->wm_inputs;
-   float *inputs = blorp_alloc_vertex_buffer(batch, *size, addr);
+   const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
+   void *data = blorp_alloc_vertex_buffer(batch, *size, addr);
+   uint32_t *inputs = data;
 
-   /* Walk over the attribute slots, determine if the attribute is used by
-    * the program and when necessary copy the values from the input storage to
-    * the vertex data buffer.
-    */
-   for (unsigned i = 0; i < max_num_varyings; i++) {
-      const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
+   /* Copy in the VS inputs */
+   assert(sizeof(params->vs_inputs) == 16);
+   memcpy(inputs, &params->vs_inputs, sizeof(params->vs_inputs));
+   inputs += 4;
+
+   if (params->wm_prog_data) {
+      /* Walk over the attribute slots, determine if the attribute is used by
+       * the program and when necessary copy the values from the input storage
+       * to the vertex data buffer.
+       */
+      for (unsigned i = 0; i < max_num_varyings; i++) {
+         const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
+
+         const int input_index = params->wm_prog_data->urb_setup[attr];
+         if (input_index < 0)
+            continue;
+
+         memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
+
+         inputs += 4;
+      }
+   }
 
-      if (!(params->wm_prog_data->inputs_read & (1ull << attr)))
-         continue;
+   blorp_flush_range(batch, data, *size);
+
+   if (params->dst_clear_color_as_input) {
+#if GEN_GEN >= 7
+      /* In this case, the clear color isn't known statically and instead
+       * comes in through an indirect which we have to copy into the vertex
+       * buffer before we execute the 3DPRIMITIVE.  We already copied the
+       * value of params->wm_inputs.clear_color into the vertex buffer in the
+       * loop above.  Now we emit code to stomp it from the GPU with the
+       * actual clear color value.
+       */
+      assert(num_varyings == 1);
 
-      memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
+      /* The clear color is the first thing after the header */
+      struct blorp_address clear_color_input_addr = *addr;
+      clear_color_input_addr.offset += 16;
 
-      inputs += 4;
+      const unsigned clear_color_size =
+         GEN_GEN < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
+      blorp_emit_memcpy(batch, clear_color_input_addr,
+                        params->dst.clear_color_addr,
+                        clear_color_size);
+#else
+      unreachable("MCS partial resolve is not a thing on SNB and earlier");
+#endif
    }
 }
 
 static void
-blorp_emit_vertex_buffers(struct blorp_batch *batch,
-                          const struct blorp_params *params)
+blorp_fill_vertex_buffer_state(struct blorp_batch *batch,
+                               struct GENX(VERTEX_BUFFER_STATE) *vb,
+                               unsigned idx,
+                               struct blorp_address addr, uint32_t size,
+                               uint32_t stride)
 {
-   struct GENX(VERTEX_BUFFER_STATE) vb[2];
-   memset(vb, 0, sizeof(vb));
+   vb[idx].VertexBufferIndex = idx;
+   vb[idx].BufferStartingAddress = addr;
+   vb[idx].BufferPitch = stride;
 
-   unsigned num_buffers = 1;
-
-   uint32_t size;
-   blorp_emit_vertex_data(batch, params, &vb[0].BufferStartingAddress, &size);
-   vb[0].VertexBufferIndex = 0;
-   vb[0].BufferPitch = 2 * sizeof(float);
-   vb[0].VertexBufferMOCS = batch->blorp->mocs.vb;
-#if GEN_GEN >= 7
-   vb[0].AddressModifyEnable = true;
-#endif
-#if GEN_GEN >= 8
-   vb[0].BufferSize = size;
-#else
-   vb[0].BufferAccessType = VERTEXDATA;
-   vb[0].EndAddress = vb[0].BufferStartingAddress;
-   vb[0].EndAddress.offset += size - 1;
+#if GEN_GEN >= 6
+   vb[idx].MOCS = addr.mocs;
 #endif
 
-   if (params->wm_prog_data && params->wm_prog_data->num_varying_inputs) {
-      blorp_emit_input_varying_data(batch, params,
-                                    &vb[1].BufferStartingAddress, &size);
-      vb[1].VertexBufferIndex = 1;
-      vb[1].BufferPitch = 0;
-      vb[1].VertexBufferMOCS = batch->blorp->mocs.vb;
 #if GEN_GEN >= 7
-      vb[1].AddressModifyEnable = true;
+   vb[idx].AddressModifyEnable = true;
 #endif
+
 #if GEN_GEN >= 8
-      vb[1].BufferSize = size;
-#else
-      vb[1].BufferAccessType = INSTANCEDATA;
-      vb[1].EndAddress = vb[1].BufferStartingAddress;
-      vb[1].EndAddress.offset += size - 1;
+   vb[idx].BufferSize = size;
+#elif GEN_GEN >= 5
+   vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
+   vb[idx].EndAddress = vb[idx].BufferStartingAddress;
+   vb[idx].EndAddress.offset += size - 1;
+#elif GEN_GEN == 4
+   vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
+   vb[idx].MaxIndex = stride > 0 ? size / stride : 0;
 #endif
-      num_buffers++;
-   }
+}
 
-   const unsigned num_dwords =
-      1 + GENX(VERTEX_BUFFER_STATE_length) * num_buffers;
+static void
+blorp_emit_vertex_buffers(struct blorp_batch *batch,
+                          const struct blorp_params *params)
+{
+   struct GENX(VERTEX_BUFFER_STATE) vb[3];
+   uint32_t num_vbs = 2;
+   memset(vb, 0, sizeof(vb));
+
+   struct blorp_address addrs[2] = {};
+   uint32_t size;
+   blorp_emit_vertex_data(batch, params, &addrs[0], &size);
+   blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], size,
+                                  3 * sizeof(float));
+
+   blorp_emit_input_varying_data(batch, params, &addrs[1], &size);
+   blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], size, 0);
+
+   blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, num_vbs);
+
+   const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
+   if (!dw)
+      return;
 
-   for (unsigned i = 0; i < num_buffers; i++) {
+   for (unsigned i = 0; i < num_vbs; i++) {
       GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
       dw += GENX(VERTEX_BUFFER_STATE_length);
    }
@@ -273,7 +366,8 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
 {
    const unsigned num_varyings =
       params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
-   const unsigned num_elements = 2 + num_varyings;
+   bool need_ndc = batch->blorp->compiler->devinfo->gen <= 5;
+   const unsigned num_elements = 2 + need_ndc + num_varyings;
 
    struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
    memset(ve, 0, num_elements * sizeof(*ve));
@@ -324,47 +418,91 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
     *
     * See the vertex element setup below.
     */
-   ve[0].VertexBufferIndex = 0;
-   ve[0].Valid = true;
-   ve[0].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
-   ve[0].SourceElementOffset = 0;
-   ve[0].Component0Control = VFCOMP_STORE_0;
-
-   /* From Gen8 onwards hardware is no more instructed to overwrite components
-    * using an element specifier. Instead one has separate 3DSTATE_VF_SGVS
-    * (System Generated Value Setup) state packet for it.
-    */
+   unsigned slot = 0;
+
+   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
+      .VertexBufferIndex = 1,
+      .Valid = true,
+      .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+      .SourceElementOffset = 0,
+      .Component0Control = VFCOMP_STORE_SRC,
+
+      /* From Gen8 onwards, hardware is no longer instructed to overwrite
+       * components using an element specifier. Instead, there is a separate
+       * 3DSTATE_VF_SGVS (System Generated Value Setup) state packet for it.
+       */
 #if GEN_GEN >= 8
-   ve[0].Component1Control = VFCOMP_STORE_0;
+      .Component1Control = VFCOMP_STORE_0,
+#elif GEN_GEN >= 5
+      .Component1Control = VFCOMP_STORE_IID,
 #else
-   ve[0].Component1Control = VFCOMP_STORE_IID;
+      .Component1Control = VFCOMP_STORE_0,
+#endif
+      .Component2Control = VFCOMP_STORE_0,
+      .Component3Control = VFCOMP_STORE_0,
+#if GEN_GEN <= 5
+      .DestinationElementOffset = slot * 4,
+#endif
+   };
+   slot++;
+
+#if GEN_GEN <= 5
+   /* On Iron Lake and earlier, a normalized device coordinates (NDC) version
+    * of the position goes right after the normal VUE header and before the
+    * position itself.  Since w == 1 for all of our coordinates, this is just
+    * a copy of the position.
+    */
+   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
+      .VertexBufferIndex = 0,
+      .Valid = true,
+      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
+      .SourceElementOffset = 0,
+      .Component0Control = VFCOMP_STORE_SRC,
+      .Component1Control = VFCOMP_STORE_SRC,
+      .Component2Control = VFCOMP_STORE_SRC,
+      .Component3Control = VFCOMP_STORE_1_FP,
+      .DestinationElementOffset = slot * 4,
+   };
+   slot++;
+#endif
+
+   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
+      .VertexBufferIndex = 0,
+      .Valid = true,
+      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
+      .SourceElementOffset = 0,
+      .Component0Control = VFCOMP_STORE_SRC,
+      .Component1Control = VFCOMP_STORE_SRC,
+      .Component2Control = VFCOMP_STORE_SRC,
+      .Component3Control = VFCOMP_STORE_1_FP,
+#if GEN_GEN <= 5
+      .DestinationElementOffset = slot * 4,
 #endif
-   ve[0].Component2Control = VFCOMP_STORE_0;
-   ve[0].Component3Control = VFCOMP_STORE_0;
-
-   ve[1].VertexBufferIndex = 0;
-   ve[1].Valid = true;
-   ve[1].SourceElementFormat = ISL_FORMAT_R32G32_FLOAT;
-   ve[1].SourceElementOffset = 0;
-   ve[1].Component0Control = VFCOMP_STORE_SRC;
-   ve[1].Component1Control = VFCOMP_STORE_SRC;
-   ve[1].Component2Control = VFCOMP_STORE_0;
-   ve[1].Component3Control = VFCOMP_STORE_1_FP;
+   };
+   slot++;
 
    for (unsigned i = 0; i < num_varyings; ++i) {
-      ve[i + 2].VertexBufferIndex = 1;
-      ve[i + 2].Valid = true;
-      ve[i + 2].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
-      ve[i + 2].SourceElementOffset = i * 4 * sizeof(float);
-      ve[i + 2].Component0Control = VFCOMP_STORE_SRC;
-      ve[i + 2].Component1Control = VFCOMP_STORE_SRC;
-      ve[i + 2].Component2Control = VFCOMP_STORE_SRC;
-      ve[i + 2].Component3Control = VFCOMP_STORE_SRC;
+      ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
+         .VertexBufferIndex = 1,
+         .Valid = true,
+         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+         .SourceElementOffset = 16 + i * 4 * sizeof(float),
+         .Component0Control = VFCOMP_STORE_SRC,
+         .Component1Control = VFCOMP_STORE_SRC,
+         .Component2Control = VFCOMP_STORE_SRC,
+         .Component3Control = VFCOMP_STORE_SRC,
+#if GEN_GEN <= 5
+         .DestinationElementOffset = slot * 4,
+#endif
+      };
+      slot++;
    }
 
    const unsigned num_dwords =
       1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
+   if (!dw)
+      return;
 
    for (unsigned i = 0; i < num_elements; i++) {
       GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
@@ -394,11 +532,113 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
 #endif
 }
 
+/* 3DSTATE_VIEWPORT_STATE_POINTERS */
+static uint32_t
+blorp_emit_cc_viewport(struct blorp_batch *batch)
+{
+   uint32_t cc_vp_offset;
+   blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
+      vp.MinimumDepth = 0.0;
+      vp.MaximumDepth = 1.0;
+   }
+
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
+      vsp.CCViewportPointer = cc_vp_offset;
+   }
+#elif GEN_GEN == 6
+   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
+      vsp.CCViewportStateChange = true;
+      vsp.PointertoCC_VIEWPORT = cc_vp_offset;
+   }
+#endif
+
+   return cc_vp_offset;
+}
+
+static uint32_t
+blorp_emit_sampler_state(struct blorp_batch *batch)
+{
+   uint32_t offset;
+   blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
+      sampler.MipModeFilter = MIPFILTER_NONE;
+      sampler.MagModeFilter = MAPFILTER_LINEAR;
+      sampler.MinModeFilter = MAPFILTER_LINEAR;
+      sampler.MinLOD = 0;
+      sampler.MaxLOD = 0;
+      sampler.TCXAddressControlMode = TCM_CLAMP;
+      sampler.TCYAddressControlMode = TCM_CLAMP;
+      sampler.TCZAddressControlMode = TCM_CLAMP;
+      sampler.MaximumAnisotropy = RATIO21;
+      sampler.RAddressMinFilterRoundingEnable = true;
+      sampler.RAddressMagFilterRoundingEnable = true;
+      sampler.VAddressMinFilterRoundingEnable = true;
+      sampler.VAddressMagFilterRoundingEnable = true;
+      sampler.UAddressMinFilterRoundingEnable = true;
+      sampler.UAddressMagFilterRoundingEnable = true;
+#if GEN_GEN > 6
+      sampler.NonnormalizedCoordinateEnable = true;
+#endif
+   }
+
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
+      ssp.PointertoPSSamplerState = offset;
+   }
+#elif GEN_GEN == 6
+   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
+      ssp.VSSamplerStateChange = true;
+      ssp.GSSamplerStateChange = true;
+      ssp.PSSamplerStateChange = true;
+      ssp.PointertoPSSamplerState = offset;
+   }
+#endif
+
+   return offset;
+}
+
+/* What follows is the code for setting up a "pipeline" on Sandy Bridge and
+ * later hardware.  This file will be included by i965 for gen4-5 as well, so
+ * this code is guarded by GEN_GEN >= 6.
+ */
+#if GEN_GEN >= 6
+
+static void
+blorp_emit_vs_config(struct blorp_batch *batch,
+                     const struct blorp_params *params)
+{
+   struct brw_vs_prog_data *vs_prog_data = params->vs_prog_data;
+   assert(!vs_prog_data || GEN_GEN < 11 ||
+          vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
+
+   blorp_emit(batch, GENX(3DSTATE_VS), vs) {
+      if (vs_prog_data) {
+         vs.Enable = true;
+
+         vs.KernelStartPointer = params->vs_prog_kernel;
+
+         vs.DispatchGRFStartRegisterForURBData =
+            vs_prog_data->base.base.dispatch_grf_start_reg;
+         vs.VertexURBEntryReadLength =
+            vs_prog_data->base.urb_read_length;
+         vs.VertexURBEntryReadOffset = 0;
+
+         vs.MaximumNumberofThreads =
+            batch->blorp->isl_dev->info->max_vs_threads - 1;
+
+#if GEN_GEN >= 8
+         vs.SIMD8DispatchEnable =
+            vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
+#endif
+      }
+   }
+}
+
 static void
 blorp_emit_sf_config(struct blorp_batch *batch,
                      const struct blorp_params *params)
 {
-   const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
+   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
 
    /* 3DSTATE_SF
     *
@@ -429,11 +669,16 @@ blorp_emit_sf_config(struct blorp_batch *batch,
 
    blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
       sbe.VertexURBEntryReadOffset = 1;
-      sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
-      sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
+      if (prog_data) {
+         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
+         sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
+         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
+      } else {
+         sbe.NumberofSFOutputAttributes = 0;
+         sbe.VertexURBEntryReadLength = 1;
+      }
       sbe.ForceVertexURBEntryReadLength = true;
       sbe.ForceVertexURBEntryReadOffset = true;
-      sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
 
 #if GEN_GEN >= 9
       for (unsigned i = 0; i < 32; i++)
@@ -447,7 +692,7 @@ blorp_emit_sf_config(struct blorp_batch *batch,
       sf.FrontFaceFillMode = FILL_MODE_SOLID;
       sf.BackFaceFillMode = FILL_MODE_SOLID;
 
-      sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ?
+      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
          MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
 
 #if GEN_GEN == 7
@@ -473,7 +718,7 @@ blorp_emit_sf_config(struct blorp_batch *batch,
       sf.FrontFaceFillMode = FILL_MODE_SOLID;
       sf.BackFaceFillMode = FILL_MODE_SOLID;
 
-      sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ?
+      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
          MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
 
       sf.VertexURBEntryReadOffset = 1;
@@ -494,7 +739,7 @@ static void
 blorp_emit_ps_config(struct blorp_batch *batch,
                      const struct blorp_params *params)
 {
-   const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
+   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
 
    /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
     * nonzero to prevent the GPU from hanging.  While the documentation doesn't
@@ -510,30 +755,60 @@ blorp_emit_ps_config(struct blorp_batch *batch,
    blorp_emit(batch, GENX(3DSTATE_WM), wm);
 
    blorp_emit(batch, GENX(3DSTATE_PS), ps) {
-      if (params->src.addr.buffer) {
+      if (params->src.enabled) {
          ps.SamplerCount = 1; /* Up to 4 samplers */
          ps.BindingTableEntryCount = 2;
       } else {
          ps.BindingTableEntryCount = 1;
       }
 
-      ps.DispatchGRFStartRegisterForConstantSetupData0 =
-         prog_data->first_curbe_grf_0;
-      ps.DispatchGRFStartRegisterForConstantSetupData2 =
-         prog_data->first_curbe_grf_2;
+      /* Gen11 workarounds table #2056 WABTPPrefetchDisable suggests
+       * disabling prefetching of binding tables on A0 and B0 steppings.
+       * TODO: Revisit this WA on C0 stepping.
+       */
+      if (GEN_GEN == 11)
+         ps.BindingTableEntryCount = 0;
 
-      ps._8PixelDispatchEnable = prog_data->dispatch_8;
-      ps._16PixelDispatchEnable = prog_data->dispatch_16;
-
-      ps.KernelStartPointer0 = params->wm_prog_kernel;
-      ps.KernelStartPointer2 =
-         params->wm_prog_kernel + prog_data->ksp_offset_2;
+      if (prog_data) {
+         ps._8PixelDispatchEnable = prog_data->dispatch_8;
+         ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+         /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+          *
+          *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+          *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+          *
+          * Since 16x MSAA is first introduced on SKL, we don't need to apply
+          * the workaround on any older hardware.
+          */
+         if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+             params->num_samples == 16) {
+            assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+            ps._32PixelDispatchEnable = false;
+         }
+
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
+      }
 
-      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
-       * it implicitly scales for different GT levels (which have some # of
-       * PSDs).
+      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
+       * for pre-Gen11 and 128 for Gen11+.  On Gen11+, a programmed value of
+       * k implies 2(k+1) threads.  It implicitly scales for different GT
+       * levels (which have some # of PSDs).
        *
-       * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
+       * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
        */
       if (GEN_GEN >= 9)
          ps.MaximumNumberofThreadsPerPSD = 64 - 1;
@@ -541,21 +816,21 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          ps.MaximumNumberofThreadsPerPSD = 64 - 2;
 
       switch (params->fast_clear_op) {
-      case BLORP_FAST_CLEAR_OP_NONE:
+      case ISL_AUX_OP_NONE:
          break;
 #if GEN_GEN >= 9
-      case BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL:
+      case ISL_AUX_OP_PARTIAL_RESOLVE:
          ps.RenderTargetResolveType = RESOLVE_PARTIAL;
          break;
-      case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+      case ISL_AUX_OP_FULL_RESOLVE:
          ps.RenderTargetResolveType = RESOLVE_FULL;
          break;
 #else
-      case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+      case ISL_AUX_OP_FULL_RESOLVE:
          ps.RenderTargetResolveEnable = true;
          break;
 #endif
-      case BLORP_FAST_CLEAR_OP_CLEAR:
+      case ISL_AUX_OP_FAST_CLEAR:
          ps.RenderTargetFastClearEnable = true;
          break;
       default:
@@ -564,31 +839,30 @@ blorp_emit_ps_config(struct blorp_batch *batch,
    }
 
    blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
-      psx.PixelShaderValid = true;
+      if (prog_data) {
+         psx.PixelShaderValid = true;
+         psx.AttributeEnable = prog_data->num_varying_inputs > 0;
+         psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
+      }
 
-      if (params->src.addr.buffer)
+      if (params->src.enabled)
          psx.PixelShaderKillsPixel = true;
-
-      psx.AttributeEnable = prog_data->num_varying_inputs > 0;
-
-      if (prog_data && prog_data->persample_msaa_dispatch)
-         psx.PixelShaderIsPerSample = true;
    }
 
 #elif GEN_GEN >= 7
 
    blorp_emit(batch, GENX(3DSTATE_WM), wm) {
       switch (params->hiz_op) {
-      case BLORP_HIZ_OP_DEPTH_CLEAR:
+      case ISL_AUX_OP_FAST_CLEAR:
          wm.DepthBufferClear = true;
          break;
-      case BLORP_HIZ_OP_DEPTH_RESOLVE:
+      case ISL_AUX_OP_FULL_RESOLVE:
          wm.DepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_HIZ_RESOLVE:
+      case ISL_AUX_OP_AMBIGUATE:
          wm.HierarchicalDepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_NONE:
+      case ISL_AUX_OP_NONE:
          break;
       default:
          unreachable("not reached");
@@ -597,13 +871,13 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       if (prog_data)
          wm.ThreadDispatchEnable = true;
 
-      if (params->src.addr.buffer)
-         wm.PixelShaderKillPixel = true;
+      if (params->src.enabled)
+         wm.PixelShaderKillsPixel = true;
 
-      if (params->dst.surf.samples > 1) {
+      if (params->num_samples > 1) {
          wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
          wm.MultisampleDispatchMode =
-            (prog_data && prog_data->persample_msaa_dispatch) ?
+            (prog_data && prog_data->persample_dispatch) ?
             MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
       } else {
          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
@@ -620,17 +894,23 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 #endif
 
       if (prog_data) {
-         ps.DispatchGRFStartRegisterforConstantSetupData0 =
-            prog_data->first_curbe_grf_0;
-         ps.DispatchGRFStartRegisterforConstantSetupData2 =
-            prog_data->first_curbe_grf_2;
-
-         ps.KernelStartPointer0 = params->wm_prog_kernel;
-         ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->ksp_offset_2;
-
          ps._8PixelDispatchEnable = prog_data->dispatch_8;
          ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
 
          ps.AttributeEnable = prog_data->num_varying_inputs > 0;
       } else {
@@ -640,16 +920,16 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          ps._16PixelDispatchEnable = true;
       }
 
-      if (params->src.addr.buffer)
+      if (params->src.enabled)
          ps.SamplerCount = 1; /* Up to 4 samplers */
 
       switch (params->fast_clear_op) {
-      case BLORP_FAST_CLEAR_OP_NONE:
+      case ISL_AUX_OP_NONE:
          break;
-      case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+      case ISL_AUX_OP_FULL_RESOLVE:
          ps.RenderTargetResolveEnable = true;
          break;
-      case BLORP_FAST_CLEAR_OP_CLEAR:
+      case ISL_AUX_OP_FAST_CLEAR:
          ps.RenderTargetFastClearEnable = true;
          break;
       default:
@@ -664,16 +944,16 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          batch->blorp->isl_dev->info->max_wm_threads - 1;
 
       switch (params->hiz_op) {
-      case BLORP_HIZ_OP_DEPTH_CLEAR:
+      case ISL_AUX_OP_FAST_CLEAR:
          wm.DepthBufferClear = true;
          break;
-      case BLORP_HIZ_OP_DEPTH_RESOLVE:
+      case ISL_AUX_OP_FULL_RESOLVE:
          wm.DepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_HIZ_RESOLVE:
+      case ISL_AUX_OP_AMBIGUATE:
          wm.HierarchicalDepthBufferResolveEnable = true;
          break;
-      case BLORP_HIZ_OP_NONE:
+      case ISL_AUX_OP_NONE:
          break;
       default:
          unreachable("not reached");
@@ -682,30 +962,36 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       if (prog_data) {
          wm.ThreadDispatchEnable = true;
 
-         wm.DispatchGRFStartRegisterforConstantSetupData0 =
-            prog_data->first_curbe_grf_0;
-         wm.DispatchGRFStartRegisterforConstantSetupData2 =
-            prog_data->first_curbe_grf_2;
-
-         wm.KernelStartPointer0 = params->wm_prog_kernel;
-         wm.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->ksp_offset_2;
-
          wm._8PixelDispatchEnable = prog_data->dispatch_8;
          wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm._32PixelDispatchEnable = prog_data->dispatch_32;
+
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
+         wm.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);
+
+         wm.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+         wm.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+         wm.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 2);
 
          wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
       }
 
-      if (params->src.addr.buffer) {
+      if (params->src.enabled) {
          wm.SamplerCount = 1; /* Up to 4 samplers */
-         wm.PixelShaderKillPixel = true; /* TODO: temporarily smash on */
+         wm.PixelShaderKillsPixel = true; /* TODO: temporarily smash on */
       }
 
-      if (params->dst.surf.samples > 1) {
+      if (params->num_samples > 1) {
          wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
          wm.MultisampleDispatchMode =
-            (prog_data && prog_data->persample_msaa_dispatch) ?
+            (prog_data && prog_data->persample_dispatch) ?
             MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
       } else {
          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
@@ -716,68 +1002,6 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 #endif /* GEN_GEN */
 }
 
-
-static void
-blorp_emit_depth_stencil_config(struct blorp_batch *batch,
-                                const struct blorp_params *params)
-{
-#if GEN_GEN >= 7
-   const uint32_t mocs = 1; /* GEN7_MOCS_L3 */
-#else
-   const uint32_t mocs = 0;
-#endif
-
-   blorp_emit(batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
-      switch (params->depth.surf.dim) {
-      case ISL_SURF_DIM_1D:
-         db.SurfaceType = SURFTYPE_1D;
-         break;
-      case ISL_SURF_DIM_2D:
-         db.SurfaceType = SURFTYPE_2D;
-         break;
-      case ISL_SURF_DIM_3D:
-         db.SurfaceType = SURFTYPE_3D;
-         break;
-      }
-
-      db.SurfaceFormat = params->depth_format;
-
-#if GEN_GEN >= 7
-      db.DepthWriteEnable = true;
-#endif
-
-#if GEN_GEN <= 6
-      db.TiledSurface = true;
-      db.TileWalk = TILEWALK_YMAJOR;
-      db.MIPMapLayoutMode = MIPLAYOUT_BELOW;
-      db.SeparateStencilBufferEnable = true;
-#endif
-
-      db.HierarchicalDepthBufferEnable = true;
-
-      db.Width = params->depth.surf.logical_level0_px.width - 1;
-      db.Height = params->depth.surf.logical_level0_px.height - 1;
-      db.RenderTargetViewExtent = db.Depth =
-         MAX2(params->depth.surf.logical_level0_px.depth,
-              params->depth.surf.logical_level0_px.array_len) - 1;
-
-      db.LOD = params->depth.view.base_level;
-      db.MinimumArrayElement = params->depth.view.base_array_layer;
-
-      db.SurfacePitch = params->depth.surf.row_pitch - 1;
-      db.SurfaceBaseAddress = params->depth.addr;
-      db.DepthBufferMOCS = mocs;
-   }
-
-   blorp_emit(batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hiz) {
-      hiz.SurfacePitch = params->depth.aux_surf.row_pitch - 1;
-      hiz.SurfaceBaseAddress = params->depth.aux_addr;
-      hiz.HierarchicalDepthBufferMOCS = mocs;
-   }
-
-   blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
-}
-
 static uint32_t
 blorp_emit_blend_state(struct blorp_batch *batch,
                        const struct blorp_params *params)
@@ -785,22 +1009,31 @@ blorp_emit_blend_state(struct blorp_batch *batch,
    struct GENX(BLEND_STATE) blend;
    memset(&blend, 0, sizeof(blend));
 
+   uint32_t offset;
+   int size = GENX(BLEND_STATE_length) * 4;
+   size += GENX(BLEND_STATE_ENTRY_length) * 4 * params->num_draw_buffers;
+   uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
+   uint32_t *pos = state;
+
+   GENX(BLEND_STATE_pack)(NULL, pos, &blend);
+   pos += GENX(BLEND_STATE_length);
+
    for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
-      blend.Entry[i].PreBlendColorClampEnable = true;
-      blend.Entry[i].PostBlendColorClampEnable = true;
-      blend.Entry[i].ColorClampRange = COLORCLAMP_RTFORMAT;
-
-      blend.Entry[i].WriteDisableRed = params->color_write_disable[0];
-      blend.Entry[i].WriteDisableGreen = params->color_write_disable[1];
-      blend.Entry[i].WriteDisableBlue = params->color_write_disable[2];
-      blend.Entry[i].WriteDisableAlpha = params->color_write_disable[3];
+      struct GENX(BLEND_STATE_ENTRY) entry = {
+         .PreBlendColorClampEnable = true,
+         .PostBlendColorClampEnable = true,
+         .ColorClampRange = COLORCLAMP_RTFORMAT,
+
+         .WriteDisableRed = params->color_write_disable[0],
+         .WriteDisableGreen = params->color_write_disable[1],
+         .WriteDisableBlue = params->color_write_disable[2],
+         .WriteDisableAlpha = params->color_write_disable[3],
+      };
+      GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
+      pos += GENX(BLEND_STATE_ENTRY_length);
    }
 
-   uint32_t offset;
-   void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_BLEND_STATE,
-                                           GENX(BLEND_STATE_length) * 4,
-                                           64, &offset);
-   GENX(BLEND_STATE_pack)(NULL, state, &blend);
+   blorp_flush_range(batch, state, size);
 
 #if GEN_GEN >= 7
    blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
@@ -822,13 +1055,14 @@ blorp_emit_blend_state(struct blorp_batch *batch,
 
 static uint32_t
 blorp_emit_color_calc_state(struct blorp_batch *batch,
-                            const struct blorp_params *params)
+                            MAYBE_UNUSED const struct blorp_params *params)
 {
    uint32_t offset;
-   void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_CC_STATE,
-                                           GENX(COLOR_CALC_STATE_length) * 4,
-                                           64, &offset);
-   memset(state, 0, GENX(COLOR_CALC_STATE_length) * 4);
+   blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
+#if GEN_GEN <= 8
+      cc.StencilReferenceValue = params->stencil_ref;
+#endif
+   }
 
 #if GEN_GEN >= 7
    blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
@@ -847,88 +1081,294 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch,
                                const struct blorp_params *params)
 {
 #if GEN_GEN >= 8
+   struct GENX(3DSTATE_WM_DEPTH_STENCIL) ds = {
+      GENX(3DSTATE_WM_DEPTH_STENCIL_header),
+   };
+#else
+   struct GENX(DEPTH_STENCIL_STATE) ds = { 0 };
+#endif
 
-   /* On gen8+, DEPTH_STENCIL state is simply an instruction */
-   blorp_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds);
-   return 0;
+   if (params->depth.enabled) {
+      ds.DepthBufferWriteEnable = true;
 
-#else /* GEN_GEN <= 7 */
+      switch (params->hiz_op) {
+      case ISL_AUX_OP_NONE:
+         ds.DepthTestEnable = true;
+         ds.DepthTestFunction = COMPAREFUNCTION_ALWAYS;
+         break;
 
-   /* See the following sections of the Sandy Bridge PRM, Volume 1, Part2:
-    *   - 7.5.3.1 Depth Buffer Clear
-    *   - 7.5.3.2 Depth Buffer Resolve
-    *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
-    */
-   struct GENX(DEPTH_STENCIL_STATE) ds = {
-      .DepthBufferWriteEnable = true,
-   };
+      /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
+       *   - 7.5.3.1 Depth Buffer Clear
+       *   - 7.5.3.2 Depth Buffer Resolve
+       *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
+       */
+      case ISL_AUX_OP_FULL_RESOLVE:
+         ds.DepthTestEnable = true;
+         ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
+         break;
+
+      case ISL_AUX_OP_FAST_CLEAR:
+      case ISL_AUX_OP_AMBIGUATE:
+         ds.DepthTestEnable = false;
+         break;
+      case ISL_AUX_OP_PARTIAL_RESOLVE:
+         unreachable("Invalid HIZ op");
+      }
+   }
+
+   if (params->stencil.enabled) {
+      ds.StencilBufferWriteEnable = true;
+      ds.StencilTestEnable = true;
+      ds.DoubleSidedStencilEnable = false;
+
+      ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
+      ds.StencilPassDepthPassOp = STENCILOP_REPLACE;
 
-   if (params->hiz_op == BLORP_HIZ_OP_DEPTH_RESOLVE) {
-      ds.DepthTestEnable = true;
-      ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
+      ds.StencilWriteMask = params->stencil_mask;
+#if GEN_GEN >= 9
+      ds.StencilReferenceValue = params->stencil_ref;
+#endif
    }
 
+#if GEN_GEN >= 8
+   uint32_t offset = 0;
+   uint32_t *dw = blorp_emit_dwords(batch,
+                                    GENX(3DSTATE_WM_DEPTH_STENCIL_length));
+   if (!dw)
+      return 0;
+
+   GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dw, &ds);
+#else
    uint32_t offset;
-   void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_DEPTH_STENCIL_STATE,
+   void *state = blorp_alloc_dynamic_state(batch,
                                            GENX(DEPTH_STENCIL_STATE_length) * 4,
                                            64, &offset);
    GENX(DEPTH_STENCIL_STATE_pack)(NULL, state, &ds);
+   blorp_flush_range(batch, state, GENX(DEPTH_STENCIL_STATE_length) * 4);
+#endif
 
-#if GEN_GEN >= 7
+#if GEN_GEN == 7
    blorp_emit(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), sp) {
       sp.PointertoDEPTH_STENCIL_STATE = offset;
    }
 #endif
 
    return offset;
-
-#endif /* GEN_GEN */
 }
 
-struct surface_state_info {
-   unsigned num_dwords;
-   unsigned ss_align; /* Required alignment of RENDER_SURFACE_STATE in bytes */
-   unsigned reloc_dw;
-   unsigned aux_reloc_dw;
-};
-
-static const struct surface_state_info surface_state_infos[] = {
-   [6] = {6,  32, 1,  0},
-   [7] = {8,  32, 1,  6},
-   [8] = {13, 64, 8,  10},
-   [9] = {16, 64, 8,  10},
-};
-
 static void
-blorp_emit_surface_state(struct blorp_batch *batch,
-                         const struct brw_blorp_surface_info *surface,
-                         uint32_t *state, uint32_t state_offset,
-                         bool is_render_target)
+blorp_emit_3dstate_multisample(struct blorp_batch *batch,
+                               const struct blorp_params *params)
 {
-   const struct surface_state_info ss_info = surface_state_infos[GEN_GEN];
+   blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
+      ms.NumberofMultisamples       = __builtin_ffs(params->num_samples) - 1;
 
-   struct isl_surf surf = surface->surf;
+#if GEN_GEN >= 8
+      /* The PRM says that this bit is valid only for DX9:
+       *
+       *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
+       *    should not have any effect by setting or not setting this bit.
+       */
+      ms.PixelPositionOffsetEnable  = false;
+#elif GEN_GEN >= 7
 
-   if (surf.dim == ISL_SURF_DIM_1D &&
-       surf.dim_layout == ISL_DIM_LAYOUT_GEN4_2D) {
-      assert(surf.logical_level0_px.height == 1);
-      surf.dim = ISL_SURF_DIM_2D;
+      switch (params->num_samples) {
+      case 1:
+         GEN_SAMPLE_POS_1X(ms.Sample);
+         break;
+      case 2:
+         GEN_SAMPLE_POS_2X(ms.Sample);
+         break;
+      case 4:
+         GEN_SAMPLE_POS_4X(ms.Sample);
+         break;
+      case 8:
+         GEN_SAMPLE_POS_8X(ms.Sample);
+         break;
+      default:
+         break;
+      }
+#else
+      GEN_SAMPLE_POS_4X(ms.Sample);
+#endif
+      ms.PixelLocation              = CENTER;
    }
+}
 
-   /* Blorp doesn't support HiZ in any of the blit or slow-clear paths */
+static void
+blorp_emit_pipeline(struct blorp_batch *batch,
+                    const struct blorp_params *params)
+{
+   uint32_t blend_state_offset = 0;
+   uint32_t color_calc_state_offset;
+   uint32_t depth_stencil_state_offset;
+
+   emit_urb_config(batch, params);
+
+   if (params->wm_prog_data) {
+      blend_state_offset = blorp_emit_blend_state(batch, params);
+   }
+   color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
+   depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
+
+#if GEN_GEN == 6
+   /* 3DSTATE_CC_STATE_POINTERS
+    *
+    * The pointer offsets are relative to
+    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
+    *
+    * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
+    *
+    * The dynamic state emit helpers emit their own STATE_POINTERS packets on
+    * gen7+.  However, on gen6 and earlier, they're all lumped together in
+    * one CC_STATE_POINTERS packet so we have to emit that here.
+    */
+   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
+      cc.BLEND_STATEChange = true;
+      cc.ColorCalcStatePointerValid = true;
+      cc.DEPTH_STENCIL_STATEChange = true;
+      cc.PointertoBLEND_STATE = blend_state_offset;
+      cc.ColorCalcStatePointer = color_calc_state_offset;
+      cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
+   }
+#else
+   (void)blend_state_offset;
+   (void)color_calc_state_offset;
+   (void)depth_stencil_state_offset;
+#endif
+
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), DS);
+#endif
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
+
+   if (params->src.enabled)
+      blorp_emit_sampler_state(batch);
+
+   blorp_emit_3dstate_multisample(batch, params);
+
+   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
+      mask.SampleMask = (1 << params->num_samples) - 1;
+   }
+
+   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
+    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
+    *
+    *   [DevSNB] A pipeline flush must be programmed prior to a
+    *   3DSTATE_VS command that causes the VS Function Enable to
+    *   toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
+    *   command with CS stall bit set and a post sync operation.
+    *
+    * We've already done one at the start of the BLORP operation.
+    */
+   blorp_emit_vs_config(batch, params);
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_HS), hs);
+   blorp_emit(batch, GENX(3DSTATE_TE), te);
+   blorp_emit(batch, GENX(3DSTATE_DS), DS);
+   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
+#endif
+   blorp_emit(batch, GENX(3DSTATE_GS), gs);
+
+   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
+      clip.PerspectiveDivideDisable = true;
+   }
+
+   blorp_emit_sf_config(batch, params);
+   blorp_emit_ps_config(batch, params);
+
+   blorp_emit_cc_viewport(batch);
+}
+
+/******** This is the end of the pipeline setup code ********/
+
+#endif /* GEN_GEN >= 6 */
+
+#if GEN_GEN >= 7
+static void
+blorp_emit_memcpy(struct blorp_batch *batch,
+                  struct blorp_address dst,
+                  struct blorp_address src,
+                  uint32_t size)
+{
+   assert(size % 4 == 0);
+
+   for (unsigned dw = 0; dw < size; dw += 4) {
+#if GEN_GEN >= 8
+      blorp_emit(batch, GENX(MI_COPY_MEM_MEM), cp) {
+         cp.DestinationMemoryAddress = dst;
+         cp.SourceMemoryAddress = src;
+      }
+#else
+      /* IVB does not have a general purpose register for command streamer
+       * commands. Therefore, we use an alternate temporary register.
+       */
+#define BLORP_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
+      blorp_emit(batch, GENX(MI_LOAD_REGISTER_MEM), load) {
+         load.RegisterAddress = BLORP_TEMP_REG;
+         load.MemoryAddress = src;
+      }
+      blorp_emit(batch, GENX(MI_STORE_REGISTER_MEM), store) {
+         store.RegisterAddress = BLORP_TEMP_REG;
+         store.MemoryAddress = dst;
+      }
+#undef BLORP_TEMP_REG
+#endif
+      dst.offset += 4;
+      src.offset += 4;
+   }
+}
+#endif
+
+static void
+blorp_emit_surface_state(struct blorp_batch *batch,
+                         const struct brw_blorp_surface_info *surface,
+                         enum isl_aux_op op,
+                         void *state, uint32_t state_offset,
+                         const bool color_write_disables[4],
+                         bool is_render_target)
+{
+   const struct isl_device *isl_dev = batch->blorp->isl_dev;
+   struct isl_surf surf = surface->surf;
+
+   if (surf.dim == ISL_SURF_DIM_1D &&
+       surf.dim_layout == ISL_DIM_LAYOUT_GEN4_2D) {
+      assert(surf.logical_level0_px.height == 1);
+      surf.dim = ISL_SURF_DIM_2D;
+   }
+
+   /* Blorp doesn't support HiZ in any of the blit or slow-clear paths */
    enum isl_aux_usage aux_usage = surface->aux_usage;
    if (aux_usage == ISL_AUX_USAGE_HIZ)
       aux_usage = ISL_AUX_USAGE_NONE;
 
-   const uint32_t mocs =
-      is_render_target ? batch->blorp->mocs.rb : batch->blorp->mocs.tex;
+   isl_channel_mask_t write_disable_mask = 0;
+   if (is_render_target && GEN_GEN <= 5) {
+      if (color_write_disables[0])
+         write_disable_mask |= ISL_CHANNEL_RED_BIT;
+      if (color_write_disables[1])
+         write_disable_mask |= ISL_CHANNEL_GREEN_BIT;
+      if (color_write_disables[2])
+         write_disable_mask |= ISL_CHANNEL_BLUE_BIT;
+      if (color_write_disables[3])
+         write_disable_mask |= ISL_CHANNEL_ALPHA_BIT;
+   }
+
+   const bool use_clear_address =
+      GEN_GEN >= 10 && (surface->clear_color_addr.buffer != NULL);
 
    isl_surf_fill_state(batch->blorp->isl_dev, state,
                        .surf = &surf, .view = &surface->view,
                        .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
-                       .mocs = mocs, .clear_color = surface->clear_color);
+                       .mocs = surface->addr.mocs,
+                       .clear_color = surface->clear_color,
+                       .use_clear_address = use_clear_address,
+                       .write_disables = write_disable_mask);
 
-   blorp_surface_reloc(batch, state_offset + ss_info.reloc_dw * 4,
+   blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                        surface->addr, 0);
 
    if (aux_usage != ISL_AUX_USAGE_NONE) {
@@ -937,161 +1377,382 @@ blorp_emit_surface_state(struct blorp_batch *batch,
        * surface buffer addresses are always 4K page alinged.
        */
       assert((surface->aux_addr.offset & 0xfff) == 0);
-      blorp_surface_reloc(batch, state_offset + ss_info.aux_reloc_dw * 4,
-                          surface->aux_addr, state[ss_info.aux_reloc_dw]);
+      uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
+      blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
+                          surface->aux_addr, *aux_addr);
    }
+
+   if (surface->clear_color_addr.buffer) {
+#if GEN_GEN >= 10
+      assert((surface->clear_color_addr.offset & 0x3f) == 0);
+      uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
+      blorp_surface_reloc(batch, state_offset +
+                          isl_dev->ss.clear_color_state_offset,
+                          surface->clear_color_addr, *clear_addr);
+#elif GEN_GEN >= 7
+      if (op == ISL_AUX_OP_FULL_RESOLVE || op == ISL_AUX_OP_PARTIAL_RESOLVE) {
+         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
+         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
+         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
+                           isl_dev->ss.clear_value_size);
+      }
+#else
+      unreachable("Fast clears are only supported on gen7+");
+#endif
+   }
+
+   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
+}
+
+static void
+blorp_emit_null_surface_state(struct blorp_batch *batch,
+                              const struct brw_blorp_surface_info *surface,
+                              uint32_t *state)
+{
+   struct GENX(RENDER_SURFACE_STATE) ss = {
+      .SurfaceType = SURFTYPE_NULL,
+      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
+      .Width = surface->surf.logical_level0_px.width - 1,
+      .Height = surface->surf.logical_level0_px.height - 1,
+      .MIPCountLOD = surface->view.base_level,
+      .MinimumArrayElement = surface->view.base_array_layer,
+      .Depth = surface->view.array_len - 1,
+      .RenderTargetViewExtent = surface->view.array_len - 1,
+#if GEN_GEN >= 6
+      .NumberofMultisamples = ffs(surface->surf.samples) - 1,
+#endif
+
+#if GEN_GEN >= 7
+      .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,
+#endif
+
+#if GEN_GEN >= 8
+      .TileMode = YMAJOR,
+#else
+      .TiledSurface = true,
+#endif
+   };
+
+   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);
+
+   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
 }
 
 static void
 blorp_emit_surface_states(struct blorp_batch *batch,
                           const struct blorp_params *params)
 {
-   uint32_t bind_offset, surface_offsets[2];
+   const struct isl_device *isl_dev = batch->blorp->isl_dev;
+   uint32_t bind_offset = 0, surface_offsets[2];
    void *surface_maps[2];
 
-   const unsigned ss_size = GENX(RENDER_SURFACE_STATE_length) * 4;
-   const unsigned ss_align = GENX(RENDER_SURFACE_STATE_length) > 8 ? 64 : 32;
+   MAYBE_UNUSED bool has_indirect_clear_color = false;
+   if (params->use_pre_baked_binding_table) {
+      bind_offset = params->pre_baked_binding_table_offset;
+   } else {
+      unsigned num_surfaces = 1 + params->src.enabled;
+      blorp_alloc_binding_table(batch, num_surfaces,
+                                isl_dev->ss.size, isl_dev->ss.align,
+                                &bind_offset, surface_offsets, surface_maps);
+
+      if (params->dst.enabled) {
+         blorp_emit_surface_state(batch, &params->dst,
+                                  params->fast_clear_op,
+                                  surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
+                                  surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
+                                  params->color_write_disable, true);
+         if (params->dst.clear_color_addr.buffer != NULL)
+            has_indirect_clear_color = true;
+      } else {
+         assert(params->depth.enabled || params->stencil.enabled);
+         const struct brw_blorp_surface_info *surface =
+            params->depth.enabled ? &params->depth : &params->stencil;
+         blorp_emit_null_surface_state(batch, surface,
+                                       surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
+      }
 
-   unsigned num_surfaces = 1 + (params->src.addr.buffer != NULL);
-   blorp_alloc_binding_table(batch, num_surfaces, ss_size, ss_align,
-                             &bind_offset, surface_offsets, surface_maps);
+      if (params->src.enabled) {
+         blorp_emit_surface_state(batch, &params->src,
+                                  params->fast_clear_op,
+                                  surface_maps[BLORP_TEXTURE_BT_INDEX],
+                                  surface_offsets[BLORP_TEXTURE_BT_INDEX],
+                                  NULL, false);
+         if (params->src.clear_color_addr.buffer != NULL)
+            has_indirect_clear_color = true;
+      }
+   }
 
-   blorp_emit_surface_state(batch, &params->dst,
-                            surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
-                            surface_offsets[BLORP_RENDERBUFFER_BT_INDEX], true);
-   if (params->src.addr.buffer) {
-      blorp_emit_surface_state(batch, &params->src,
-                               surface_maps[BLORP_TEXTURE_BT_INDEX],
-                               surface_offsets[BLORP_TEXTURE_BT_INDEX], false);
+#if GEN_GEN >= 7
+   if (has_indirect_clear_color) {
+      /* Updating a surface state object may require that the state cache be
+       * invalidated. From the SKL PRM, Shared Functions -> State -> State
+       * Caching:
+       *
+       *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
+       *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
+       *    modified [...], the L1 state cache must be invalidated to ensure
+       *    the new surface or sampler state is fetched from system memory.
+       */
+      blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
+         pipe.StateCacheInvalidationEnable = true;
+      }
    }
+#endif
 
 #if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);
+
    blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
       bt.PointertoPSBindingTable = bind_offset;
    }
-#else
+#elif GEN_GEN >= 6
    blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
       bt.PSBindingTableChange = true;
       bt.PointertoPSBindingTable = bind_offset;
    }
+#else
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
+      bt.PointertoPSBindingTable = bind_offset;
+   }
 #endif
 }
 
 static void
-blorp_emit_sampler_state(struct blorp_batch *batch,
-                         const struct blorp_params *params)
+blorp_emit_depth_stencil_config(struct blorp_batch *batch,
+                                const struct blorp_params *params)
 {
-   struct GENX(SAMPLER_STATE) sampler = {
-      .MipModeFilter = MIPFILTER_NONE,
-      .MagModeFilter = MAPFILTER_LINEAR,
-      .MinModeFilter = MAPFILTER_LINEAR,
-      .MinLOD = 0,
-      .MaxLOD = 0,
-      .TCXAddressControlMode = TCM_CLAMP,
-      .TCYAddressControlMode = TCM_CLAMP,
-      .TCZAddressControlMode = TCM_CLAMP,
-      .MaximumAnisotropy = RATIO21,
-      .RAddressMinFilterRoundingEnable = true,
-      .RAddressMagFilterRoundingEnable = true,
-      .VAddressMinFilterRoundingEnable = true,
-      .VAddressMagFilterRoundingEnable = true,
-      .UAddressMinFilterRoundingEnable = true,
-      .UAddressMagFilterRoundingEnable = true,
-      .NonnormalizedCoordinateEnable = true,
-   };
+   const struct isl_device *isl_dev = batch->blorp->isl_dev;
 
-   uint32_t offset;
-   void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_SAMPLER_STATE,
-                                           GENX(SAMPLER_STATE_length) * 4,
-                                           32, &offset);
-   GENX(SAMPLER_STATE_pack)(NULL, state, &sampler);
+   uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
+   if (dw == NULL)
+      return;
 
-#if GEN_GEN >= 7
-   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
-      ssp.PointertoPSSamplerState = offset;
+   struct isl_depth_stencil_hiz_emit_info info = { };
+
+   if (params->depth.enabled) {
+      info.view = &params->depth.view;
+      info.mocs = params->depth.addr.mocs;
+   } else if (params->stencil.enabled) {
+      info.view = &params->stencil.view;
+      info.mocs = params->stencil.addr.mocs;
    }
-#else
-   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
-      ssp.VSSamplerStateChange = true;
-      ssp.GSSamplerStateChange = true;
-      ssp.PSSamplerStateChange = true;
-      ssp.PointertoPSSamplerState = offset;
+
+   if (params->depth.enabled) {
+      info.depth_surf = &params->depth.surf;
+
+      info.depth_address =
+         blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
+                          params->depth.addr, 0);
+
+      info.hiz_usage = params->depth.aux_usage;
+      if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
+         info.hiz_surf = &params->depth.aux_surf;
+
+         struct blorp_address hiz_address = params->depth.aux_addr;
+#if GEN_GEN == 6
+         /* Sandy bridge hardware does not technically support mipmapped HiZ.
+          * However, we have a special layout that allows us to make it work
+          * anyway by manually offsetting to the specified miplevel.
+          */
+         assert(info.hiz_surf->dim_layout == ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ);
+         uint32_t offset_B;
+         isl_surf_get_image_offset_B_tile_sa(info.hiz_surf,
+                                             info.view->base_level, 0, 0,
+                                             &offset_B, NULL, NULL);
+         hiz_address.offset += offset_B;
+#endif
+
+         info.hiz_address =
+            blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
+                             hiz_address, 0);
+
+         info.depth_clear_value = params->depth.clear_color.f32[0];
+      }
    }
+
+   if (params->stencil.enabled) {
+      info.stencil_surf = &params->stencil.surf;
+
+      struct blorp_address stencil_address = params->stencil.addr;
+#if GEN_GEN == 6
+      /* Sandy bridge hardware does not technically support mipmapped stencil.
+       * However, we have a special layout that allows us to make it work
+       * anyway by manually offsetting to the specified miplevel.
+       */
+      assert(info.stencil_surf->dim_layout == ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ);
+      uint32_t offset_B;
+      isl_surf_get_image_offset_B_tile_sa(info.stencil_surf,
+                                          info.view->base_level, 0, 0,
+                                          &offset_B, NULL, NULL);
+      stencil_address.offset += offset_B;
 #endif
+
+      info.stencil_address =
+         blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
+                          stencil_address, 0);
+   }
+
+   isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);
 }
 
+#if GEN_GEN >= 8
+/* Emits the Optimized HiZ sequence specified in the BDW+ PRMs. The
+ * depth/stencil buffer extents are ignored to handle APIs which perform
+ * clearing operations without such information.
+ */
 static void
-blorp_emit_3dstate_multisample(struct blorp_batch *batch,
-                               const struct blorp_params *params)
+blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
+                       const struct blorp_params *params)
 {
-   const unsigned samples = params->dst.surf.samples;
+   /* We should be performing an operation on a depth or stencil buffer.
+    */
+   assert(params->depth.enabled || params->stencil.enabled);
 
-   blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
-      ms.NumberofMultisamples       = __builtin_ffs(samples) - 1;
+   /* The stencil buffer should only be enabled if a fast clear operation is
+    * requested.
+    */
+   if (params->stencil.enabled)
+      assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);
 
-#if GEN_GEN >= 8
-      /* The PRM says that this bit is valid only for DX9:
-       *
-       *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
-       *    should not have any effect by setting or not setting this bit.
-       */
-      ms.PixelPositionOffsetEnable  = false;
-      ms.PixelLocation              = CENTER;
-#elif GEN_GEN >= 7
-      ms.PixelLocation              = PIXLOC_CENTER;
+   /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
+    *
+    * 3DSTATE_MULTISAMPLE packet must be used prior to this packet to change
+    * the Number of Multisamples. This packet must not be used to change
+    * Number of Multisamples in a rendering sequence.
+    *
+    * Since HIZ may be the first thing in a batch buffer, play safe and always
+    * emit 3DSTATE_MULTISAMPLE.
+    */
+   blorp_emit_3dstate_multisample(batch, params);
 
-      switch (samples) {
-      case 1:
-         GEN_SAMPLE_POS_1X(ms.Sample);
-         break;
-      case 2:
-         GEN_SAMPLE_POS_2X(ms.Sample);
-         break;
-      case 4:
-         GEN_SAMPLE_POS_4X(ms.Sample);
+   /* From the BDW PRM Volume 7, Depth Buffer Clear:
+    *
+    *    The clear value must be between the min and max depth values
+    *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
+    *    D32_FLOAT, then +/-DENORM values are also allowed.
+    *
+    * Set the bounds to match our hardware limits, [0.0, 1.0].
+    */
+   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
+      assert(params->depth.clear_color.f32[0] >= 0.0f);
+      assert(params->depth.clear_color.f32[0] <= 1.0f);
+      blorp_emit_cc_viewport(batch);
+   }
+
+   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
+    * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
+    * even when WM_HZ_OP is active.  However, WM thread dispatch is normally
+    * disabled for HiZ ops and it appears that force-enabling it can lead to
+    * GPU hangs on at least Skylake.  Since we don't know the current state of
+    * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
+    */
+   blorp_emit(batch, GENX(3DSTATE_WM), wm);
+
+   /* If we can't alter the depth stencil config and multiple layers are
+    * involved, the HiZ op will fail. This is because the op requires that a
+    * new config is emitted for each additional layer.
+    */
+   if (batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) {
+      assert(params->num_layers <= 1);
+   } else {
+      blorp_emit_depth_stencil_config(batch, params);
+   }
+
+   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
+      switch (params->hiz_op) {
+      case ISL_AUX_OP_FAST_CLEAR:
+         hzp.StencilBufferClearEnable = params->stencil.enabled;
+         hzp.DepthBufferClearEnable = params->depth.enabled;
+         hzp.StencilClearValue = params->stencil_ref;
+         hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
          break;
-      case 8:
-         GEN_SAMPLE_POS_8X(ms.Sample);
+      case ISL_AUX_OP_FULL_RESOLVE:
+         assert(params->full_surface_hiz_op);
+         hzp.DepthBufferResolveEnable = true;
          break;
-      default:
+      case ISL_AUX_OP_AMBIGUATE:
+         assert(params->full_surface_hiz_op);
+         hzp.HierarchicalDepthBufferResolveEnable = true;
          break;
+      case ISL_AUX_OP_PARTIAL_RESOLVE:
+      case ISL_AUX_OP_NONE:
+         unreachable("Invalid HIZ op");
       }
-#else
-      ms.PixelLocation              = PIXLOC_CENTER;
-      GEN_SAMPLE_POS_4X(ms.Sample);
-#endif
-   }
-}
 
-/* 3DSTATE_VIEWPORT_STATE_POINTERS */
-static void
-blorp_emit_viewport_state(struct blorp_batch *batch,
-                          const struct blorp_params *params)
-{
-   uint32_t cc_vp_offset;
+      hzp.NumberofMultisamples = ffs(params->num_samples) - 1;
+      hzp.SampleMask = 0xFFFF;
 
-   void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_CC_VP_STATE,
-                                           GENX(CC_VIEWPORT_length) * 4, 32,
-                                           &cc_vp_offset);
+      /* Due to a hardware issue, this bit MBZ */
+      assert(hzp.ScissorRectangleEnable == false);
 
-   GENX(CC_VIEWPORT_pack)(batch, state,
-      &(struct GENX(CC_VIEWPORT)) {
-         .MinimumDepth = 0.0,
-         .MaximumDepth = 1.0,
-      });
+      /* Contrary to the HW docs both fields are inclusive */
+      hzp.ClearRectangleXMin = params->x0;
+      hzp.ClearRectangleYMin = params->y0;
 
-#if GEN_GEN >= 7
-   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
-      vsp.CCViewportPointer = cc_vp_offset;
+      /* Contrary to the HW docs both fields are exclusive */
+      hzp.ClearRectangleXMax = params->x1;
+      hzp.ClearRectangleYMax = params->y1;
    }
-#else
-   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
-      vsp.CCViewportStateChange = true;
-      vsp.PointertoCC_VIEWPORT = cc_vp_offset;
+
+   /* PIPE_CONTROL w/ all bits clear except for “Post-Sync Operation” must set
+    * to “Write Immediate Data” enabled.
+    */
+   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
+      pc.PostSyncOperation = WriteImmediateData;
+      pc.Address = blorp_get_workaround_page(batch);
    }
-#endif
+
+   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);
 }
+#endif
 
+static void
+blorp_update_clear_color(struct blorp_batch *batch,
+                         const struct brw_blorp_surface_info *info,
+                         enum isl_aux_op op)
+{ /* Emits MI_STORE_DATA_IMM packets targeting info->clear_color_addr. */
+   if (info->clear_color_addr.buffer && op == ISL_AUX_OP_FAST_CLEAR) {
+#if GEN_GEN >= 9 /* one packet per 32-bit clear-color channel */
+      for (int i = 0; i < 4; i++) {
+         blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+            sdi.Address = info->clear_color_addr;
+            sdi.Address.offset += i * 4; /* slot for channel i */
+            sdi.ImmediateData = info->clear_color.u32[i];
+         }
+      }
+#elif GEN_GEN >= 7 /* single packet: selects + per-channel nonzero bits */
+      blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+         sdi.Address = info->clear_color_addr;
+         sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
+                             ISL_CHANNEL_SELECT_GREEN << 22 |
+                             ISL_CHANNEL_SELECT_BLUE  << 19 |
+                             ISL_CHANNEL_SELECT_ALPHA << 16;
+         if (isl_format_has_int_channel(info->view.format)) {
+            for (unsigned i = 0; i < 4; i++) { /* channels must be 0 or 1 */
+               assert(info->clear_color.u32[i] == 0 ||
+                      info->clear_color.u32[i] == 1);
+            }
+            sdi.ImmediateData |= (info->clear_color.u32[0] != 0) << 31;
+            sdi.ImmediateData |= (info->clear_color.u32[1] != 0) << 30;
+            sdi.ImmediateData |= (info->clear_color.u32[2] != 0) << 29;
+            sdi.ImmediateData |= (info->clear_color.u32[3] != 0) << 28;
+         } else {
+            for (unsigned i = 0; i < 4; i++) { /* channels must be 0.0 or 1.0 */
+               assert(info->clear_color.f32[i] == 0.0f ||
+                      info->clear_color.f32[i] == 1.0f);
+            }
+            sdi.ImmediateData |= (info->clear_color.f32[0] != 0.0f) << 31;
+            sdi.ImmediateData |= (info->clear_color.f32[1] != 0.0f) << 30;
+            sdi.ImmediateData |= (info->clear_color.f32[2] != 0.0f) << 29;
+            sdi.ImmediateData |= (info->clear_color.f32[3] != 0.0f) << 28;
+         }
+      }
+#endif /* nothing is emitted when GEN_GEN < 7 */
+   }
+}
 
 /**
  * \brief Execute a blit or render pass operation.
@@ -1105,121 +1766,37 @@ blorp_emit_viewport_state(struct blorp_batch *batch,
 static void
 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 {
-   uint32_t blend_state_offset = 0;
-   uint32_t color_calc_state_offset = 0;
-   uint32_t depth_stencil_state_offset;
-
-   blorp_emit_vertex_buffers(batch, params);
-   blorp_emit_vertex_elements(batch, params);
-
-   emit_urb_config(batch, params);
-
-   if (params->wm_prog_data) {
-      blend_state_offset = blorp_emit_blend_state(batch, params);
-      color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
-   }
-   depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
-
-#if GEN_GEN <= 6
-   /* 3DSTATE_CC_STATE_POINTERS
-    *
-    * The pointer offsets are relative to
-    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
-    *
-    * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
-    *
-    * The dynamic state emit helpers emit their own STATE_POINTERS packets on
-    * gen7+.  However, on gen6 and earlier, they're all lumpped together in
-    * one CC_STATE_POINTERS packet so we have to emit that here.
-    */
-   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
-      cc.BLEND_STATEChange = true;
-      cc.COLOR_CALC_STATEChange = true;
-      cc.DEPTH_STENCIL_STATEChange = true;
-      cc.PointertoBLEND_STATE = blend_state_offset;
-      cc.PointertoCOLOR_CALC_STATE = color_calc_state_offset;
-      cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
+   if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
+      blorp_update_clear_color(batch, &params->dst, params->fast_clear_op);
+      blorp_update_clear_color(batch, &params->depth, params->hiz_op);
    }
-#else
-   (void)blend_state_offset;
-   (void)color_calc_state_offset;
-   (void)depth_stencil_state_offset;
-#endif
 
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
-#if GEN_GEN >= 7
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), DS);
-#endif
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
-   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
-
-   if (params->wm_prog_data)
-      blorp_emit_surface_states(batch, params);
-
-   if (params->src.addr.buffer)
-      blorp_emit_sampler_state(batch, params);
-
-   blorp_emit_3dstate_multisample(batch, params);
-
-   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
-      mask.SampleMask = (1 << params->dst.surf.samples) - 1;
+#if GEN_GEN >= 8 /* HiZ ops go through 3DSTATE_WM_HZ_OP, not a draw */
+   if (params->hiz_op != ISL_AUX_OP_NONE) {
+      blorp_emit_gen8_hiz_op(batch, params);
+      return; /* HiZ ops never reach the 3DPRIMITIVE below */
    }
-
-   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
-    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
-    *
-    *   [DevSNB] A pipeline flush must be programmed prior to a
-    *   3DSTATE_VS command that causes the VS Function Enable to
-    *   toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
-    *   command with CS stall bit set and a post sync operation.
-    *
-    * We've already done one at the start of the BLORP operation.
-    */
-   blorp_emit(batch, GENX(3DSTATE_VS), vs);
-#if GEN_GEN >= 7
-   blorp_emit(batch, GENX(3DSTATE_HS), hs);
-   blorp_emit(batch, GENX(3DSTATE_TE), te);
-   blorp_emit(batch, GENX(3DSTATE_DS), DS);
-   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
 #endif
-   blorp_emit(batch, GENX(3DSTATE_GS), gs);
 
-   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
-      clip.PerspectiveDivideDisable = true;
-   }
+   blorp_emit_vertex_buffers(batch, params);
+   blorp_emit_vertex_elements(batch, params);
 
-   blorp_emit_sf_config(batch, params);
-   blorp_emit_ps_config(batch, params);
+   blorp_emit_pipeline(batch, params);
 
-   blorp_emit_viewport_state(batch, params);
+   blorp_emit_surface_states(batch, params);
 
-   if (params->depth.addr.buffer) {
+   if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
       blorp_emit_depth_stencil_config(batch, params);
-   } else {
-      blorp_emit(batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
-         db.SurfaceType = SURFTYPE_NULL;
-         db.SurfaceFormat = D32_FLOAT;
-      }
-      blorp_emit(batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hiz);
-      blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
-   }
-
-   /* 3DSTATE_CLEAR_PARAMS
-    *
-    * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE_CLEAR_PARAMS:
-    *   [DevSNB] 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE
-    *   packet when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
-    */
-   blorp_emit(batch, GENX(3DSTATE_CLEAR_PARAMS), clear) {
-      clear.DepthClearValueValid = true;
-      clear.DepthClearValue = params->depth.clear_color.u32[0];
-   }
 
    blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType = SEQUENTIAL;
       prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
+#if GEN_GEN >= 7
+      prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
+#endif
       prim.VertexCountPerInstance = 3;
       prim.InstanceCount = params->num_layers;
    }
 }
+
+#endif /* BLORP_GENX_EXEC_H */