intel/blorp: Use an actual chunk of vertex buffer for the VUE header
[mesa.git] / src / intel / blorp / blorp_genX_exec.h
index eb4a5b9f05ad11b414556bb9afb425692e7c3f8f..78fb3ffe4ca34d1ce99f39d8664363813bf1ad9d 100644 (file)
@@ -21,6 +21,9 @@
  * IN THE SOFTWARE.
  */
 
+#ifndef BLORP_GENX_EXEC_H
+#define BLORP_GENX_EXEC_H
+
 #include "blorp_priv.h"
 #include "common/gen_device_info.h"
 #include "common/gen_sample_positions.h"
@@ -171,9 +174,9 @@ blorp_emit_vertex_data(struct blorp_batch *batch,
                        uint32_t *size)
 {
    const float vertices[] = {
-      /* v0 */ (float)params->x0, (float)params->y1,
-      /* v1 */ (float)params->x1, (float)params->y1,
-      /* v2 */ (float)params->x0, (float)params->y0,
+      /* v0 */ (float)params->x1, (float)params->y1, params->z,
+      /* v1 */ (float)params->x0, (float)params->y1, params->z,
+      /* v2 */ (float)params->x0, (float)params->y0, params->z,
    };
 
    void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
@@ -190,26 +193,34 @@ blorp_emit_input_varying_data(struct blorp_batch *batch,
    const unsigned vec4_size_in_bytes = 4 * sizeof(float);
    const unsigned max_num_varyings =
       DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
-   const unsigned num_varyings = params->wm_prog_data->num_varying_inputs;
+   const unsigned num_varyings =
+      params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
 
-   *size = num_varyings * vec4_size_in_bytes;
+   *size = 16 + num_varyings * vec4_size_in_bytes;
 
-   const float *const inputs_src = (const float *)&params->wm_inputs;
-   float *inputs = blorp_alloc_vertex_buffer(batch, *size, addr);
+   const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
+   uint32_t *inputs = blorp_alloc_vertex_buffer(batch, *size, addr);
 
-   /* Walk over the attribute slots, determine if the attribute is used by
-    * the program and when necessary copy the values from the input storage to
-    * the vertex data buffer.
-    */
-   for (unsigned i = 0; i < max_num_varyings; i++) {
-      const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
+   /* Zero data for the VUE header */
+   memset(inputs, 0, 4 * sizeof(uint32_t));
+   inputs += 4;
+
+   if (params->wm_prog_data) {
+      /* Walk over the attribute slots, determine if the attribute is used by
+       * the program and when necessary copy the values from the input storage
+       * to the vertex data buffer.
+       */
+      for (unsigned i = 0; i < max_num_varyings; i++) {
+         const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
 
-      if (!(params->wm_prog_data->inputs_read & (1ull << attr)))
-         continue;
+         const int input_index = params->wm_prog_data->urb_setup[attr];
+         if (input_index < 0)
+            continue;
 
-      memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
+         memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
 
-      inputs += 4;
+         inputs += 4;
+      }
    }
 }
 
@@ -220,12 +231,10 @@ blorp_emit_vertex_buffers(struct blorp_batch *batch,
    struct GENX(VERTEX_BUFFER_STATE) vb[2];
    memset(vb, 0, sizeof(vb));
 
-   unsigned num_buffers = 1;
-
    uint32_t size;
    blorp_emit_vertex_data(batch, params, &vb[0].BufferStartingAddress, &size);
    vb[0].VertexBufferIndex = 0;
-   vb[0].BufferPitch = 2 * sizeof(float);
+   vb[0].BufferPitch = 3 * sizeof(float);
    vb[0].VertexBufferMOCS = batch->blorp->mocs.vb;
 #if GEN_GEN >= 7
    vb[0].AddressModifyEnable = true;
@@ -238,30 +247,26 @@ blorp_emit_vertex_buffers(struct blorp_batch *batch,
    vb[0].EndAddress.offset += size - 1;
 #endif
 
-   if (params->wm_prog_data && params->wm_prog_data->num_varying_inputs) {
-      blorp_emit_input_varying_data(batch, params,
-                                    &vb[1].BufferStartingAddress, &size);
-      vb[1].VertexBufferIndex = 1;
-      vb[1].BufferPitch = 0;
-      vb[1].VertexBufferMOCS = batch->blorp->mocs.vb;
+   blorp_emit_input_varying_data(batch, params,
+                                 &vb[1].BufferStartingAddress, &size);
+   vb[1].VertexBufferIndex = 1;
+   vb[1].BufferPitch = 0;
+   vb[1].VertexBufferMOCS = batch->blorp->mocs.vb;
 #if GEN_GEN >= 7
-      vb[1].AddressModifyEnable = true;
+   vb[1].AddressModifyEnable = true;
 #endif
 #if GEN_GEN >= 8
-      vb[1].BufferSize = size;
+   vb[1].BufferSize = size;
 #else
-      vb[1].BufferAccessType = INSTANCEDATA;
-      vb[1].EndAddress = vb[1].BufferStartingAddress;
-      vb[1].EndAddress.offset += size - 1;
+   vb[1].BufferAccessType = INSTANCEDATA;
+   vb[1].EndAddress = vb[1].BufferStartingAddress;
+   vb[1].EndAddress.offset += size - 1;
 #endif
-      num_buffers++;
-   }
 
-   const unsigned num_dwords =
-      1 + GENX(VERTEX_BUFFER_STATE_length) * num_buffers;
+   const unsigned num_dwords = 1 + GENX(VERTEX_BUFFER_STATE_length) * 2;
    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
 
-   for (unsigned i = 0; i < num_buffers; i++) {
+   for (unsigned i = 0; i < 2; i++) {
       GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
       dw += GENX(VERTEX_BUFFER_STATE_length);
    }
@@ -287,7 +292,7 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
     *   v2 ------ implied
     *    |        |
     *    |        |
-    *   v0 ----- v1
+    *   v1 ----- v0
     *
     * Since the VS is disabled, the clipper loads each VUE directly from
     * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
@@ -324,7 +329,7 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
     *
     * See the vertex element setup below.
     */
-   ve[0].VertexBufferIndex = 0;
+   ve[0].VertexBufferIndex = 1;
    ve[0].Valid = true;
    ve[0].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
    ve[0].SourceElementOffset = 0;
@@ -344,18 +349,18 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
 
    ve[1].VertexBufferIndex = 0;
    ve[1].Valid = true;
-   ve[1].SourceElementFormat = ISL_FORMAT_R32G32_FLOAT;
+   ve[1].SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT;
    ve[1].SourceElementOffset = 0;
    ve[1].Component0Control = VFCOMP_STORE_SRC;
    ve[1].Component1Control = VFCOMP_STORE_SRC;
-   ve[1].Component2Control = VFCOMP_STORE_0;
+   ve[1].Component2Control = VFCOMP_STORE_SRC;
    ve[1].Component3Control = VFCOMP_STORE_1_FP;
 
    for (unsigned i = 0; i < num_varyings; ++i) {
       ve[i + 2].VertexBufferIndex = 1;
       ve[i + 2].Valid = true;
       ve[i + 2].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
-      ve[i + 2].SourceElementOffset = i * 4 * sizeof(float);
+      ve[i + 2].SourceElementOffset = 16 + i * 4 * sizeof(float);
       ve[i + 2].Component0Control = VFCOMP_STORE_SRC;
       ve[i + 2].Component1Control = VFCOMP_STORE_SRC;
       ve[i + 2].Component2Control = VFCOMP_STORE_SRC;
@@ -398,7 +403,7 @@ static void
 blorp_emit_sf_config(struct blorp_batch *batch,
                      const struct blorp_params *params)
 {
-   const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
+   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
 
    /* 3DSTATE_SF
     *
@@ -429,11 +434,16 @@ blorp_emit_sf_config(struct blorp_batch *batch,
 
    blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
       sbe.VertexURBEntryReadOffset = 1;
-      sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
-      sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
+      if (prog_data) {
+         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
+         sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
+         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
+      } else {
+         sbe.NumberofSFOutputAttributes = 0;
+         sbe.VertexURBEntryReadLength = 1;
+      }
       sbe.ForceVertexURBEntryReadLength = true;
       sbe.ForceVertexURBEntryReadOffset = true;
-      sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
 
 #if GEN_GEN >= 9
       for (unsigned i = 0; i < 32; i++)
@@ -447,7 +457,7 @@ blorp_emit_sf_config(struct blorp_batch *batch,
       sf.FrontFaceFillMode = FILL_MODE_SOLID;
       sf.BackFaceFillMode = FILL_MODE_SOLID;
 
-      sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ?
+      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
          MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
 
 #if GEN_GEN == 7
@@ -473,7 +483,7 @@ blorp_emit_sf_config(struct blorp_batch *batch,
       sf.FrontFaceFillMode = FILL_MODE_SOLID;
       sf.BackFaceFillMode = FILL_MODE_SOLID;
 
-      sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ?
+      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
          MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
 
       sf.VertexURBEntryReadOffset = 1;
@@ -494,7 +504,7 @@ static void
 blorp_emit_ps_config(struct blorp_batch *batch,
                      const struct blorp_params *params)
 {
-   const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
+   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
 
    /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
     * nonzero to prevent the GPU from hanging.  While the documentation doesn't
@@ -510,24 +520,26 @@ blorp_emit_ps_config(struct blorp_batch *batch,
    blorp_emit(batch, GENX(3DSTATE_WM), wm);
 
    blorp_emit(batch, GENX(3DSTATE_PS), ps) {
-      if (params->src.addr.buffer) {
+      if (params->src.enabled) {
          ps.SamplerCount = 1; /* Up to 4 samplers */
          ps.BindingTableEntryCount = 2;
       } else {
          ps.BindingTableEntryCount = 1;
       }
 
-      ps.DispatchGRFStartRegisterForConstantSetupData0 =
-         prog_data->first_curbe_grf_0;
-      ps.DispatchGRFStartRegisterForConstantSetupData2 =
-         prog_data->first_curbe_grf_2;
+      if (prog_data) {
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            prog_data->base.dispatch_grf_start_reg;
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            prog_data->dispatch_grf_start_reg_2;
 
-      ps._8PixelDispatchEnable = prog_data->dispatch_8;
-      ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._8PixelDispatchEnable = prog_data->dispatch_8;
+         ps._16PixelDispatchEnable = prog_data->dispatch_16;
 
-      ps.KernelStartPointer0 = params->wm_prog_kernel;
-      ps.KernelStartPointer2 =
-         params->wm_prog_kernel + prog_data->ksp_offset_2;
+         ps.KernelStartPointer0 = params->wm_prog_kernel;
+         ps.KernelStartPointer2 =
+            params->wm_prog_kernel + prog_data->prog_offset_2;
+      }
 
       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
        * it implicitly scales for different GT levels (which have some # of
@@ -564,15 +576,14 @@ blorp_emit_ps_config(struct blorp_batch *batch,
    }
 
    blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
-      psx.PixelShaderValid = true;
+      if (prog_data) {
+         psx.PixelShaderValid = true;
+         psx.AttributeEnable = prog_data->num_varying_inputs > 0;
+         psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
+      }
 
-      if (params->src.addr.buffer)
+      if (params->src.enabled)
          psx.PixelShaderKillsPixel = true;
-
-      psx.AttributeEnable = prog_data->num_varying_inputs > 0;
-
-      if (prog_data && prog_data->persample_msaa_dispatch)
-         psx.PixelShaderIsPerSample = true;
    }
 
 #elif GEN_GEN >= 7
@@ -597,13 +608,13 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       if (prog_data)
          wm.ThreadDispatchEnable = true;
 
-      if (params->src.addr.buffer)
-         wm.PixelShaderKillPixel = true;
+      if (params->src.enabled)
+         wm.PixelShaderKillsPixel = true;
 
-      if (params->dst.surf.samples > 1) {
+      if (params->num_samples > 1) {
          wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
          wm.MultisampleDispatchMode =
-            (prog_data && prog_data->persample_msaa_dispatch) ?
+            (prog_data && prog_data->persample_dispatch) ?
             MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
       } else {
          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
@@ -620,14 +631,14 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 #endif
 
       if (prog_data) {
-         ps.DispatchGRFStartRegisterforConstantSetupData0 =
-            prog_data->first_curbe_grf_0;
-         ps.DispatchGRFStartRegisterforConstantSetupData2 =
-            prog_data->first_curbe_grf_2;
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            prog_data->base.dispatch_grf_start_reg;
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            prog_data->dispatch_grf_start_reg_2;
 
          ps.KernelStartPointer0 = params->wm_prog_kernel;
          ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->ksp_offset_2;
+            params->wm_prog_kernel + prog_data->prog_offset_2;
 
          ps._8PixelDispatchEnable = prog_data->dispatch_8;
          ps._16PixelDispatchEnable = prog_data->dispatch_16;
@@ -640,7 +651,7 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          ps._16PixelDispatchEnable = true;
       }
 
-      if (params->src.addr.buffer)
+      if (params->src.enabled)
          ps.SamplerCount = 1; /* Up to 4 samplers */
 
       switch (params->fast_clear_op) {
@@ -682,14 +693,14 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       if (prog_data) {
          wm.ThreadDispatchEnable = true;
 
-         wm.DispatchGRFStartRegisterforConstantSetupData0 =
-            prog_data->first_curbe_grf_0;
-         wm.DispatchGRFStartRegisterforConstantSetupData2 =
-            prog_data->first_curbe_grf_2;
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            prog_data->base.dispatch_grf_start_reg;
+         wm.DispatchGRFStartRegisterForConstantSetupData2 =
+            prog_data->dispatch_grf_start_reg_2;
 
          wm.KernelStartPointer0 = params->wm_prog_kernel;
          wm.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->ksp_offset_2;
+            params->wm_prog_kernel + prog_data->prog_offset_2;
 
          wm._8PixelDispatchEnable = prog_data->dispatch_8;
          wm._16PixelDispatchEnable = prog_data->dispatch_16;
@@ -697,15 +708,15 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
       }
 
-      if (params->src.addr.buffer) {
+      if (params->src.enabled) {
          wm.SamplerCount = 1; /* Up to 4 samplers */
-         wm.PixelShaderKillPixel = true; /* TODO: temporarily smash on */
+         wm.PixelShaderKillsPixel = true; /* TODO: temporarily smash on */
       }
 
-      if (params->dst.surf.samples > 1) {
+      if (params->num_samples > 1) {
          wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
          wm.MultisampleDispatchMode =
-            (prog_data && prog_data->persample_msaa_dispatch) ?
+            (prog_data && prog_data->persample_dispatch) ?
             MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
       } else {
          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
@@ -716,6 +727,24 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 #endif /* GEN_GEN */
 }
 
+static const uint32_t isl_to_gen_ds_surftype [] = {
+#if GEN_GEN >= 9
+   /* From the SKL PRM, "3DSTATE_DEPTH_STENCIL::SurfaceType":
+    *
+    *    "If depth/stencil is enabled with 1D render target, depth/stencil
+    *    surface type needs to be set to 2D surface type and height set to 1.
+    *    Depth will use (legacy) TileY and stencil will use TileW. For this
+    *    case only, the Surface Type of the depth buffer can be 2D while the
+    *    Surface Type of the render target(s) are 1D, representing an
+    *    exception to a programming note above."
+    */
+   [ISL_SURF_DIM_1D] = SURFTYPE_2D,
+#else
+   [ISL_SURF_DIM_1D] = SURFTYPE_1D,
+#endif
+   [ISL_SURF_DIM_2D] = SURFTYPE_2D,
+   [ISL_SURF_DIM_3D] = SURFTYPE_3D,
+};
 
 static void
 blorp_emit_depth_stencil_config(struct blorp_batch *batch,
@@ -728,54 +757,100 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch,
 #endif
 
    blorp_emit(batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
-      switch (params->depth.surf.dim) {
-      case ISL_SURF_DIM_1D:
-         db.SurfaceType = SURFTYPE_1D;
-         break;
-      case ISL_SURF_DIM_2D:
-         db.SurfaceType = SURFTYPE_2D;
-         break;
-      case ISL_SURF_DIM_3D:
-         db.SurfaceType = SURFTYPE_3D;
-         break;
-      }
-
-      db.SurfaceFormat = params->depth_format;
-
 #if GEN_GEN >= 7
-      db.DepthWriteEnable = true;
+      db.DepthWriteEnable = params->depth.enabled;
+      db.StencilWriteEnable = params->stencil.enabled;
 #endif
 
 #if GEN_GEN <= 6
-      db.TiledSurface = true;
-      db.TileWalk = TILEWALK_YMAJOR;
-      db.MIPMapLayoutMode = MIPLAYOUT_BELOW;
       db.SeparateStencilBufferEnable = true;
 #endif
 
-      db.HierarchicalDepthBufferEnable = true;
+      if (params->depth.enabled) {
+         db.SurfaceFormat = params->depth_format;
+         db.SurfaceType = isl_to_gen_ds_surftype[params->depth.surf.dim];
+
+#if GEN_GEN <= 6
+         db.TiledSurface = true;
+         db.TileWalk = TILEWALK_YMAJOR;
+         db.MIPMapLayoutMode = MIPLAYOUT_BELOW;
+#endif
+
+         db.HierarchicalDepthBufferEnable =
+            params->depth.aux_usage == ISL_AUX_USAGE_HIZ;
 
-      db.Width = params->depth.surf.logical_level0_px.width - 1;
-      db.Height = params->depth.surf.logical_level0_px.height - 1;
-      db.RenderTargetViewExtent = db.Depth =
-         MAX2(params->depth.surf.logical_level0_px.depth,
-              params->depth.surf.logical_level0_px.array_len) - 1;
+         db.Width = params->depth.surf.logical_level0_px.width - 1;
+         db.Height = params->depth.surf.logical_level0_px.height - 1;
+         db.RenderTargetViewExtent = db.Depth =
+            params->depth.view.array_len - 1;
 
-      db.LOD = params->depth.view.base_level;
-      db.MinimumArrayElement = params->depth.view.base_array_layer;
+         db.LOD = params->depth.view.base_level;
+         db.MinimumArrayElement = params->depth.view.base_array_layer;
 
-      db.SurfacePitch = params->depth.surf.row_pitch - 1;
-      db.SurfaceBaseAddress = params->depth.addr;
-      db.DepthBufferMOCS = mocs;
+         db.SurfacePitch = params->depth.surf.row_pitch - 1;
+#if GEN_GEN >= 8
+         db.SurfaceQPitch =
+            isl_surf_get_array_pitch_el_rows(&params->depth.surf) >> 2;
+#endif
+
+         db.SurfaceBaseAddress = params->depth.addr;
+         db.DepthBufferMOCS = mocs;
+      } else if (params->stencil.enabled) {
+         db.SurfaceFormat = D32_FLOAT;
+         db.SurfaceType = isl_to_gen_ds_surftype[params->stencil.surf.dim];
+
+         db.Width = params->stencil.surf.logical_level0_px.width - 1;
+         db.Height = params->stencil.surf.logical_level0_px.height - 1;
+         db.RenderTargetViewExtent = db.Depth =
+            params->stencil.view.array_len - 1;
+
+         db.LOD = params->stencil.view.base_level;
+         db.MinimumArrayElement = params->stencil.view.base_array_layer;
+      } else {
+         db.SurfaceType = SURFTYPE_NULL;
+         db.SurfaceFormat = D32_FLOAT;
+      }
    }
 
    blorp_emit(batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hiz) {
-      hiz.SurfacePitch = params->depth.aux_surf.row_pitch - 1;
-      hiz.SurfaceBaseAddress = params->depth.aux_addr;
-      hiz.HierarchicalDepthBufferMOCS = mocs;
+      if (params->depth.aux_usage == ISL_AUX_USAGE_HIZ) {
+         hiz.SurfacePitch = params->depth.aux_surf.row_pitch - 1;
+         hiz.SurfaceBaseAddress = params->depth.aux_addr;
+         hiz.HierarchicalDepthBufferMOCS = mocs;
+#if GEN_GEN >= 8
+         hiz.SurfaceQPitch =
+            isl_surf_get_array_pitch_sa_rows(&params->depth.aux_surf) >> 2;
+#endif
+      }
+   }
+
+   blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
+      if (params->stencil.enabled) {
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+         sb.StencilBufferEnable = true;
+#endif
+
+         sb.SurfacePitch = params->stencil.surf.row_pitch - 1;
+#if GEN_GEN >= 8
+         sb.SurfaceQPitch =
+            isl_surf_get_array_pitch_el_rows(&params->stencil.surf) >> 2;
+#endif
+
+         sb.SurfaceBaseAddress = params->stencil.addr;
+         sb.StencilBufferMOCS = batch->blorp->mocs.tex;
+      }
    }
 
-   blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
+   /* 3DSTATE_CLEAR_PARAMS
+    *
+    * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE_CLEAR_PARAMS:
+    *   [DevSNB] 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE
+    *   packet when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
+    */
+   blorp_emit(batch, GENX(3DSTATE_CLEAR_PARAMS), clear) {
+      clear.DepthClearValueValid = true;
+      clear.DepthClearValue = params->depth.clear_color.u32[0];
+   }
 }
 
 static uint32_t
@@ -824,11 +899,17 @@ static uint32_t
 blorp_emit_color_calc_state(struct blorp_batch *batch,
                             const struct blorp_params *params)
 {
+   struct GENX(COLOR_CALC_STATE) cc = { 0 };
+
+#if GEN_GEN <= 8
+   cc.StencilReferenceValue = params->stencil_ref;
+#endif
+
    uint32_t offset;
    void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_CC_STATE,
                                            GENX(COLOR_CALC_STATE_length) * 4,
                                            64, &offset);
-   memset(state, 0, GENX(COLOR_CALC_STATE_length) * 4);
+   GENX(COLOR_CALC_STATE_pack)(NULL, state, &cc);
 
 #if GEN_GEN >= 7
    blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
@@ -847,66 +928,82 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch,
                                const struct blorp_params *params)
 {
 #if GEN_GEN >= 8
+   struct GENX(3DSTATE_WM_DEPTH_STENCIL) ds = {
+      GENX(3DSTATE_WM_DEPTH_STENCIL_header),
+   };
+#else
+   struct GENX(DEPTH_STENCIL_STATE) ds = { 0 };
+#endif
 
-   /* On gen8+, DEPTH_STENCIL state is simply an instruction */
-   blorp_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds);
-   return 0;
+   if (params->depth.enabled) {
+      ds.DepthBufferWriteEnable = true;
 
-#else /* GEN_GEN <= 7 */
+      switch (params->hiz_op) {
+      case BLORP_HIZ_OP_NONE:
+         ds.DepthTestEnable = true;
+         ds.DepthTestFunction = COMPAREFUNCTION_ALWAYS;
+         break;
 
-   /* See the following sections of the Sandy Bridge PRM, Volume 1, Part2:
-    *   - 7.5.3.1 Depth Buffer Clear
-    *   - 7.5.3.2 Depth Buffer Resolve
-    *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
-    */
-   struct GENX(DEPTH_STENCIL_STATE) ds = {
-      .DepthBufferWriteEnable = true,
-   };
+      /* See the following sections of the Sandy Bridge PRM, Volume 2, Part 1:
+       *   - 7.5.3.1 Depth Buffer Clear
+       *   - 7.5.3.2 Depth Buffer Resolve
+       *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
+       */
+      case BLORP_HIZ_OP_DEPTH_RESOLVE:
+         ds.DepthTestEnable = true;
+         ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
+         break;
 
-   if (params->hiz_op == BLORP_HIZ_OP_DEPTH_RESOLVE) {
-      ds.DepthTestEnable = true;
-      ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
+      case BLORP_HIZ_OP_DEPTH_CLEAR:
+      case BLORP_HIZ_OP_HIZ_RESOLVE:
+         ds.DepthTestEnable = false;
+         break;
+      }
    }
 
+   if (params->stencil.enabled) {
+      ds.StencilBufferWriteEnable = true;
+      ds.StencilTestEnable = true;
+      ds.DoubleSidedStencilEnable = false;
+
+      ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
+      ds.StencilPassDepthPassOp = STENCILOP_REPLACE;
+
+      ds.StencilWriteMask = params->stencil_mask;
+#if GEN_GEN >= 9
+      ds.StencilReferenceValue = params->stencil_ref;
+#endif
+   }
+
+#if GEN_GEN >= 8
+   uint32_t offset = 0;
+   uint32_t *dw = blorp_emit_dwords(batch,
+                                    GENX(3DSTATE_WM_DEPTH_STENCIL_length));
+   GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dw, &ds);
+#else
    uint32_t offset;
    void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_DEPTH_STENCIL_STATE,
                                            GENX(DEPTH_STENCIL_STATE_length) * 4,
                                            64, &offset);
    GENX(DEPTH_STENCIL_STATE_pack)(NULL, state, &ds);
+#endif
 
-#if GEN_GEN >= 7
+#if GEN_GEN == 7
    blorp_emit(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), sp) {
       sp.PointertoDEPTH_STENCIL_STATE = offset;
    }
 #endif
 
    return offset;
-
-#endif /* GEN_GEN */
 }
 
-struct surface_state_info {
-   unsigned num_dwords;
-   unsigned ss_align; /* Required alignment of RENDER_SURFACE_STATE in bytes */
-   unsigned reloc_dw;
-   unsigned aux_reloc_dw;
-};
-
-static const struct surface_state_info surface_state_infos[] = {
-   [6] = {6,  32, 1,  0},
-   [7] = {8,  32, 1,  6},
-   [8] = {13, 64, 8,  10},
-   [9] = {16, 64, 8,  10},
-};
-
 static void
 blorp_emit_surface_state(struct blorp_batch *batch,
                          const struct brw_blorp_surface_info *surface,
-                         uint32_t *state, uint32_t state_offset,
+                         void *state, uint32_t state_offset,
                          bool is_render_target)
 {
-   const struct surface_state_info ss_info = surface_state_infos[GEN_GEN];
-
+   const struct isl_device *isl_dev = batch->blorp->isl_dev;
    struct isl_surf surf = surface->surf;
 
    if (surf.dim == ISL_SURF_DIM_1D &&
@@ -928,7 +1025,7 @@ blorp_emit_surface_state(struct blorp_batch *batch,
                        .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
                        .mocs = mocs, .clear_color = surface->clear_color);
 
-   blorp_surface_reloc(batch, state_offset + ss_info.reloc_dw * 4,
+   blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                        surface->addr, 0);
 
    if (aux_usage != ISL_AUX_USAGE_NONE) {
@@ -937,35 +1034,80 @@ blorp_emit_surface_state(struct blorp_batch *batch,
        * surface buffer addresses are always 4K page alinged.
        */
       assert((surface->aux_addr.offset & 0xfff) == 0);
-      blorp_surface_reloc(batch, state_offset + ss_info.aux_reloc_dw * 4,
-                          surface->aux_addr, state[ss_info.aux_reloc_dw]);
+      uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
+      blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
+                          surface->aux_addr, *aux_addr);
    }
 }
 
+static void
+blorp_emit_null_surface_state(struct blorp_batch *batch, /* not used here */
+                              const struct brw_blorp_surface_info *surface,
+                              uint32_t *state)
+{
+   struct GENX(RENDER_SURFACE_STATE) ss = {
+      .SurfaceType = SURFTYPE_NULL, /* no surface address is programmed */
+      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
+      .Width = surface->surf.logical_level0_px.width - 1,
+      .Height = surface->surf.logical_level0_px.height - 1,
+      .MIPCountLOD = surface->view.base_level,
+      .MinimumArrayElement = surface->view.base_array_layer,
+      .Depth = surface->view.array_len - 1,
+      .RenderTargetViewExtent = surface->view.array_len - 1,
+      .NumberofMultisamples = ffs(surface->surf.samples) - 1,
+      /* The array and tiling fields below vary by hardware generation. */
+#if GEN_GEN >= 7
+      .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,
+#endif
+
+#if GEN_GEN >= 8
+      .TileMode = YMAJOR,
+#else
+      .TiledSurface = true,
+#endif
+   };
+   /* Pack the null surface description into the caller-provided state. */
+   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);
+}
+
 static void
 blorp_emit_surface_states(struct blorp_batch *batch,
                           const struct blorp_params *params)
 {
+   const struct isl_device *isl_dev = batch->blorp->isl_dev;
    uint32_t bind_offset, surface_offsets[2];
    void *surface_maps[2];
 
-   const unsigned ss_size = GENX(RENDER_SURFACE_STATE_length) * 4;
-   const unsigned ss_align = GENX(RENDER_SURFACE_STATE_length) > 8 ? 64 : 32;
-
-   unsigned num_surfaces = 1 + (params->src.addr.buffer != NULL);
-   blorp_alloc_binding_table(batch, num_surfaces, ss_size, ss_align,
+   unsigned num_surfaces = 1 + params->src.enabled;
+   blorp_alloc_binding_table(batch, num_surfaces,
+                             isl_dev->ss.size, isl_dev->ss.align,
                              &bind_offset, surface_offsets, surface_maps);
 
-   blorp_emit_surface_state(batch, &params->dst,
-                            surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
-                            surface_offsets[BLORP_RENDERBUFFER_BT_INDEX], true);
-   if (params->src.addr.buffer) {
+   if (params->dst.enabled) {
+      blorp_emit_surface_state(batch, &params->dst,
+                               surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
+                               surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
+                               true);
+   } else {
+      assert(params->depth.enabled || params->stencil.enabled);
+      const struct brw_blorp_surface_info *surface =
+         params->depth.enabled ? &params->depth : &params->stencil;
+      blorp_emit_null_surface_state(batch, surface,
+                                    surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
+   }
+
+   if (params->src.enabled) {
       blorp_emit_surface_state(batch, &params->src,
                                surface_maps[BLORP_TEXTURE_BT_INDEX],
                                surface_offsets[BLORP_TEXTURE_BT_INDEX], false);
    }
 
 #if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);
+
    blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
       bt.PointertoPSBindingTable = bind_offset;
    }
@@ -1024,10 +1166,8 @@ static void
 blorp_emit_3dstate_multisample(struct blorp_batch *batch,
                                const struct blorp_params *params)
 {
-   const unsigned samples = params->dst.surf.samples;
-
    blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
-      ms.NumberofMultisamples       = __builtin_ffs(samples) - 1;
+      ms.NumberofMultisamples       = __builtin_ffs(params->num_samples) - 1;
 
 #if GEN_GEN >= 8
       /* The PRM says that this bit is valid only for DX9:
@@ -1040,7 +1180,7 @@ blorp_emit_3dstate_multisample(struct blorp_batch *batch,
 #elif GEN_GEN >= 7
       ms.PixelLocation              = PIXLOC_CENTER;
 
-      switch (samples) {
+      switch (params->num_samples) {
       case 1:
          GEN_SAMPLE_POS_1X(ms.Sample);
          break;
@@ -1116,8 +1256,8 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 
    if (params->wm_prog_data) {
       blend_state_offset = blorp_emit_blend_state(batch, params);
-      color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
    }
+   color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
    depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
 
 #if GEN_GEN <= 6
@@ -1154,16 +1294,15 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
    blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
    blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
 
-   if (params->wm_prog_data)
-      blorp_emit_surface_states(batch, params);
+   blorp_emit_surface_states(batch, params);
 
-   if (params->src.addr.buffer)
+   if (params->src.enabled)
       blorp_emit_sampler_state(batch, params);
 
    blorp_emit_3dstate_multisample(batch, params);
 
    blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
-      mask.SampleMask = (1 << params->dst.surf.samples) - 1;
+      mask.SampleMask = (1 << params->num_samples) - 1;
    }
 
    /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
@@ -1194,27 +1333,8 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 
    blorp_emit_viewport_state(batch, params);
 
-   if (params->depth.addr.buffer) {
+   if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
       blorp_emit_depth_stencil_config(batch, params);
-   } else {
-      blorp_emit(batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
-         db.SurfaceType = SURFTYPE_NULL;
-         db.SurfaceFormat = D32_FLOAT;
-      }
-      blorp_emit(batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hiz);
-      blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
-   }
-
-   /* 3DSTATE_CLEAR_PARAMS
-    *
-    * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE_CLEAR_PARAMS:
-    *   [DevSNB] 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE
-    *   packet when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
-    */
-   blorp_emit(batch, GENX(3DSTATE_CLEAR_PARAMS), clear) {
-      clear.DepthClearValueValid = true;
-      clear.DepthClearValue = params->depth.clear_color.u32[0];
-   }
 
    blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType = SEQUENTIAL;
@@ -1223,3 +1343,5 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
       prim.InstanceCount = params->num_layers;
    }
 }
+
+#endif /* BLORP_GENX_EXEC_H */