intel/blorp: Use an actual chunk of vertex buffer for the VUE header
[mesa.git] / src / intel / blorp / blorp_genX_exec.h
index 69424c4437097e0b9dbca4ad0a71438731870a00..78fb3ffe4ca34d1ce99f39d8664363813bf1ad9d 100644 (file)
@@ -193,26 +193,34 @@ blorp_emit_input_varying_data(struct blorp_batch *batch,
    const unsigned vec4_size_in_bytes = 4 * sizeof(float);
    const unsigned max_num_varyings =
       DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
-   const unsigned num_varyings = params->wm_prog_data->num_varying_inputs;
+   const unsigned num_varyings =
+      params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
 
-   *size = num_varyings * vec4_size_in_bytes;
+   *size = 16 + num_varyings * vec4_size_in_bytes;
 
-   const float *const inputs_src = (const float *)&params->wm_inputs;
-   float *inputs = blorp_alloc_vertex_buffer(batch, *size, addr);
+   const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
+   uint32_t *inputs = blorp_alloc_vertex_buffer(batch, *size, addr);
 
-   /* Walk over the attribute slots, determine if the attribute is used by
-    * the program and when necessary copy the values from the input storage to
-    * the vertex data buffer.
-    */
-   for (unsigned i = 0; i < max_num_varyings; i++) {
-      const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
+   /* Zero data for the VUE header */
+   memset(inputs, 0, 4 * sizeof(uint32_t));
+   inputs += 4;
+
+   if (params->wm_prog_data) {
+      /* Walk over the attribute slots, determine if the attribute is used by
+       * the program and when necessary copy the values from the input storage
+       * to the vertex data buffer.
+       */
+      for (unsigned i = 0; i < max_num_varyings; i++) {
+         const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
 
-      if (!(params->wm_prog_data->inputs_read & (1ull << attr)))
-         continue;
+         const int input_index = params->wm_prog_data->urb_setup[attr];
+         if (input_index < 0)
+            continue;
 
-      memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
+         memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
 
-      inputs += 4;
+         inputs += 4;
+      }
    }
 }
 
@@ -223,8 +231,6 @@ blorp_emit_vertex_buffers(struct blorp_batch *batch,
    struct GENX(VERTEX_BUFFER_STATE) vb[2];
    memset(vb, 0, sizeof(vb));
 
-   unsigned num_buffers = 1;
-
    uint32_t size;
    blorp_emit_vertex_data(batch, params, &vb[0].BufferStartingAddress, &size);
    vb[0].VertexBufferIndex = 0;
@@ -241,30 +247,26 @@ blorp_emit_vertex_buffers(struct blorp_batch *batch,
    vb[0].EndAddress.offset += size - 1;
 #endif
 
-   if (params->wm_prog_data && params->wm_prog_data->num_varying_inputs) {
-      blorp_emit_input_varying_data(batch, params,
-                                    &vb[1].BufferStartingAddress, &size);
-      vb[1].VertexBufferIndex = 1;
-      vb[1].BufferPitch = 0;
-      vb[1].VertexBufferMOCS = batch->blorp->mocs.vb;
+   blorp_emit_input_varying_data(batch, params,
+                                 &vb[1].BufferStartingAddress, &size);
+   vb[1].VertexBufferIndex = 1;
+   vb[1].BufferPitch = 0;
+   vb[1].VertexBufferMOCS = batch->blorp->mocs.vb;
 #if GEN_GEN >= 7
-      vb[1].AddressModifyEnable = true;
+   vb[1].AddressModifyEnable = true;
 #endif
 #if GEN_GEN >= 8
-      vb[1].BufferSize = size;
+   vb[1].BufferSize = size;
 #else
-      vb[1].BufferAccessType = INSTANCEDATA;
-      vb[1].EndAddress = vb[1].BufferStartingAddress;
-      vb[1].EndAddress.offset += size - 1;
+   vb[1].BufferAccessType = INSTANCEDATA;
+   vb[1].EndAddress = vb[1].BufferStartingAddress;
+   vb[1].EndAddress.offset += size - 1;
 #endif
-      num_buffers++;
-   }
 
-   const unsigned num_dwords =
-      1 + GENX(VERTEX_BUFFER_STATE_length) * num_buffers;
+   const unsigned num_dwords = 1 + GENX(VERTEX_BUFFER_STATE_length) * 2;
    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
 
-   for (unsigned i = 0; i < num_buffers; i++) {
+   for (unsigned i = 0; i < 2; i++) {
       GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
       dw += GENX(VERTEX_BUFFER_STATE_length);
    }
@@ -327,7 +329,7 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
     *
     * See the vertex element setup below.
     */
-   ve[0].VertexBufferIndex = 0;
+   ve[0].VertexBufferIndex = 1;
    ve[0].Valid = true;
    ve[0].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
    ve[0].SourceElementOffset = 0;
@@ -358,7 +360,7 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
       ve[i + 2].VertexBufferIndex = 1;
       ve[i + 2].Valid = true;
       ve[i + 2].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
-      ve[i + 2].SourceElementOffset = i * 4 * sizeof(float);
+      ve[i + 2].SourceElementOffset = 16 + i * 4 * sizeof(float);
       ve[i + 2].Component0Control = VFCOMP_STORE_SRC;
       ve[i + 2].Component1Control = VFCOMP_STORE_SRC;
       ve[i + 2].Component2Control = VFCOMP_STORE_SRC;
@@ -401,7 +403,7 @@ static void
 blorp_emit_sf_config(struct blorp_batch *batch,
                      const struct blorp_params *params)
 {
-   const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
+   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
 
    /* 3DSTATE_SF
     *
@@ -432,11 +434,16 @@ blorp_emit_sf_config(struct blorp_batch *batch,
 
    blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
       sbe.VertexURBEntryReadOffset = 1;
-      sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
-      sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
+      if (prog_data) {
+         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
+         sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
+         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
+      } else {
+         sbe.NumberofSFOutputAttributes = 0;
+         sbe.VertexURBEntryReadLength = 1;
+      }
       sbe.ForceVertexURBEntryReadLength = true;
       sbe.ForceVertexURBEntryReadOffset = true;
-      sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
 
 #if GEN_GEN >= 9
       for (unsigned i = 0; i < 32; i++)
@@ -450,7 +457,7 @@ blorp_emit_sf_config(struct blorp_batch *batch,
       sf.FrontFaceFillMode = FILL_MODE_SOLID;
       sf.BackFaceFillMode = FILL_MODE_SOLID;
 
-      sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ?
+      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
          MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
 
 #if GEN_GEN == 7
@@ -476,7 +483,7 @@ blorp_emit_sf_config(struct blorp_batch *batch,
       sf.FrontFaceFillMode = FILL_MODE_SOLID;
       sf.BackFaceFillMode = FILL_MODE_SOLID;
 
-      sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ?
+      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
          MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
 
       sf.VertexURBEntryReadOffset = 1;
@@ -497,7 +504,7 @@ static void
 blorp_emit_ps_config(struct blorp_batch *batch,
                      const struct blorp_params *params)
 {
-   const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
+   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
 
    /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
     * nonzero to prevent the GPU from hanging.  While the documentation doesn't
@@ -513,24 +520,26 @@ blorp_emit_ps_config(struct blorp_batch *batch,
    blorp_emit(batch, GENX(3DSTATE_WM), wm);
 
    blorp_emit(batch, GENX(3DSTATE_PS), ps) {
-      if (params->src.addr.buffer) {
+      if (params->src.enabled) {
          ps.SamplerCount = 1; /* Up to 4 samplers */
          ps.BindingTableEntryCount = 2;
       } else {
          ps.BindingTableEntryCount = 1;
       }
 
-      ps.DispatchGRFStartRegisterForConstantSetupData0 =
-         prog_data->first_curbe_grf_0;
-      ps.DispatchGRFStartRegisterForConstantSetupData2 =
-         prog_data->first_curbe_grf_2;
+      if (prog_data) {
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            prog_data->base.dispatch_grf_start_reg;
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            prog_data->dispatch_grf_start_reg_2;
 
-      ps._8PixelDispatchEnable = prog_data->dispatch_8;
-      ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._8PixelDispatchEnable = prog_data->dispatch_8;
+         ps._16PixelDispatchEnable = prog_data->dispatch_16;
 
-      ps.KernelStartPointer0 = params->wm_prog_kernel;
-      ps.KernelStartPointer2 =
-         params->wm_prog_kernel + prog_data->ksp_offset_2;
+         ps.KernelStartPointer0 = params->wm_prog_kernel;
+         ps.KernelStartPointer2 =
+            params->wm_prog_kernel + prog_data->prog_offset_2;
+      }
 
       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
        * it implicitly scales for different GT levels (which have some # of
@@ -567,15 +576,14 @@ blorp_emit_ps_config(struct blorp_batch *batch,
    }
 
    blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
-      psx.PixelShaderValid = true;
+      if (prog_data) {
+         psx.PixelShaderValid = true;
+         psx.AttributeEnable = prog_data->num_varying_inputs > 0;
+         psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
+      }
 
-      if (params->src.addr.buffer)
+      if (params->src.enabled)
          psx.PixelShaderKillsPixel = true;
-
-      psx.AttributeEnable = prog_data->num_varying_inputs > 0;
-
-      if (prog_data && prog_data->persample_msaa_dispatch)
-         psx.PixelShaderIsPerSample = true;
    }
 
 #elif GEN_GEN >= 7
@@ -600,13 +608,13 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       if (prog_data)
          wm.ThreadDispatchEnable = true;
 
-      if (params->src.addr.buffer)
-         wm.PixelShaderKillPixel = true;
+      if (params->src.enabled)
+         wm.PixelShaderKillsPixel = true;
 
-      if (params->dst.surf.samples > 1) {
+      if (params->num_samples > 1) {
          wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
          wm.MultisampleDispatchMode =
-            (prog_data && prog_data->persample_msaa_dispatch) ?
+            (prog_data && prog_data->persample_dispatch) ?
             MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
       } else {
          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
@@ -623,14 +631,14 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 #endif
 
       if (prog_data) {
-         ps.DispatchGRFStartRegisterforConstantSetupData0 =
-            prog_data->first_curbe_grf_0;
-         ps.DispatchGRFStartRegisterforConstantSetupData2 =
-            prog_data->first_curbe_grf_2;
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            prog_data->base.dispatch_grf_start_reg;
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            prog_data->dispatch_grf_start_reg_2;
 
          ps.KernelStartPointer0 = params->wm_prog_kernel;
          ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->ksp_offset_2;
+            params->wm_prog_kernel + prog_data->prog_offset_2;
 
          ps._8PixelDispatchEnable = prog_data->dispatch_8;
          ps._16PixelDispatchEnable = prog_data->dispatch_16;
@@ -643,7 +651,7 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          ps._16PixelDispatchEnable = true;
       }
 
-      if (params->src.addr.buffer)
+      if (params->src.enabled)
          ps.SamplerCount = 1; /* Up to 4 samplers */
 
       switch (params->fast_clear_op) {
@@ -685,14 +693,14 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       if (prog_data) {
          wm.ThreadDispatchEnable = true;
 
-         wm.DispatchGRFStartRegisterforConstantSetupData0 =
-            prog_data->first_curbe_grf_0;
-         wm.DispatchGRFStartRegisterforConstantSetupData2 =
-            prog_data->first_curbe_grf_2;
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            prog_data->base.dispatch_grf_start_reg;
+         wm.DispatchGRFStartRegisterForConstantSetupData2 =
+            prog_data->dispatch_grf_start_reg_2;
 
          wm.KernelStartPointer0 = params->wm_prog_kernel;
          wm.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->ksp_offset_2;
+            params->wm_prog_kernel + prog_data->prog_offset_2;
 
          wm._8PixelDispatchEnable = prog_data->dispatch_8;
          wm._16PixelDispatchEnable = prog_data->dispatch_16;
@@ -700,15 +708,15 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
       }
 
-      if (params->src.addr.buffer) {
+      if (params->src.enabled) {
          wm.SamplerCount = 1; /* Up to 4 samplers */
-         wm.PixelShaderKillPixel = true; /* TODO: temporarily smash on */
+         wm.PixelShaderKillsPixel = true; /* TODO: temporarily smash on */
       }
 
-      if (params->dst.surf.samples > 1) {
+      if (params->num_samples > 1) {
          wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
          wm.MultisampleDispatchMode =
-            (prog_data && prog_data->persample_msaa_dispatch) ?
+            (prog_data && prog_data->persample_dispatch) ?
             MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
       } else {
          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
@@ -750,15 +758,15 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch,
 
    blorp_emit(batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
 #if GEN_GEN >= 7
-      db.DepthWriteEnable = params->depth.addr.buffer != NULL;
-      db.StencilWriteEnable = params->stencil.addr.buffer != NULL;
+      db.DepthWriteEnable = params->depth.enabled;
+      db.StencilWriteEnable = params->stencil.enabled;
 #endif
 
 #if GEN_GEN <= 6
       db.SeparateStencilBufferEnable = true;
 #endif
 
-      if (params->depth.addr.buffer) {
+      if (params->depth.enabled) {
          db.SurfaceFormat = params->depth_format;
          db.SurfaceType = isl_to_gen_ds_surftype[params->depth.surf.dim];
 
@@ -787,7 +795,7 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch,
 
          db.SurfaceBaseAddress = params->depth.addr;
          db.DepthBufferMOCS = mocs;
-      } else if (params->stencil.addr.buffer) {
+      } else if (params->stencil.enabled) {
          db.SurfaceFormat = D32_FLOAT;
          db.SurfaceType = isl_to_gen_ds_surftype[params->stencil.surf.dim];
 
@@ -817,7 +825,7 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch,
    }
 
    blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
-      if (params->stencil.addr.buffer) {
+      if (params->stencil.enabled) {
 #if GEN_GEN >= 8 || GEN_IS_HASWELL
          sb.StencilBufferEnable = true;
 #endif
@@ -891,11 +899,17 @@ static uint32_t
 blorp_emit_color_calc_state(struct blorp_batch *batch,
                             const struct blorp_params *params)
 {
+   struct GENX(COLOR_CALC_STATE) cc = { 0 };
+
+#if GEN_GEN <= 8
+   cc.StencilReferenceValue = params->stencil_ref;
+#endif
+
    uint32_t offset;
    void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_CC_STATE,
                                            GENX(COLOR_CALC_STATE_length) * 4,
                                            64, &offset);
-   memset(state, 0, GENX(COLOR_CALC_STATE_length) * 4);
+   GENX(COLOR_CALC_STATE_pack)(NULL, state, &cc);
 
 #if GEN_GEN >= 7
    blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
@@ -921,16 +935,44 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch,
    struct GENX(DEPTH_STENCIL_STATE) ds = { 0 };
 #endif
 
-   ds.DepthBufferWriteEnable = params->depth.addr.buffer != NULL;
+   if (params->depth.enabled) {
+      ds.DepthBufferWriteEnable = true;
 
-   /* See the following sections of the Sandy Bridge PRM, Volume 1, Part2:
-    *   - 7.5.3.1 Depth Buffer Clear
-    *   - 7.5.3.2 Depth Buffer Resolve
-    *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
-    */
-   if (params->hiz_op == BLORP_HIZ_OP_DEPTH_RESOLVE) {
-      ds.DepthTestEnable = true;
-      ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
+      switch (params->hiz_op) {
+      case BLORP_HIZ_OP_NONE:
+         ds.DepthTestEnable = true;
+         ds.DepthTestFunction = COMPAREFUNCTION_ALWAYS;
+         break;
+
+      /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
+       *   - 7.5.3.1 Depth Buffer Clear
+       *   - 7.5.3.2 Depth Buffer Resolve
+       *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
+       */
+      case BLORP_HIZ_OP_DEPTH_RESOLVE:
+         ds.DepthTestEnable = true;
+         ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
+         break;
+
+      case BLORP_HIZ_OP_DEPTH_CLEAR:
+      case BLORP_HIZ_OP_HIZ_RESOLVE:
+         ds.DepthTestEnable = false;
+         break;
+      }
+   }
+
+   if (params->stencil.enabled) {
+      ds.StencilBufferWriteEnable = true;
+      ds.StencilTestEnable = true;
+      ds.DoubleSidedStencilEnable = false;
+
+      ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
+      ds.StencilPassDepthPassOp = STENCILOP_REPLACE;
+
+      ds.StencilWriteMask = params->stencil_mask;
+#if GEN_GEN >= 9
+      ds.StencilReferenceValue = params->stencil_ref;
+#endif
    }
 
 #if GEN_GEN >= 8
@@ -955,28 +997,13 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch,
    return offset;
 }
 
-struct surface_state_info {
-   unsigned num_dwords;
-   unsigned ss_align; /* Required alignment of RENDER_SURFACE_STATE in bytes */
-   unsigned reloc_dw;
-   unsigned aux_reloc_dw;
-};
-
-static const struct surface_state_info surface_state_infos[] = {
-   [6] = {6,  32, 1,  0},
-   [7] = {8,  32, 1,  6},
-   [8] = {13, 64, 8,  10},
-   [9] = {16, 64, 8,  10},
-};
-
 static void
 blorp_emit_surface_state(struct blorp_batch *batch,
                          const struct brw_blorp_surface_info *surface,
-                         uint32_t *state, uint32_t state_offset,
+                         void *state, uint32_t state_offset,
                          bool is_render_target)
 {
-   const struct surface_state_info ss_info = surface_state_infos[GEN_GEN];
-
+   const struct isl_device *isl_dev = batch->blorp->isl_dev;
    struct isl_surf surf = surface->surf;
 
    if (surf.dim == ISL_SURF_DIM_1D &&
@@ -998,7 +1025,7 @@ blorp_emit_surface_state(struct blorp_batch *batch,
                        .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
                        .mocs = mocs, .clear_color = surface->clear_color);
 
-   blorp_surface_reloc(batch, state_offset + ss_info.reloc_dw * 4,
+   blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                        surface->addr, 0);
 
    if (aux_usage != ISL_AUX_USAGE_NONE) {
@@ -1007,35 +1034,80 @@ blorp_emit_surface_state(struct blorp_batch *batch,
        * surface buffer addresses are always 4K page alinged.
        */
       assert((surface->aux_addr.offset & 0xfff) == 0);
-      blorp_surface_reloc(batch, state_offset + ss_info.aux_reloc_dw * 4,
-                          surface->aux_addr, state[ss_info.aux_reloc_dw]);
+      uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
+      blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
+                          surface->aux_addr, *aux_addr);
    }
 }
 
+static void
+blorp_emit_null_surface_state(struct blorp_batch *batch,
+                              const struct brw_blorp_surface_info *surface,
+                              uint32_t *state)
+{
+   struct GENX(RENDER_SURFACE_STATE) ss = {
+      .SurfaceType = SURFTYPE_NULL,
+      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
+      .Width = surface->surf.logical_level0_px.width - 1,
+      .Height = surface->surf.logical_level0_px.height - 1,
+      .MIPCountLOD = surface->view.base_level,
+      .MinimumArrayElement = surface->view.base_array_layer,
+      .Depth = surface->view.array_len - 1,
+      .RenderTargetViewExtent = surface->view.array_len - 1,
+      .NumberofMultisamples = ffs(surface->surf.samples) - 1,
+
+#if GEN_GEN >= 7
+      .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,
+#endif
+
+#if GEN_GEN >= 8
+      .TileMode = YMAJOR,
+#else
+      .TiledSurface = true,
+#endif
+   };
+
+   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);
+}
+
 static void
 blorp_emit_surface_states(struct blorp_batch *batch,
                           const struct blorp_params *params)
 {
+   const struct isl_device *isl_dev = batch->blorp->isl_dev;
    uint32_t bind_offset, surface_offsets[2];
    void *surface_maps[2];
 
-   const unsigned ss_size = GENX(RENDER_SURFACE_STATE_length) * 4;
-   const unsigned ss_align = GENX(RENDER_SURFACE_STATE_length) > 8 ? 64 : 32;
-
-   unsigned num_surfaces = 1 + (params->src.addr.buffer != NULL);
-   blorp_alloc_binding_table(batch, num_surfaces, ss_size, ss_align,
+   unsigned num_surfaces = 1 + params->src.enabled;
+   blorp_alloc_binding_table(batch, num_surfaces,
+                             isl_dev->ss.size, isl_dev->ss.align,
                              &bind_offset, surface_offsets, surface_maps);
 
-   blorp_emit_surface_state(batch, &params->dst,
-                            surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
-                            surface_offsets[BLORP_RENDERBUFFER_BT_INDEX], true);
-   if (params->src.addr.buffer) {
+   if (params->dst.enabled) {
+      blorp_emit_surface_state(batch, &params->dst,
+                               surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
+                               surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
+                               true);
+   } else {
+      assert(params->depth.enabled || params->stencil.enabled);
+      const struct brw_blorp_surface_info *surface =
+         params->depth.enabled ? &params->depth : &params->stencil;
+      blorp_emit_null_surface_state(batch, surface,
+                                    surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
+   }
+
+   if (params->src.enabled) {
       blorp_emit_surface_state(batch, &params->src,
                                surface_maps[BLORP_TEXTURE_BT_INDEX],
                                surface_offsets[BLORP_TEXTURE_BT_INDEX], false);
    }
 
 #if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);
+
    blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
       bt.PointertoPSBindingTable = bind_offset;
    }
@@ -1094,10 +1166,8 @@ static void
 blorp_emit_3dstate_multisample(struct blorp_batch *batch,
                                const struct blorp_params *params)
 {
-   const unsigned samples = params->dst.surf.samples;
-
    blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
-      ms.NumberofMultisamples       = __builtin_ffs(samples) - 1;
+      ms.NumberofMultisamples       = __builtin_ffs(params->num_samples) - 1;
 
 #if GEN_GEN >= 8
       /* The PRM says that this bit is valid only for DX9:
@@ -1110,7 +1180,7 @@ blorp_emit_3dstate_multisample(struct blorp_batch *batch,
 #elif GEN_GEN >= 7
       ms.PixelLocation              = PIXLOC_CENTER;
 
-      switch (samples) {
+      switch (params->num_samples) {
       case 1:
          GEN_SAMPLE_POS_1X(ms.Sample);
          break;
@@ -1186,8 +1256,8 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 
    if (params->wm_prog_data) {
       blend_state_offset = blorp_emit_blend_state(batch, params);
-      color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
    }
+   color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
    depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
 
 #if GEN_GEN <= 6
@@ -1224,16 +1294,15 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
    blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
    blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
 
-   if (params->wm_prog_data)
-      blorp_emit_surface_states(batch, params);
+   blorp_emit_surface_states(batch, params);
 
-   if (params->src.addr.buffer)
+   if (params->src.enabled)
       blorp_emit_sampler_state(batch, params);
 
    blorp_emit_3dstate_multisample(batch, params);
 
    blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
-      mask.SampleMask = (1 << params->dst.surf.samples) - 1;
+      mask.SampleMask = (1 << params->num_samples) - 1;
    }
 
    /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
@@ -1264,7 +1333,8 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 
    blorp_emit_viewport_state(batch, params);
 
-   blorp_emit_depth_stencil_config(batch, params);
+   if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
+      blorp_emit_depth_stencil_config(batch, params);
 
    blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType = SEQUENTIAL;