intel/blorp: emit VF caching workaround before 3DSTATE_VERTEX_BUFFERS
[mesa.git] / src / intel / blorp / blorp_genX_exec.h
index b612709035aaf8a7c45dbb4ca313aeb0e49b95f1..9010b03fb671ad6217a0d5b7b3c65fd5cec30b87 100644 (file)
@@ -59,6 +59,10 @@ blorp_alloc_dynamic_state(struct blorp_batch *batch,
 static void *
 blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                           struct blorp_address *addr);
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+                                           const struct blorp_address *addrs,
+                                           unsigned num_vbs);
 
 #if GEN_GEN >= 8
 static struct blorp_address
@@ -78,7 +82,7 @@ static void
 blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                     struct blorp_address address, uint32_t delta);
 
-#if GEN_GEN >= 7 && GEN_GEN <= 10
+#if GEN_GEN >= 7 && GEN_GEN < 10
 static struct blorp_address
 blorp_get_surface_base_address(struct blorp_batch *batch);
 #endif
@@ -200,6 +204,14 @@ emit_urb_config(struct blorp_batch *batch,
    blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
 }
 
+#if GEN_GEN >= 7
+static void
+blorp_emit_memcpy(struct blorp_batch *batch,
+                  struct blorp_address dst,
+                  struct blorp_address src,
+                  uint32_t size);
+#endif
+
 static void
 blorp_emit_vertex_data(struct blorp_batch *batch,
                        const struct blorp_params *params,
@@ -260,6 +272,31 @@ blorp_emit_input_varying_data(struct blorp_batch *batch,
    }
 
    blorp_flush_range(batch, data, *size);
+
+   if (params->dst_clear_color_as_input) {
+#if GEN_GEN >= 7
+      /* In this case, the clear color isn't known statically and instead
+       * comes in through an indirect which we have to copy into the vertex
+       * buffer before we execute the 3DPRIMITIVE.  We already copied the
+       * value of params->wm_inputs.clear_color into the vertex buffer in the
+       * loop above.  Now we emit code to stomp it from the GPU with the
+       * actual clear color value.
+       */
+      assert(num_varyings == 1);
+
+      /* The clear color is the first thing after the header */
+      struct blorp_address clear_color_input_addr = *addr;
+      clear_color_input_addr.offset += 16;
+
+      const unsigned clear_color_size =
+         GEN_GEN < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
+      blorp_emit_memcpy(batch, clear_color_input_addr,
+                        params->dst.clear_color_addr,
+                        clear_color_size);
+#else
+      unreachable("MCS partial resolve is not a thing on SNB and earlier");
+#endif
+   }
 }
 
 static void
@@ -274,7 +311,7 @@ blorp_fill_vertex_buffer_state(struct blorp_batch *batch,
    vb[idx].BufferPitch = stride;
 
 #if GEN_GEN >= 6
-   vb[idx].VertexBufferMOCS = addr.mocs;
+   vb[idx].MOCS = addr.mocs;
 #endif
 
 #if GEN_GEN >= 7
@@ -298,23 +335,19 @@ blorp_emit_vertex_buffers(struct blorp_batch *batch,
                           const struct blorp_params *params)
 {
    struct GENX(VERTEX_BUFFER_STATE) vb[3];
+   uint32_t num_vbs = 2;
    memset(vb, 0, sizeof(vb));
 
-   struct blorp_address addr;
+   struct blorp_address addrs[2] = {};
    uint32_t size;
-   blorp_emit_vertex_data(batch, params, &addr, &size);
-   blorp_fill_vertex_buffer_state(batch, vb, 0, addr, size, 3 * sizeof(float));
+   blorp_emit_vertex_data(batch, params, &addrs[0], &size);
+   blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], size,
+                                  3 * sizeof(float));
 
-   blorp_emit_input_varying_data(batch, params, &addr, &size);
-   blorp_fill_vertex_buffer_state(batch, vb, 1, addr, size, 0);
+   blorp_emit_input_varying_data(batch, params, &addrs[1], &size);
+   blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], size, 0);
 
-   uint32_t num_vbs = 2;
-   if (params->dst_clear_color_as_input) {
-      blorp_fill_vertex_buffer_state(batch, vb, num_vbs++,
-                                     params->dst.clear_color_addr,
-                                     batch->blorp->isl_dev->ss.clear_value_size,
-                                     0);
-   }
+   blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, num_vbs);
 
    const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
@@ -448,49 +481,21 @@ blorp_emit_vertex_elements(struct blorp_batch *batch,
    };
    slot++;
 
-   if (params->dst_clear_color_as_input) {
-      /* If the caller wants the destination indirect clear color, redirect
-       * to vertex buffer 2 where we stored it earlier.  The only users of
-       * an indirect clear color source have that as their only vertex
-       * attribute.
-       */
-      assert(num_varyings == 1);
+   for (unsigned i = 0; i < num_varyings; ++i) {
       ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
-         .VertexBufferIndex = 2,
+         .VertexBufferIndex = 1,
          .Valid = true,
-         .SourceElementOffset = 0,
-         .Component0Control = VFCOMP_STORE_SRC,
-#if GEN_GEN >= 9
          .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+         .SourceElementOffset = 16 + i * 4 * sizeof(float),
+         .Component0Control = VFCOMP_STORE_SRC,
          .Component1Control = VFCOMP_STORE_SRC,
          .Component2Control = VFCOMP_STORE_SRC,
          .Component3Control = VFCOMP_STORE_SRC,
-#else
-         /* Clear colors on gen7-8 are for bits out of one dword */
-         .SourceElementFormat = ISL_FORMAT_R32_FLOAT,
-         .Component1Control = VFCOMP_STORE_0,
-         .Component2Control = VFCOMP_STORE_0,
-         .Component3Control = VFCOMP_STORE_0,
+#if GEN_GEN <= 5
+         .DestinationElementOffset = slot * 4,
 #endif
       };
       slot++;
-   } else {
-      for (unsigned i = 0; i < num_varyings; ++i) {
-         ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
-            .VertexBufferIndex = 1,
-            .Valid = true,
-            .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
-            .SourceElementOffset = 16 + i * 4 * sizeof(float),
-            .Component0Control = VFCOMP_STORE_SRC,
-            .Component1Control = VFCOMP_STORE_SRC,
-            .Component2Control = VFCOMP_STORE_SRC,
-            .Component3Control = VFCOMP_STORE_SRC,
-#if GEN_GEN <= 5
-            .DestinationElementOffset = slot * 4,
-#endif
-         };
-         slot++;
-      }
    }
 
    const unsigned num_dwords =
@@ -757,18 +762,45 @@ blorp_emit_ps_config(struct blorp_batch *batch,
          ps.BindingTableEntryCount = 1;
       }
 
-      if (prog_data) {
-         ps.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
-         ps.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
+     /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to
+      * disable prefetching of binding tables on A0 and B0 steppings.
+      * TODO: Revisit this WA on C0 stepping.
+      */
+      if (GEN_GEN == 11)
+         ps.BindingTableEntryCount = 0;
 
+      if (prog_data) {
          ps._8PixelDispatchEnable = prog_data->dispatch_8;
          ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+         /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+          *
+          *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+          *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+          *
+          * Since 16x MSAA is first introduced on SKL, we don't need to apply
+          * the workaround on any older hardware.
+          */
+         if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+             params->num_samples == 16) {
+            assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+            ps._32PixelDispatchEnable = false;
+         }
 
-         ps.KernelStartPointer0 = params->wm_prog_kernel;
-         ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
       }
 
       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
@@ -862,17 +894,23 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 #endif
 
       if (prog_data) {
+         ps._8PixelDispatchEnable = prog_data->dispatch_8;
+         ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
          ps.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
          ps.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
 
-         ps.KernelStartPointer0 = params->wm_prog_kernel;
-         ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
-
-         ps._8PixelDispatchEnable = prog_data->dispatch_8;
-         ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
 
          ps.AttributeEnable = prog_data->num_varying_inputs > 0;
       } else {
@@ -924,17 +962,23 @@ blorp_emit_ps_config(struct blorp_batch *batch,
       if (prog_data) {
          wm.ThreadDispatchEnable = true;
 
+         wm._8PixelDispatchEnable = prog_data->dispatch_8;
+         wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm._32PixelDispatchEnable = prog_data->dispatch_32;
+
          wm.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
          wm.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
-
-         wm.KernelStartPointer0 = params->wm_prog_kernel;
-         wm.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);
 
-         wm._8PixelDispatchEnable = prog_data->dispatch_8;
-         wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+         wm.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+         wm.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 2);
 
          wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
       }
@@ -1243,7 +1287,7 @@ blorp_emit_pipeline(struct blorp_batch *batch,
 
 #endif /* GEN_GEN >= 6 */
 
-#if GEN_GEN >= 7 && GEN_GEN <= 10
+#if GEN_GEN >= 7
 static void
 blorp_emit_memcpy(struct blorp_batch *batch,
                   struct blorp_address dst,
@@ -1282,6 +1326,7 @@ blorp_emit_memcpy(struct blorp_batch *batch,
 static void
 blorp_emit_surface_state(struct blorp_batch *batch,
                          const struct brw_blorp_surface_info *surface,
+                         enum isl_aux_op op,
                          void *state, uint32_t state_offset,
                          const bool color_write_disables[4],
                          bool is_render_target)
@@ -1312,11 +1357,15 @@ blorp_emit_surface_state(struct blorp_batch *batch,
          write_disable_mask |= ISL_CHANNEL_ALPHA_BIT;
    }
 
+   const bool use_clear_address =
+      GEN_GEN >= 10 && (surface->clear_color_addr.buffer != NULL);
+
    isl_surf_fill_state(batch->blorp->isl_dev, state,
                        .surf = &surf, .view = &surface->view,
                        .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
                        .mocs = surface->addr.mocs,
                        .clear_color = surface->clear_color,
+                       .use_clear_address = use_clear_address,
                        .write_disables = write_disable_mask);
 
    blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
@@ -1333,20 +1382,26 @@ blorp_emit_surface_state(struct blorp_batch *batch,
                           surface->aux_addr, *aux_addr);
    }
 
-   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
-
    if (surface->clear_color_addr.buffer) {
-#if GEN_GEN > 10
-      unreachable("Implement indirect clear support on gen11+");
-#elif GEN_GEN >= 7 && GEN_GEN <= 10
-      struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
-      dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
-      blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
-                        isl_dev->ss.clear_value_size);
+#if GEN_GEN >= 10
+      assert((surface->clear_color_addr.offset & 0x3f) == 0);
+      uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
+      blorp_surface_reloc(batch, state_offset +
+                          isl_dev->ss.clear_color_state_offset,
+                          surface->clear_color_addr, *clear_addr);
+#elif GEN_GEN >= 7
+      if (op == ISL_AUX_OP_FULL_RESOLVE || op == ISL_AUX_OP_PARTIAL_RESOLVE) {
+         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
+         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
+         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
+                           isl_dev->ss.clear_value_size);
+      }
 #else
       unreachable("Fast clears are only supported on gen7+");
 #endif
    }
+
+   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
 }
 
 static void
@@ -1388,7 +1443,7 @@ blorp_emit_surface_states(struct blorp_batch *batch,
                           const struct blorp_params *params)
 {
    const struct isl_device *isl_dev = batch->blorp->isl_dev;
-   uint32_t bind_offset, surface_offsets[2];
+   uint32_t bind_offset = 0, surface_offsets[2];
    void *surface_maps[2];
 
    MAYBE_UNUSED bool has_indirect_clear_color = false;
@@ -1402,6 +1457,7 @@ blorp_emit_surface_states(struct blorp_batch *batch,
 
       if (params->dst.enabled) {
          blorp_emit_surface_state(batch, &params->dst,
+                                  params->fast_clear_op,
                                   surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                   surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
                                   params->color_write_disable, true);
@@ -1417,6 +1473,7 @@ blorp_emit_surface_states(struct blorp_batch *batch,
 
       if (params->src.enabled) {
          blorp_emit_surface_state(batch, &params->src,
+                                  params->fast_clear_op,
                                   surface_maps[BLORP_TEXTURE_BT_INDEX],
                                   surface_offsets[BLORP_TEXTURE_BT_INDEX],
                                   NULL, false);
@@ -1571,6 +1628,29 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
     */
    blorp_emit_3dstate_multisample(batch, params);
 
+   /* From the BDW PRM Volume 7, Depth Buffer Clear:
+    *
+    *    The clear value must be between the min and max depth values
+    *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
+    *    D32_FLOAT, then +/-DENORM values are also allowed.
+    *
+    * Set the bounds to match our hardware limits, [0.0, 1.0].
+    */
+   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
+      assert(params->depth.clear_color.f32[0] >= 0.0f);
+      assert(params->depth.clear_color.f32[0] <= 1.0f);
+      blorp_emit_cc_viewport(batch);
+   }
+
+   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
+    * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
+    * even when WM_HZ_OP is active.  However, WM thread dispatch is normally
+    * disabled for HiZ ops and it appears that force-enabling it can lead to
+    * GPU hangs on at least Skylake.  Since we don't know the current state of
+    * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
+    */
+   blorp_emit(batch, GENX(3DSTATE_WM), wm);
+
    /* If we can't alter the depth stencil config and multiple layers are
     * involved, the HiZ op will fail. This is because the op requires that a
     * new config is emitted for each additional layer.
@@ -1629,6 +1709,51 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
 }
 #endif
 
+static void
+blorp_update_clear_color(struct blorp_batch *batch,
+                         const struct brw_blorp_surface_info *info,
+                         enum isl_aux_op op)
+{
+   if (info->clear_color_addr.buffer && op == ISL_AUX_OP_FAST_CLEAR) {
+#if GEN_GEN >= 9
+      for (int i = 0; i < 4; i++) {
+         blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+            sdi.Address = info->clear_color_addr;
+            sdi.Address.offset += i * 4;
+            sdi.ImmediateData = info->clear_color.u32[i];
+         }
+      }
+#elif GEN_GEN >= 7
+      blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+         sdi.Address = info->clear_color_addr;
+         sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
+                             ISL_CHANNEL_SELECT_GREEN << 22 |
+                             ISL_CHANNEL_SELECT_BLUE  << 19 |
+                             ISL_CHANNEL_SELECT_ALPHA << 16;
+         if (isl_format_has_int_channel(info->view.format)) {
+            for (unsigned i = 0; i < 4; i++) {
+               assert(info->clear_color.u32[i] == 0 ||
+                      info->clear_color.u32[i] == 1);
+            }
+            sdi.ImmediateData |= (info->clear_color.u32[0] != 0) << 31;
+            sdi.ImmediateData |= (info->clear_color.u32[1] != 0) << 30;
+            sdi.ImmediateData |= (info->clear_color.u32[2] != 0) << 29;
+            sdi.ImmediateData |= (info->clear_color.u32[3] != 0) << 28;
+         } else {
+            for (unsigned i = 0; i < 4; i++) {
+               assert(info->clear_color.f32[i] == 0.0f ||
+                      info->clear_color.f32[i] == 1.0f);
+            }
+            sdi.ImmediateData |= (info->clear_color.f32[0] != 0.0f) << 31;
+            sdi.ImmediateData |= (info->clear_color.f32[1] != 0.0f) << 30;
+            sdi.ImmediateData |= (info->clear_color.f32[2] != 0.0f) << 29;
+            sdi.ImmediateData |= (info->clear_color.f32[3] != 0.0f) << 28;
+         }
+      }
+#endif
+   }
+}
+
 /**
  * \brief Execute a blit or render pass operation.
  *
@@ -1641,6 +1766,11 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
 static void
 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 {
+   if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
+      blorp_update_clear_color(batch, &params->dst, params->fast_clear_op);
+      blorp_update_clear_color(batch, &params->depth, params->hiz_op);
+   }
+
 #if GEN_GEN >= 8
    if (params->hiz_op != ISL_AUX_OP_NONE) {
       blorp_emit_gen8_hiz_op(batch, params);