From 46af0ecc1d1f060786a1c2dfede1f936b407fbf6 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 25 Nov 2019 21:55:51 -0600 Subject: [PATCH] anv: Use PIPE_CONTROL flushes to implement the gen8 VF cache WA Reviewed-by: Lionel Landwerlin --- src/intel/vulkan/anv_device.c | 14 ++- src/intel/vulkan/anv_genX.h | 8 ++ src/intel/vulkan/anv_private.h | 26 +++++ src/intel/vulkan/genX_blorp_exec.c | 33 +++--- src/intel/vulkan/genX_cmd_buffer.c | 180 +++++++++++++++++++++++++++++ src/intel/vulkan/genX_gpu_memcpy.c | 4 + 6 files changed, 245 insertions(+), 20 deletions(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 2c48003b276..be4d23356e3 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -141,8 +141,12 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd) } } + /* We only allow 48-bit addresses with softpin because knowing the actual + * address is required for the vertex cache flush workaround. + */ device->supports_48bit_addresses = (device->info.gen >= 8) && - gtt_size > (4ULL << 30 /* GiB */); + device->has_softpin && + gtt_size > (4ULL << 30 /* GiB */); uint64_t heap_size = anv_compute_heap_size(fd, gtt_size); @@ -471,10 +475,6 @@ anv_physical_device_init(struct anv_physical_device *device, goto fail; } - result = anv_physical_device_init_heaps(device, fd); - if (result != VK_SUCCESS) - goto fail; - device->has_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN); device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC); device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE); @@ -484,6 +484,10 @@ anv_physical_device_init(struct anv_physical_device *device, anv_gem_supports_syncobj_wait(fd); device->has_context_priority = anv_gem_has_context_priority(fd); + result = anv_physical_device_init_heaps(device, fd); + if (result != VK_SUCCESS) + goto fail; + device->use_softpin = device->has_softpin && device->supports_48bit_addresses; diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index 0274fe8b3a8..8c2a0e40099 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -44,6 +44,14 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer); void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer); +void genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size); +void genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used); + void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, unsigned width, unsigned height, unsigned scale); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index fd6f0fdb104..2abbb866b2f 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2503,6 +2503,27 @@ struct anv_attachment_state { struct anv_image_view * image_view; }; +/** State tracking for vertex buffer flushes + * + * On Gen8-9, the VF cache only considers the bottom 32 bits of memory + * addresses. If you happen to have two vertex buffers which get placed + * exactly 4 GiB apart and use them in back-to-back draw calls, you can get + * collisions. In order to solve this problem, we track vertex address ranges + * which are live in the cache and invalidate the cache if one ever exceeds 32 + * bits. 
+ */ +struct anv_vb_cache_range { + /* Virtual address at which the live vertex buffer cache range starts for + * this vertex buffer index. + */ + uint64_t start; + + /* Virtual address of the byte after where vertex buffer cache range ends. + * This is exclusive such that end - start is the size of the range. + */ + uint64_t end; +}; + /** State tracking for particular pipeline bind point * * This struct is the base struct for anv_cmd_graphics_state and @@ -2531,6 +2552,11 @@ struct anv_cmd_graphics_state { anv_cmd_dirty_mask_t dirty; uint32_t vb_dirty; + struct anv_vb_cache_range ib_bound_range; + struct anv_vb_cache_range ib_dirty_range; + struct anv_vb_cache_range vb_bound_ranges[33]; + struct anv_vb_cache_range vb_dirty_ranges[33]; + struct anv_dynamic_state dynamic; struct { diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 79e18d95282..302acb54461 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -139,19 +139,6 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size, struct blorp_address *addr) { struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; - - /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: - * - * "The VF cache needs to be invalidated before binding and then using - * Vertex Buffers that overlap with any previously bound Vertex Buffer - * (at a 64B granularity) since the last invalidation. A VF cache - * invalidate is performed by setting the "VF Cache Invalidation Enable" - * bit in PIPE_CONTROL." - * - * This restriction first appears in the Skylake PRM but the internal docs - * also list it as being an issue on Broadwell. In order to avoid this - * problem, we align all vertex buffer allocations to 64 bytes. - */ struct anv_state vb_state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64); @@ -170,9 +157,25 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch, uint32_t *sizes, unsigned num_vbs) { - /* anv forces all vertex buffers into the low 4GB so there are never any - * transitions that require a VF invalidation. + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + for (unsigned i = 0; i < num_vbs; i++) { + struct anv_address anv_addr = { + .bo = addrs[i].buffer, + .offset = addrs[i].offset, + }; + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, + i, anv_addr, sizes[i]); + } + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Technically, we should call this *after* 3DPRIMITIVE but it doesn't + * really matter for blorp because we never call apply_pipe_flushes after + * this point. */ + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL, + (1 << num_vbs) - 1); } #if GEN_GEN >= 8 diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 677377ea302..c764011eb5f 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1392,6 +1392,10 @@ genX(BeginCommandBuffer)( * executing anything. The chances are fairly high that they will use * blorp at least once per primary command buffer so it shouldn't be * wasted. + * + * There is also a workaround on gen8 which requires us to invalidate the + * VF cache occasionally. It's easier if we can assume we start with a + * fresh cache (See also genX(cmd_buffer_set_binding_for_gen8_vb_flush).) 
*/ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; @@ -1598,6 +1602,14 @@ genX(CmdExecuteCommands)( anv_cmd_buffer_add_secondary(primary, secondary); } + /* The secondary isn't counted in our VF cache tracking so we need to + * invalidate the whole thing. + */ + if (GEN_GEN >= 8 && GEN_GEN <= 9) { + primary->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } + /* The secondary may have selected a different pipeline (3D or compute) and * may have changed the current L3$ configuration. Reset our tracking * variables to invalid values to ensure that we re-emit these in the case @@ -1836,6 +1848,18 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT; } + if ((GEN_GEN >= 8 && GEN_GEN <= 9) && + (bits & ANV_PIPE_CS_STALL_BIT) && + (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) { + /* If we are doing a VF cache invalidate AND a CS stall (it must be + * both) then we can reset our vertex cache tracking. + */ + memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0, + sizeof(cmd_buffer->state.gfx.vb_dirty_ranges)); + memset(&cmd_buffer->state.gfx.ib_dirty_range, 0, + sizeof(cmd_buffer->state.gfx.ib_dirty_range)); + } + if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { #if GEN_GEN >= 12 @@ -2830,6 +2854,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) #endif }; +#if GEN_GEN >= 8 && GEN_GEN <= 9 + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, vb, + state.BufferStartingAddress, + state.BufferSize); +#endif + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); i++; } @@ -2967,6 +2997,9 @@ emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, .EndAddress = anv_address_add(addr, size), #endif }); + + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, + index, addr, size); } static void @@ -3014,6 +3047,25 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); } +static void +update_dirty_vbs_for_gen8_vb_flush(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type) +{ + struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + uint64_t vb_used = pipeline->vb_used; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + vb_used |= 1ull << ANV_SVGS_VB_INDEX; + if (vs_prog_data->uses_drawid) + vb_used |= 1ull << ANV_DRAWID_VB_INDEX; + + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, + access_type == RANDOM, + vb_used); +} + void genX(CmdDraw)( VkCommandBuffer commandBuffer, uint32_t vertexCount, @@ -3059,6 +3111,8 @@ void genX(CmdDraw)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = 0; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); } void genX(CmdDrawIndexed)( @@ -3107,6 +3161,8 @@ void genX(CmdDrawIndexed)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = vertexOffset; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); } /* Auto-Draw / Indirect Registers */ @@ -3179,6 +3235,8 @@ void genX(CmdDrawIndirectByteCountEXT)( prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = pipeline->topology; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); #endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */ } @@ -3263,6 +3321,8 @@ void genX(CmdDrawIndirect)( 
prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + offset += stride; } } @@ -3311,6 +3371,8 @@ void genX(CmdDrawIndexedIndirect)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + offset += stride; } } @@ -3465,6 +3527,8 @@ void genX(CmdDrawIndirectCountKHR)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + offset += stride; } } @@ -3530,6 +3594,8 @@ void genX(CmdDrawIndexedIndirectCountKHR)( prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + offset += stride; } } @@ -4115,6 +4181,120 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) } } +/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: + * + * "The VF cache needs to be invalidated before binding and then using + * Vertex Buffers that overlap with any previously bound Vertex Buffer + * (at a 64B granularity) since the last invalidation. A VF cache + * invalidate is performed by setting the "VF Cache Invalidation Enable" + * bit in PIPE_CONTROL." + * + * This is implemented by carefully tracking all vertex and index buffer + * bindings and flushing if the cache ever ends up with a range in the cache + * that would exceed 4 GiB. This is implemented in three parts: + * + * 1. genX(cmd_buffer_set_binding_for_gen8_vb_flush)() which must be called + * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the + * tracking code of the new binding. If this new binding would cause + * the cache to have a too-large range on the next draw call, a pipeline + * stall and VF cache invalidate are added to pending_pipeline_bits. + * + * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to + * empty whenever we emit a VF invalidate. + * + * 3. genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)() must be called + * after every 3DPRIMITIVE and copies the bound range into the dirty + * range for each used buffer. This has to be a separate step because + * we don't always re-bind all buffers and so 1. can't know which + * buffers are actually bound. 
+ */ +void +genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size) +{ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->instance->physicalDevice.use_softpin) + return; + + struct anv_vb_cache_range *bound, *dirty; + if (vb_index == -1) { + bound = &cmd_buffer->state.gfx.ib_bound_range; + dirty = &cmd_buffer->state.gfx.ib_dirty_range; + } else { + assert(vb_index >= 0); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index]; + } + + if (vb_size == 0) { + bound->start = 0; + bound->end = 0; + return; + } + + assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED)); + bound->start = gen_48b_address(anv_address_physical(vb_address)); + bound->end = bound->start + vb_size; + assert(bound->end > bound->start); /* No overflow */ + + /* Align everything to a cache line */ + bound->start &= ~(64ull - 1ull); + bound->end = align_u64(bound->end, 64); + + /* Compute the dirty range */ + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + + /* If our range is larger than 32 bits, we have to flush */ + assert(bound->end - bound->start <= (1ull << 32)); + if (dirty->end - dirty->start > (1ull << 32)) { + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } +} + +void +genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used) +{ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->instance->physicalDevice.use_softpin) + return; + + if (access_type == RANDOM) { + /* We have an index buffer */ + struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range; + struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } + + uint64_t mask = vb_used; + while (mask) { + int i = u_bit_scan64(&mask); + assert(i >= 0); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + + struct anv_vb_cache_range *bound, *dirty; + bound = &cmd_buffer->state.gfx.vb_bound_ranges[i]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i]; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } +} + /** * Update the pixel hashing modes that determine the balancing of PS threads * across subslices and slices. 
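
For reference, the merge-and-check arithmetic that the three-step scheme described in the comment above genX(cmd_buffer_set_binding_for_gen8_vb_flush) relies on can be sketched in isolation. The snippet below is a simplified stand-alone model and is not part of the patch: cache_range and bind_needs_flush are made-up stand-ins for anv_vb_cache_range and the driver helpers, and only the range-growing and 32-bit overflow test mirror the code above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for anv_vb_cache_range: a [start, end) byte range
 * that may currently be live in the VF cache.
 */
struct cache_range {
   uint64_t start;
   uint64_t end;
};

/* Merge a new binding into the dirty range and report whether the combined
 * range now spans more than 32 bits of address space, i.e. whether the
 * driver would queue a CS stall + VF cache invalidate.
 */
static bool
bind_needs_flush(struct cache_range *dirty, uint64_t address, uint32_t size)
{
   if (size == 0)
      return false;

   /* Align the new range to 64B cache lines, the granularity the PRM quote
    * above calls out.
    */
   uint64_t start = address & ~63ull;
   uint64_t end = (address + size + 63) & ~63ull;

   /* Grow the dirty range; an empty range is represented as start == end. */
   if (dirty->start == dirty->end) {
      dirty->start = start;
      dirty->end = end;
   } else {
      if (start < dirty->start)
         dirty->start = start;
      if (end > dirty->end)
         dirty->end = end;
   }

   return dirty->end - dirty->start > (1ull << 32);
}

int
main(void)
{
   struct cache_range dirty = { 0, 0 };

   /* Two buffers about 1 MiB apart: the live range stays well under 4 GiB. */
   printf("flush: %d\n", bind_needs_flush(&dirty, 0x100000000ull, 4096));
   printf("flush: %d\n", bind_needs_flush(&dirty, 0x100100000ull, 4096));

   /* A third buffer slightly more than 4 GiB past the first: the merged
    * range exceeds 32 bits, so the driver would invalidate the VF cache and
    * reset its tracking before the next 3DPRIMITIVE.
    */
   printf("flush: %d\n", bind_needs_flush(&dirty, 0x200200000ull, 4096));
   return 0;
}

The real driver additionally keeps a separate bound range per binding and re-merges the ranges of every buffer a draw actually used after the 3DPRIMITIVE (step 3), but the overflow test is the same.
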
diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c index 5af7085393e..28de5def12d 100644 --- a/src/intel/vulkan/genX_gpu_memcpy.c +++ b/src/intel/vulkan/genX_gpu_memcpy.c @@ -78,6 +78,7 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, genX(cmd_buffer_config_l3)(cmd_buffer, cfg); } + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, 32, src, size); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); genX(flush_pipeline_select_3d)(cmd_buffer); @@ -229,5 +230,8 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, prim.BaseVertexLocation = 0; } + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL, + 1ull << 32); + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; } -- 2.30.2
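
As background for the workaround: per the anv_vb_cache_range comment in anv_private.h above, the Gen8-9 VF cache considers only the low 32 bits of a vertex address, so two buffers placed exactly 4 GiB apart alias to the same cache tag. The snippet below is a stand-alone illustration of that aliasing, not part of the patch; the addresses are made up rather than real allocations.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
   /* Hypothetical 48-bit virtual addresses of two vertex buffers that a
    * softpin allocator could legally hand out.
    */
   const uint64_t buf_a = 0x0000000140000000ull;
   const uint64_t buf_b = buf_a + (1ull << 32); /* exactly 4 GiB higher */

   /* The Gen8-9 VF cache effectively keys entries by the low 32 bits only. */
   printf("tag(buf_a) = 0x%08" PRIx32 "\n", (uint32_t)buf_a);
   printf("tag(buf_b) = 0x%08" PRIx32 "\n", (uint32_t)buf_b);

   /* Both lines print 0x40000000: back-to-back draws that use buf_a and
    * then buf_b could hit stale cache entries unless a VF cache invalidate
    * is emitted in between, which is what the tracking in this patch
    * arranges.
    */
   return 0;
}
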