anv: Always fill out the AUX table even if CCS is disabled
[mesa.git] / src / intel / vulkan / genX_cmd_buffer.c
index 5fbdf93afbd69ae363fb9125f97974bb1ddfece6..608e2a06f6c9dce13110800020e695f934b4c247 100644 (file)
@@ -132,13 +132,21 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
        * these fields.  However, since we will be growing the BO's live, we
        * just set them all to the maximum.
        */
-      sba.GeneralStateBufferSize                = 0xfffff;
+      sba.GeneralStateBufferSize       = 0xfffff;
+      sba.IndirectObjectBufferSize     = 0xfffff;
+      if (device->physical->use_softpin) {
+         /* With softpin, we use fixed addresses so we actually know how big
+          * our base addresses are.
+          */
+         sba.DynamicStateBufferSize    = DYNAMIC_STATE_POOL_SIZE / 4096;
+         sba.InstructionBufferSize     = INSTRUCTION_STATE_POOL_SIZE / 4096;
+      } else {
+         sba.DynamicStateBufferSize    = 0xfffff;
+         sba.InstructionBufferSize     = 0xfffff;
+      }
       sba.GeneralStateBufferSizeModifyEnable    = true;
-      sba.DynamicStateBufferSize                = 0xfffff;
-      sba.DynamicStateBufferSizeModifyEnable    = true;
-      sba.IndirectObjectBufferSize              = 0xfffff;
       sba.IndirectObjectBufferSizeModifyEnable  = true;
-      sba.InstructionBufferSize                 = 0xfffff;
+      sba.DynamicStateBufferSizeModifyEnable    = true;
       sba.InstructionBuffersizeModifyEnable     = true;
 #  else
       /* On gen7, we have upper bounds instead.  According to the docs,
@@ -522,7 +530,7 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
 {
    uint32_t depth_plane =
       anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT);
-   if (image->planes[depth_plane].aux_surface.isl.size_B == 0)
+   if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
       return;
 
    const enum isl_aux_state initial_state =
@@ -990,6 +998,105 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
    }
 }
 
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
+#if GEN_GEN == 12
+static void
+anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
+                      const struct anv_image *image,
+                      VkImageAspectFlagBits aspect,
+                      uint32_t base_level, uint32_t level_count,
+                      uint32_t base_layer, uint32_t layer_count)
+{
+   uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);
+
+   uint64_t base_address =
+      anv_address_physical(image->planes[plane].address);
+
+   const struct isl_surf *isl_surf = &image->planes[plane].surface.isl;
+   uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf);
+
+   /* We're about to live-update the AUX-TT.  We really don't want anyone else
+    * trying to read it while we're doing this.  We could probably get away
+    * with not having this stall in some cases if we were really careful but
+    * it's better to play it safe.  Full stall the GPU.
+    */
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   struct gen_mi_builder b;
+   gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+   for (uint32_t a = 0; a < layer_count; a++) {
+      const uint32_t layer = base_layer + a;
+
+      uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
+      for (uint32_t l = 0; l < level_count; l++) {
+         const uint32_t level = base_level + l;
+
+         uint32_t logical_array_layer, logical_z_offset_px;
+         if (image->type == VK_IMAGE_TYPE_3D) {
+            logical_array_layer = 0;
+
+            /* If the given miplevel does not have this layer, then any higher
+             * miplevels won't either because miplevels only get smaller the
+             * higher the LOD.
+             */
+            assert(layer < image->extent.depth);
+            if (layer >= anv_minify(image->extent.depth, level))
+               break;
+            logical_z_offset_px = layer;
+         } else {
+            assert(layer < image->array_size);
+            logical_array_layer = layer;
+            logical_z_offset_px = 0;
+         }
+
+         uint32_t slice_start_offset_B, slice_end_offset_B;
+         isl_surf_get_image_range_B_tile(isl_surf, level,
+                                         logical_array_layer,
+                                         logical_z_offset_px,
+                                         &slice_start_offset_B,
+                                         &slice_end_offset_B);
+
+         start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
+         end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
+      }
+
+      /* Aux operates 64K at a time */
+      start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
+      end_offset_B = align_u64(end_offset_B, 64 * 1024);
+
+      for (uint64_t offset = start_offset_B;
+           offset < end_offset_B; offset += 64 * 1024) {
+         uint64_t address = base_address + offset;
+
+         uint64_t aux_entry_addr64, *aux_entry_map;
+         aux_entry_map = gen_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
+                                               address, &aux_entry_addr64);
+
+         assert(cmd_buffer->device->physical->use_softpin);
+         struct anv_address aux_entry_address = {
+            .bo = NULL,
+            .offset = aux_entry_addr64,
+         };
+
+         const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
+         uint64_t new_aux_entry =
+            (old_aux_entry & GEN_AUX_MAP_ADDRESS_MASK) | format_bits;
+
+         if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
+            new_aux_entry |= GEN_AUX_MAP_ENTRY_VALID_BIT;
+
+         gen_mi_store(&b, gen_mi_mem64(aux_entry_address),
+                          gen_mi_imm(new_aux_entry));
+      }
+   }
+
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
+}
+#endif /* GEN_GEN == 12 */
+
 /**
  * @brief Transitions a color buffer from one layout to another.
  *
@@ -1010,7 +1117,8 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
                         VkImageLayout initial_layout,
                         VkImageLayout final_layout)
 {
-   const struct gen_device_info *devinfo = &cmd_buffer->device->info;
+   struct anv_device *device = cmd_buffer->device;
+   const struct gen_device_info *devinfo = &device->info;
    /* Validate the inputs. */
    assert(cmd_buffer);
    assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
@@ -1059,6 +1167,16 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
 
    if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
        initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
+#if GEN_GEN == 12
+      if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
+         anv_image_init_aux_tt(cmd_buffer, image, aspect,
+                               base_level, level_count,
+                               base_layer, layer_count);
+      }
+#else
+      assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
+#endif
+
       /* A subresource in the undefined layout may have been aliased and
        * populated with any arrangement of bits. Therefore, we must initialize
        * the related aux buffer and clear buffer entry with desirable values.
@@ -1457,6 +1575,12 @@ genX(BeginCommandBuffer)(
     */
    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 
+   /* Re-emit the aux table register in every command buffer.  This way we're
+    * ensured that we have the table even if this command buffer doesn't
+    * initialize any images.
+    */
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
+
    /* We send an "Indirect State Pointers Disable" packet at
     * EndCommandBuffer, so all push contant packets are ignored during a
     * context restore. Documentation says after that command, we need to
@@ -1767,7 +1891,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
 
    uint32_t l3cr;
    anv_pack_struct(&l3cr, L3_ALLOCATION_REG,
-#if GEN_GEN < 12
+#if GEN_GEN < 11
                    .SLMEnable = has_slm,
 #endif
 #if GEN_GEN == 11
@@ -1908,6 +2032,12 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
       bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
    }
 
+   /* GEN:BUG:1409226450, Wait for EU to be idle before pipe control which
+    * invalidates the instruction cache
+    */
+   if (GEN_GEN == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT))
+      bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
+
    if ((GEN_GEN >= 8 && GEN_GEN <= 9) &&
        (bits & ANV_PIPE_CS_STALL_BIT) &&
        (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
@@ -2012,6 +2142,16 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
          }
       }
 
+#if GEN_GEN == 12
+      if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) &&
+          cmd_buffer->device->info.has_aux_map) {
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+            lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
+            lri.DataDWord = 1;
+         }
+      }
+#endif
+
       bits &= ~ANV_PIPE_INVALIDATE_BITS;
    }
 
@@ -2847,34 +2987,6 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
    cmd_buffer->state.push_constants_dirty &= ~flushed;
 }
 
-#if GEN_GEN >= 12
-void
-genX(cmd_buffer_aux_map_state)(struct anv_cmd_buffer *cmd_buffer)
-{
-   void *aux_map_ctx = cmd_buffer->device->aux_map_ctx;
-   if (!aux_map_ctx)
-      return;
-   uint32_t aux_map_state_num = gen_aux_map_get_state_num(aux_map_ctx);
-   if (cmd_buffer->state.last_aux_map_state != aux_map_state_num) {
-      /* If the aux-map state number increased, then we need to rewrite the
-       * register. Rewriting the register is used to both set the aux-map
-       * translation table address, and also to invalidate any previously
-       * cached translations.
-       */
-      uint64_t base_addr = gen_aux_map_get_base(aux_map_ctx);
-      anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
-         lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
-         lri.DataDWord = base_addr & 0xffffffff;
-      }
-      anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
-         lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
-         lri.DataDWord = base_addr >> 32;
-      }
-      cmd_buffer->state.last_aux_map_state = aux_map_state_num;
-   }
-}
-#endif
-
 void
 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
 {
@@ -2893,10 +3005,6 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
 
    genX(flush_pipeline_select_3d)(cmd_buffer);
 
-#if GEN_GEN >= 12
-   genX(cmd_buffer_aux_map_state)(cmd_buffer);
-#endif
-
    if (vb_emit) {
       const uint32_t num_buffers = __builtin_popcount(vb_emit);
       const uint32_t num_dwords = 1 + num_buffers * 4;
@@ -3786,10 +3894,6 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
 
    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
 
-#if GEN_GEN >= 12
-   genX(cmd_buffer_aux_map_state)(cmd_buffer);
-#endif
-
    if (cmd_buffer->state.compute.pipeline_dirty) {
       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
        *