anv: Pre-compute push ranges for graphics pipelines
author Jason Ekstrand <jason@jlekstrand.net>
Fri, 8 Nov 2019 15:42:30 +0000 (09:42 -0600)
committer Jason Ekstrand <jason@jlekstrand.net>
Mon, 18 Nov 2019 18:35:14 +0000 (18:35 +0000)
It turns out that emitting push constants is one of the hottest paths in
the driver and ANY work we do there costs us.  By pre-computing things a
bit ahead of time, we shave 5% off the runtime of a CPU-limited example
running with the Dawn WebGPU implementation.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
src/intel/Makefile.sources
src/intel/vulkan/anv_nir.h
src/intel/vulkan/anv_nir_compute_push_layout.c [new file with mode: 0644]
src/intel/vulkan/anv_pipeline.c
src/intel/vulkan/anv_pipeline_cache.c
src/intel/vulkan/anv_private.h
src/intel/vulkan/genX_cmd_buffer.c
src/intel/vulkan/meson.build

index f50b7c991d5b223a8b9ca3305671e04b96f1d8d0..b3861af32f55109829962b2a543197cb3159cc4e 100644 (file)
@@ -258,6 +258,7 @@ VULKAN_FILES := \
        vulkan/anv_nir.h \
        vulkan/anv_nir_add_base_work_group_id.c \
        vulkan/anv_nir_apply_pipeline_layout.c \
+       vulkan/anv_nir_compute_push_layout.c \
        vulkan/anv_nir_lower_multiview.c \
        vulkan/anv_nir_lower_push_constants.c \
        vulkan/anv_nir_lower_ycbcr_textures.c \
index 844e5b0bfd40bd021291968fb3ecef428c7ba2bd..8977599607e52774c8f2916f6fbc3756d3611c4c 100644 (file)
@@ -59,6 +59,10 @@ void anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
                                    struct brw_stage_prog_data *prog_data,
                                    struct anv_pipeline_bind_map *map);
 
+void anv_compute_push_layout(const struct anv_physical_device *pdevice,
+                             struct brw_stage_prog_data *prog_data,
+                             struct anv_pipeline_bind_map *map);
+
 bool anv_nir_add_base_work_group_id(nir_shader *shader,
                                     struct brw_cs_prog_data *prog_data);
 
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c
new file mode 100644 (file)
index 0000000..72bff55
--- /dev/null
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_nir.h"
+#include "compiler/brw_nir.h"
+
+/**
+ * Pre-compute the push ranges for one shader stage and store them in
+ * map->push_ranges so they do not have to be derived at draw time.
+ *
+ * pdevice    Only info.gen and info.is_haswell are read, to choose how
+ *            slots are assigned.
+ * prog_data  nr_params and ubo_ranges[0..3] are read.
+ * map        surface_to_descriptor[] is read; push_ranges[] is written.
+ *
+ * Slots that are not assigned below are left untouched.
+ * NOTE(review): presumably the caller hands in a zero-initialized
+ * push_ranges array so untouched slots read as length 0 -- confirm.
+ */
+void
+anv_compute_push_layout(const struct anv_physical_device *pdevice,
+                        struct brw_stage_prog_data *prog_data,
+                        struct anv_pipeline_bind_map *map)
+{
+   /* Range covering the regular push constants.  anv_push_range lengths
+    * are in units of 32B; eight params are packed per unit (presumably
+    * 4-byte params -- confirm).  index, dynamic_offset_index and start
+    * are implicitly zero.
+    */
+   struct anv_push_range push_constant_range = {
+      .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS,
+      .length = DIV_ROUND_UP(prog_data->nr_params, 8),
+   };
+
+   if (pdevice->info.gen >= 8 || pdevice->info.is_haswell) {
+      /* The Skylake PRM contains the following restriction:
+       *
+       *    "The driver must ensure The following case does not occur
+       *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+       *     buffer 3 read length equal to zero committed followed by a
+       *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+       *     zero committed."
+       *
+       * To avoid this, we program the buffers in the highest slots.
+       * This way, slot 0 is only used if slot 3 is also used.
+       */
+      /* Next free slot, filled from 3 downwards.
+       * NOTE(review): n is decremented once per non-empty UBO range and
+       * once more for the push constants.  With four non-empty UBO ranges
+       * and nr_params > 0 the final store would land at push_ranges[-1].
+       * Presumably the UBO range analysis never produces that
+       * combination -- confirm.
+       */
+      int n = 3;
+
+      for (int i = 3; i >= 0; i--) {
+         const struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
+         if (ubo_range->length == 0)
+            continue;
+
+         /* NOTE(review): block is used directly as the surface index; the
+          * draw-time code this replaces added
+          * prog_data->binding_table.ubo_start first -- confirm that
+          * offset is always zero here.
+          */
+         const struct anv_pipeline_binding *binding =
+            &map->surface_to_descriptor[ubo_range->block];
+
+         /* Copy what is needed to locate the buffer later, so the binding
+          * does not have to be looked up again at draw time.
+          */
+         map->push_ranges[n--] = (struct anv_push_range) {
+            .set = binding->set,
+            .index = binding->index,
+            .dynamic_offset_index = binding->dynamic_offset_index,
+            .start = ubo_range->start,
+            .length = ubo_range->length,
+         };
+      }
+
+      /* Regular push constants take the slot just below the UBO ranges. */
+      if (push_constant_range.length > 0)
+         map->push_ranges[n--] = push_constant_range;
+   } else {
+      /* For Ivy Bridge, the push constants packets have a different
+       * rule that would require us to iterate in the other direction
+       * and possibly mess around with dynamic state base address.
+       * Don't bother; just emit regular push constants at n = 0.
+       */
+      /* Stored unconditionally, even when the length is zero. */
+      map->push_ranges[0] = push_constant_range;
+   }
+}
index 0f582e2801ed57468cac42fe775029fe6a6bf65b..bcc62b77a3d5cbe8af5c645c4f7f3729437b1c4f 100644 (file)
@@ -1394,6 +1394,10 @@ anv_pipeline_compile_graphics(struct anv_pipeline *pipeline,
          goto fail;
       }
 
+      anv_compute_push_layout(&pipeline->device->instance->physicalDevice,
+                              &stages[s].prog_data.base,
+                              &stages[s].bind_map);
+
       struct anv_shader_bin *bin =
          anv_device_upload_kernel(pipeline->device, cache,
                                   &stages[s].cache_key,
index e1d48b879b07effede1be87a0355a76b0e36a120..a4294e1eb6096d7767264ba4ccf3888f33a48e96 100644 (file)
@@ -169,6 +169,8 @@ anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader,
    blob_write_bytes(blob, shader->bind_map.sampler_to_descriptor,
                     shader->bind_map.sampler_count *
                     sizeof(*shader->bind_map.sampler_to_descriptor));
+   blob_write_bytes(blob, shader->bind_map.push_ranges,
+                    sizeof(shader->bind_map.push_ranges));
 
    return !blob->out_of_memory;
 }
@@ -212,6 +214,7 @@ anv_shader_bin_create_from_blob(struct anv_device *device,
    bind_map.sampler_to_descriptor = (void *)
       blob_read_bytes(blob, bind_map.sampler_count *
                             sizeof(*bind_map.sampler_to_descriptor));
+   blob_copy_bytes(blob, bind_map.push_ranges, sizeof(bind_map.push_ranges));
 
    if (blob->overrun)
       return NULL;
index 5b8ac491ce570dfd84bd6021f1438110fbba54d9..82e582269de20be28bc3d897e1327341d3849af6 100644 (file)
@@ -2006,6 +2006,7 @@ anv_descriptor_set_destroy(struct anv_device *device,
                            struct anv_descriptor_pool *pool,
                            struct anv_descriptor_set *set);
 
+#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS   (UINT8_MAX - 4)
 #define ANV_DESCRIPTOR_SET_DESCRIPTORS      (UINT8_MAX - 3)
 #define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS  (UINT8_MAX - 2)
 #define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1)
@@ -2041,6 +2042,23 @@ struct anv_pipeline_binding {
    bool write_only;
 };
 
+/**
+ * One range of data to be pushed to a shader stage.
+ *
+ * A length of zero marks the range as unused; the command-buffer code
+ * skips such entries.  The set field is either a descriptor set index
+ * (below MAX_SETS) or one of the special ANV_DESCRIPTOR_SET_* values,
+ * which changes how index is interpreted.
+ */
+struct anv_push_range {
+   /** Index in the descriptor set
+    *
+    * When set is ANV_DESCRIPTOR_SET_DESCRIPTORS, this is instead the index
+    * of the descriptor set whose descriptor buffer is pushed.
+    */
+   uint32_t index;
+
+   /** Descriptor set index */
+   uint8_t set;
+
+   /** Dynamic offset index (for dynamic UBOs) */
+   uint8_t dynamic_offset_index;
+
+   /** Start offset in units of 32B */
+   uint8_t start;
+
+   /** Range in units of 32B */
+   uint8_t length;
+};
+
 struct anv_pipeline_layout {
    struct {
       struct anv_descriptor_set_layout *layout;
@@ -2912,6 +2930,8 @@ struct anv_pipeline_bind_map {
 
    struct anv_pipeline_binding *                surface_to_descriptor;
    struct anv_pipeline_binding *                sampler_to_descriptor;
+
+   struct anv_push_range                        push_ranges[4];
 };
 
 struct anv_shader_bin_key {
index a0df116b05cf5f7f70c0f4fd6fb2cdec3f9cfb1e..4f35df86fc19bf4860017551434889944b9c2d5c 100644 (file)
@@ -2528,98 +2528,60 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
          c._3DCommandSubOpcode = push_constant_opcodes[stage];
 
          if (anv_pipeline_has_stage(pipeline, stage)) {
-#if GEN_GEN >= 8 || GEN_IS_HASWELL
-            const struct brw_stage_prog_data *prog_data =
-               pipeline->shaders[stage]->prog_data;
             const struct anv_pipeline_bind_map *bind_map =
                &pipeline->shaders[stage]->bind_map;
 
-            /* The Skylake PRM contains the following restriction:
-             *
-             *    "The driver must ensure The following case does not occur
-             *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
-             *     buffer 3 read length equal to zero committed followed by a
-             *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
-             *     zero committed."
-             *
-             * To avoid this, we program the buffers in the highest slots.
-             * This way, slot 0 is only used if slot 3 is also used.
-             */
-            int n = 3;
-
-            for (int i = 3; i >= 0; i--) {
-               const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+            for (unsigned i = 0; i < 4; i++) {
+               const struct anv_push_range *range = &bind_map->push_ranges[i];
                if (range->length == 0)
                   continue;
 
-               const unsigned surface =
-                  prog_data->binding_table.ubo_start + range->block;
-
-               assert(surface <= bind_map->surface_count);
-               const struct anv_pipeline_binding *binding =
-                  &bind_map->surface_to_descriptor[surface];
-
                struct anv_address addr;
-               if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) {
+               switch (range->set) {
+               case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
                   /* This is a descriptor set buffer so the set index is
                    * actually given by range->index.  (Yes, that's
                    * confusing.)
                    */
                   struct anv_descriptor_set *set =
-                     gfx_state->base.descriptors[binding->index];
-
+                     gfx_state->base.descriptors[range->index];
                   addr = anv_descriptor_set_address(cmd_buffer, set);
-               } else {
-                  assert(binding->set < MAX_SETS);
+                  break;
+               }
+
+               case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
+                  struct anv_state state =
+                     anv_cmd_buffer_push_constants(cmd_buffer, stage);
+                  addr = (struct anv_address) {
+                     .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+                     .offset = state.offset,
+                  };
+                  break;
+               }
+
+               default: {
+                  assert(range->set < MAX_SETS);
                   struct anv_descriptor_set *set =
-                     gfx_state->base.descriptors[binding->set];
+                     gfx_state->base.descriptors[range->set];
                   const struct anv_descriptor *desc =
-                     &set->descriptors[binding->index];
+                     &set->descriptors[range->index];
 
                   if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
                      addr = desc->buffer_view->address;
                   } else {
                      assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
-
                      uint32_t dynamic_offset =
-                        gfx_state->base.dynamic_offsets[binding->dynamic_offset_index];
+                        gfx_state->base.dynamic_offsets[range->dynamic_offset_index];
                      addr = anv_address_add(desc->buffer->address,
                                             desc->offset + dynamic_offset);
                   }
                }
+               }
 
-               c.ConstantBody.Buffer[n] =
+               c.ConstantBody.ReadLength[i] = range->length;
+               c.ConstantBody.Buffer[i] =
                   anv_address_add(addr, range->start * 32);
-               c.ConstantBody.ReadLength[n] = range->length;
-               n--;
-            }
-
-            struct anv_state state =
-               anv_cmd_buffer_push_constants(cmd_buffer, stage);
-
-            if (state.alloc_size > 0) {
-               c.ConstantBody.Buffer[n] = (struct anv_address) {
-                  .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
-                  .offset = state.offset,
-               };
-               c.ConstantBody.ReadLength[n] =
-                  DIV_ROUND_UP(state.alloc_size, 32);
             }
-#else
-            /* For Ivy Bridge, the push constants packets have a different
-             * rule that would require us to iterate in the other direction
-             * and possibly mess around with dynamic state base address.
-             * Don't bother; just emit regular push constants at n = 0.
-             */
-            struct anv_state state =
-               anv_cmd_buffer_push_constants(cmd_buffer, stage);
-
-            if (state.alloc_size > 0) {
-               c.ConstantBody.Buffer[0].offset = state.offset,
-               c.ConstantBody.ReadLength[0] =
-                  DIV_ROUND_UP(state.alloc_size, 32);
-            }
-#endif
          }
       }
 
index c21d7bd25074bc7305ca3bcf41ff8ac8c658d657..31127aabf11ed2658e7b548ca86eba445365edf5 100644 (file)
@@ -114,6 +114,7 @@ libanv_files = files(
   'anv_nir.h',
   'anv_nir_add_base_work_group_id.c',
   'anv_nir_apply_pipeline_layout.c',
+  'anv_nir_compute_push_layout.c',
   'anv_nir_lower_multiview.c',
   'anv_nir_lower_push_constants.c',
   'anv_nir_lower_ycbcr_textures.c',