ir3: Rewrite UBO push analysis to support bindless
author     Connor Abbott <cwabbott0@gmail.com>
Mon, 23 Mar 2020 17:54:57 +0000 (18:54 +0100)
committer  Marge Bot <eric+marge@anholt.net>
Thu, 9 Apr 2020 15:56:55 +0000 (15:56 +0000)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4358>
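
The existing analysis indexes its range array directly by UBO block number
and tracks which entries are live with an "enabled" bitmask, which cannot
describe bindless UBOs. Keep a compact list of ranges instead, with each
entry keyed by (bindless, bindless_base, block); for a bindless access the
key comes from the constant descriptor set and index of the ir3 bindless
resource intrinsic feeding the load (non-constant indices are still not
pushed). The bitmask becomes num_enabled, and the turnip and gallium
backends are updated to walk the new list.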

src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
src/freedreno/ir3/ir3_shader.h
src/freedreno/vulkan/tu_cmd_buffer.c
src/gallium/drivers/freedreno/ir3/ir3_gallium.c

diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index 64599669f02c5c2e5f38d76519627938c43dd448..ee57f2db19bea04d0226f2bc24a0115c19057074 100644
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -41,18 +41,62 @@ get_ubo_load_range(nir_intrinsic_instr *instr)
        return r;
 }
 
+static struct ir3_ubo_range *
+get_existing_range(nir_intrinsic_instr *instr,
+                                  struct ir3_ubo_analysis_state *state,
+                                  bool create_new)
+{
+       unsigned block, base = 0;
+       bool bindless;
+       if (nir_src_is_const(instr->src[0])) {
+               block = nir_src_as_uint(instr->src[0]);
+               bindless = false;
+       } else {
+               nir_intrinsic_instr *rsrc = ir3_bindless_resource(instr->src[0]);
+               if (rsrc && nir_src_is_const(rsrc->src[0])) {
+                       block = nir_src_as_uint(rsrc->src[0]);
+                       base = nir_intrinsic_desc_set(rsrc);
+                       bindless = true;
+               } else {
+                       return NULL;
+               }
+       }
+       for (int i = 0; i < IR3_MAX_UBO_PUSH_RANGES; i++) {
+               struct ir3_ubo_range *range = &state->range[i];
+               if (range->end < range->start) {
+                       /* We don't have a matching range, but there are more available.
+                        */
+                       if (create_new) {
+                               range->block = block;
+                               range->bindless_base = base;
+                               range->bindless = bindless;
+                               return range;
+                       } else {
+                               return NULL;
+                       }
+               } else if (range->block == block && range->bindless_base == base &&
+                                  range->bindless == bindless) {
+                       return range;
+               }
+       }
+
+       return NULL;
+}
+
 static void
 gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
                                  struct ir3_ubo_analysis_state *state)
 {
-       if (!nir_src_is_const(instr->src[0]))
+       struct ir3_ubo_range *old_r = get_existing_range(instr, state, true);
+       if (!old_r)
                return;
 
        if (!nir_src_is_const(instr->src[1])) {
-               if (nir_src_as_uint(instr->src[0]) == 0) {
+               if (!old_r->bindless && old_r->block == 0) {
                        /* If this is an indirect on UBO 0, we'll still lower it back to
                         * load_uniform.  Set the range to cover all of UBO 0.
                         */
+                       state->range[0].start = 0;
                        state->range[0].end = ALIGN(nir->num_uniforms * 16, 16 * 4);
                }
 
@@ -60,18 +104,17 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
        }
 
        const struct ir3_ubo_range r = get_ubo_load_range(instr);
-       const uint32_t block = nir_src_as_uint(instr->src[0]);
 
        /* if UBO lowering is disabled, we still want to lower block 0
         * (which is normal uniforms):
         */
-       if ((block > 0) && (ir3_shader_debug & IR3_DBG_NOUBOOPT))
+       if ((old_r->bindless || old_r->block != 0) && (ir3_shader_debug & IR3_DBG_NOUBOOPT))
                return;
 
-       if (r.start < state->range[block].start)
-               state->range[block].start = r.start;
-       if (state->range[block].end < r.end)
-               state->range[block].end = r.end;
+       if (r.start < old_r->start)
+               old_r->start = r.start;
+       if (old_r->end < r.end)
+               old_r->end = r.end;
 }
 
 /* For indirect offset, it is common to see a pattern of multiple
@@ -142,12 +185,11 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
         * could probably with some effort determine a block stride in number of
         * registers.
         */
-       if (!nir_src_is_const(instr->src[0]))
+       struct ir3_ubo_range *range = get_existing_range(instr, state, false);
+       if (!range)
                return;
 
-       const uint32_t block = nir_src_as_uint(instr->src[0]);
-
-       if (block > 0) {
+       if (range->bindless || range->block > 0) {
                /* We don't lower dynamic array indexing either, but we definitely should.
                 * We don't have a good way of determining the range of the dynamic
                 * access, so for now just fall back to pulling.
@@ -159,8 +201,7 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
                 * upload. Reject if we're now outside the range.
                 */
                const struct ir3_ubo_range r = get_ubo_load_range(instr);
-               if (!(state->range[block].start <= r.start &&
-                         r.end <= state->range[block].end))
+               if (!(range->start <= r.start && r.end <= range->end))
                        return;
        }
 
@@ -186,8 +227,7 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
        debug_assert(!(const_offset & 0x3));
        const_offset >>= 2;
 
-       const int range_offset =
-               (state->range[block].offset - state->range[block].start) / 4;
+       const int range_offset = (range->offset - range->start) / 4;
        const_offset += range_offset;
 
        nir_intrinsic_instr *uniform =
@@ -213,6 +253,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
        struct ir3_ubo_analysis_state *state = &shader->ubo_state;
 
        memset(state, 0, sizeof(*state));
+       for (int i = 0; i < IR3_MAX_UBO_PUSH_RANGES; i++) {
+               state->range[i].start = UINT32_MAX;
+       }
 
        nir_foreach_function (function, nir) {
                if (function->impl) {
@@ -236,7 +279,13 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
         */
        const uint32_t max_upload = 16 * 1024;
        uint32_t offset = shader->const_state.num_reserved_user_consts * 16;
+       state->num_enabled = ARRAY_SIZE(state->range);
        for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
+               if (state->range[i].start >= state->range[i].end) {
+                       state->num_enabled = i;
+                       break;
+               }
+
                uint32_t range_size = state->range[i].end - state->range[i].start;
 
                debug_assert(offset <= max_upload);
@@ -247,8 +296,6 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
                }
                offset += range_size;
 
-               if (state->range[i].start < state->range[i].end)
-                       state->enabled |= 1 << i;
        }
        state->size = offset;
 
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index f9fdaa89c4e2876de547c3291c473720843c3829..63bec0d25dc9d554f23880ae023684aa93804c95 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -70,7 +70,7 @@ enum ir3_driver_param {
 #define IR3_MAX_SO_BUFFERS        4
 #define IR3_MAX_SO_STREAMS        4
 #define IR3_MAX_SO_OUTPUTS       64
-#define IR3_MAX_CONSTANT_BUFFERS 32
+#define IR3_MAX_UBO_PUSH_RANGES  32
 
 
 /**
@@ -619,13 +619,16 @@ ir3_shader_stage(struct ir3_shader_variant *v)
 }
 
 struct ir3_ubo_range {
-       uint32_t offset; /* start offset of this block in const register file */
+       uint32_t offset; /* start offset to push in the const register file */
+       uint32_t block; /* Which constant block */
        uint32_t start, end; /* range of block that's actually used */
+       uint16_t bindless_base; /* For bindless, which base register is used */
+       bool bindless;
 };
 
 struct ir3_ubo_analysis_state {
-       struct ir3_ubo_range range[IR3_MAX_CONSTANT_BUFFERS];
-       uint32_t enabled;
+       struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
+       uint32_t num_enabled;
        uint32_t size;
        uint32_t lower_count;
        uint32_t cmdstream_size; /* for per-gen backend to stash required cmdstream size */
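
To make the new fields concrete: block is the UBO index, and for a bindless
range bindless_base records which bindless descriptor-set base the index
applies to. A sketch of how a consumer could resolve a range to a buffer
address follows; both helpers are hypothetical, introduced only for
illustration.

/* Hypothetical helpers -- not part of ir3 or of this patch. */
uint64_t get_bound_ubo_va(uint32_t block);
uint64_t get_bindless_ubo_va(uint16_t desc_set_base, uint32_t index);

static uint64_t
resolve_range_va(const struct ir3_ubo_range *range)
{
	if (range->bindless) {
		/* 'block' is an index into the bindless set 'bindless_base' */
		return get_bindless_ubo_va(range->bindless_base, range->block);
	}
	/* classic bound UBO slot */
	return get_bound_ubo_va(range->block);
}
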
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 0bf8f56f8bfe44600f3dcd9a24a1eddcbdb54af1..3de6fb42afb807a4721a3c4cee0ff85ce6a523b6 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -2691,49 +2691,47 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
          tu_cs_emit(cs, push_constants[i + offset * 4]);
    }
 
-   for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
-      if (state->range[i].start < state->range[i].end) {
-         uint32_t size = state->range[i].end - state->range[i].start;
-         uint32_t offset = state->range[i].start;
+   for (uint32_t i = 0; i < state->num_enabled; i++) {
+      uint32_t size = state->range[i].end - state->range[i].start;
+      uint32_t offset = state->range[i].start;
 
-         /* and even if the start of the const buffer is before
-          * first_immediate, the end may not be:
-          */
-         size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
+      /* and even if the start of the const buffer is before
+       * first_immediate, the end may not be:
+       */
+      size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
 
-         if (size == 0)
-            continue;
+      if (size == 0)
+         continue;
 
-         /* things should be aligned to vec4: */
-         debug_assert((state->range[i].offset % 16) == 0);
-         debug_assert((size % 16) == 0);
-         debug_assert((offset % 16) == 0);
+      /* things should be aligned to vec4: */
+      debug_assert((state->range[i].offset % 16) == 0);
+      debug_assert((size % 16) == 0);
+      debug_assert((offset % 16) == 0);
 
-         /* Look through the UBO map to find our UBO index, and get the VA for
-          * that UBO.
-          */
-         uint64_t va = 0;
-         uint32_t ubo_idx = i - 1;
-         uint32_t ubo_map_base = 0;
-         for (int j = 0; j < link->ubo_map.num; j++) {
-            if (ubo_idx >= ubo_map_base &&
-                ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) {
-               va = buffer_ptr(descriptors_state, &link->ubo_map, j,
-                               ubo_idx - ubo_map_base);
-               break;
-            }
-            ubo_map_base += link->ubo_map.array_size[j];
+      /* Look through the UBO map to find our UBO index, and get the VA for
+       * that UBO.
+       */
+      uint64_t va = 0;
+      uint32_t ubo_idx = state->range[i].block - 1;
+      uint32_t ubo_map_base = 0;
+      for (int j = 0; j < link->ubo_map.num; j++) {
+         if (ubo_idx >= ubo_map_base &&
+             ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) {
+            va = buffer_ptr(descriptors_state, &link->ubo_map, j,
+                            ubo_idx - ubo_map_base);
+            break;
          }
-         assert(va);
-
-         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
-         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
-               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-               CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-               CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
-         tu_cs_emit_qw(cs, va + offset);
+         ubo_map_base += link->ubo_map.array_size[j];
       }
+      assert(va);
+
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
+            CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+            CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+            CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+            CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
+      tu_cs_emit_qw(cs, va + offset);
    }
 }
 
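For example (assumed numbers): a range with block = 2, start = 32, end = 96
and a const-file offset of 128 bytes emits DST_OFF = 128 / 16 = 8 and
NUM_UNIT = (96 - 32) / 16 = 4 vec4s, with the indirect source set to va + 32,
where va is looked up for UBO index 1 (block - 1) in link->ubo_map; this
assumes the shader's constlen is large enough that the size is not clamped.
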
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 56972a81b9d55eab1d0e27b966d70148908bf27e..8fff7da0c5e6219dffaab487f38357898c45e60b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -259,9 +259,12 @@ ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant *
        struct ir3_ubo_analysis_state *state;
        state = &v->shader->ubo_state;
 
-       uint32_t i;
-       foreach_bit(i, state->enabled & constbuf->enabled_mask) {
-               struct pipe_constant_buffer *cb = &constbuf->cb[i];
+       for (unsigned i = 0; i < state->num_enabled; i++) {
+               assert(!state->range[i].bindless);
+               unsigned ubo = state->range[i].block;
+               if (!(constbuf->enabled_mask & (1 << ubo)))
+                       continue;
+               struct pipe_constant_buffer *cb = &constbuf->cb[ubo];
 
                uint32_t size = state->range[i].end - state->range[i].start;
                uint32_t offset = cb->buffer_offset + state->range[i].start;
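
Note that the gallium path only deals with bound constant buffers: it asserts
that no range is bindless and skips any range whose UBO is not currently set
in constbuf->enabled_mask.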