From fc850080ee304c2a62f7313c4b7ebe121c3ebb53 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Mon, 23 Mar 2020 18:54:57 +0100 Subject: [PATCH] ir3: Rewrite UBO push analysis to support bindless Part-of: --- .../ir3/ir3_nir_analyze_ubo_ranges.c | 83 +++++++++++++++---- src/freedreno/ir3/ir3_shader.h | 11 ++- src/freedreno/vulkan/tu_cmd_buffer.c | 72 ++++++++-------- .../drivers/freedreno/ir3/ir3_gallium.c | 9 +- 4 files changed, 113 insertions(+), 62 deletions(-) diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c index 64599669f02..ee57f2db19b 100644 --- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c +++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c @@ -41,18 +41,62 @@ get_ubo_load_range(nir_intrinsic_instr *instr) return r; } +static struct ir3_ubo_range * +get_existing_range(nir_intrinsic_instr *instr, + struct ir3_ubo_analysis_state *state, + bool create_new) +{ + unsigned block, base = 0; + bool bindless; + if (nir_src_is_const(instr->src[0])) { + block = nir_src_as_uint(instr->src[0]); + bindless = false; + } else { + nir_intrinsic_instr *rsrc = ir3_bindless_resource(instr->src[0]); + if (rsrc && nir_src_is_const(rsrc->src[0])) { + block = nir_src_as_uint(rsrc->src[0]); + base = nir_intrinsic_desc_set(rsrc); + bindless = true; + } else { + return NULL; + } + } + for (int i = 0; i < IR3_MAX_UBO_PUSH_RANGES; i++) { + struct ir3_ubo_range *range = &state->range[i]; + if (range->end < range->start) { + /* We don't have a matching range, but there are more available. + */ + if (create_new) { + range->block = block; + range->bindless_base = base; + range->bindless = bindless; + return range; + } else { + return NULL; + } + } else if (range->block == block && range->bindless_base == base && + range->bindless == bindless) { + return range; + } + } + + return NULL; +} + static void gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr, struct ir3_ubo_analysis_state *state) { - if (!nir_src_is_const(instr->src[0])) + struct ir3_ubo_range *old_r = get_existing_range(instr, state, true); + if (!old_r) return; if (!nir_src_is_const(instr->src[1])) { - if (nir_src_as_uint(instr->src[0]) == 0) { + if (!old_r->bindless && old_r->block == 0) { /* If this is an indirect on UBO 0, we'll still lower it back to * load_uniform. Set the range to cover all of UBO 0. */ + state->range[0].start = 0; state->range[0].end = ALIGN(nir->num_uniforms * 16, 16 * 4); } @@ -60,18 +104,17 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr, } const struct ir3_ubo_range r = get_ubo_load_range(instr); - const uint32_t block = nir_src_as_uint(instr->src[0]); /* if UBO lowering is disabled, we still want to lower block 0 * (which is normal uniforms): */ - if ((block > 0) && (ir3_shader_debug & IR3_DBG_NOUBOOPT)) + if ((old_r->bindless || old_r->block != 0) && (ir3_shader_debug & IR3_DBG_NOUBOOPT)) return; - if (r.start < state->range[block].start) - state->range[block].start = r.start; - if (state->range[block].end < r.end) - state->range[block].end = r.end; + if (r.start < old_r->start) + old_r->start = r.start; + if (old_r->end < r.end) + old_r->end = r.end; } /* For indirect offset, it is common to see a pattern of multiple @@ -142,12 +185,11 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, * could probably with some effort determine a block stride in number of * registers. */ - if (!nir_src_is_const(instr->src[0])) + struct ir3_ubo_range *range = get_existing_range(instr, state, false); + if (!range) return; - const uint32_t block = nir_src_as_uint(instr->src[0]); - - if (block > 0) { + if (range->bindless || range->block > 0) { /* We don't lower dynamic array indexing either, but we definitely should. * We don't have a good way of determining the range of the dynamic * access, so for now just fall back to pulling. @@ -159,8 +201,7 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, * upload. Reject if we're now outside the range. */ const struct ir3_ubo_range r = get_ubo_load_range(instr); - if (!(state->range[block].start <= r.start && - r.end <= state->range[block].end)) + if (!(range->start <= r.start && r.end <= range->end)) return; } @@ -186,8 +227,7 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, debug_assert(!(const_offset & 0x3)); const_offset >>= 2; - const int range_offset = - (state->range[block].offset - state->range[block].start) / 4; + const int range_offset = (range->offset - range->start) / 4; const_offset += range_offset; nir_intrinsic_instr *uniform = @@ -213,6 +253,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader) struct ir3_ubo_analysis_state *state = &shader->ubo_state; memset(state, 0, sizeof(*state)); + for (int i = 0; i < IR3_MAX_UBO_PUSH_RANGES; i++) { + state->range[i].start = UINT32_MAX; + } nir_foreach_function (function, nir) { if (function->impl) { @@ -236,7 +279,13 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader) */ const uint32_t max_upload = 16 * 1024; uint32_t offset = shader->const_state.num_reserved_user_consts * 16; + state->num_enabled = ARRAY_SIZE(state->range); for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) { + if (state->range[i].start >= state->range[i].end) { + state->num_enabled = i; + break; + } + uint32_t range_size = state->range[i].end - state->range[i].start; debug_assert(offset <= max_upload); @@ -247,8 +296,6 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader) } offset += range_size; - if (state->range[i].start < state->range[i].end) - state->enabled |= 1 << i; } state->size = offset; diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index f9fdaa89c4e..63bec0d25dc 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -70,7 +70,7 @@ enum ir3_driver_param { #define IR3_MAX_SO_BUFFERS 4 #define IR3_MAX_SO_STREAMS 4 #define IR3_MAX_SO_OUTPUTS 64 -#define IR3_MAX_CONSTANT_BUFFERS 32 +#define IR3_MAX_UBO_PUSH_RANGES 32 /** @@ -619,13 +619,16 @@ ir3_shader_stage(struct ir3_shader_variant *v) } struct ir3_ubo_range { - uint32_t offset; /* start offset of this block in const register file */ + uint32_t offset; /* start offset to push in the const register file */ + uint32_t block; /* Which constant block */ uint32_t start, end; /* range of block that's actually used */ + uint16_t bindless_base; /* For bindless, which base register is used */ + bool bindless; }; struct ir3_ubo_analysis_state { - struct ir3_ubo_range range[IR3_MAX_CONSTANT_BUFFERS]; - uint32_t enabled; + struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES]; + uint32_t num_enabled; uint32_t size; uint32_t lower_count; uint32_t cmdstream_size; /* for per-gen backend to stash required cmdstream size */ diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 0bf8f56f8bf..3de6fb42afb 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -2691,49 +2691,47 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline, tu_cs_emit(cs, push_constants[i + offset * 4]); } - for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) { - if (state->range[i].start < state->range[i].end) { - uint32_t size = state->range[i].end - state->range[i].start; - uint32_t offset = state->range[i].start; + for (uint32_t i = 0; i < state->num_enabled; i++) { + uint32_t size = state->range[i].end - state->range[i].start; + uint32_t offset = state->range[i].start; - /* and even if the start of the const buffer is before - * first_immediate, the end may not be: - */ - size = MIN2(size, (16 * link->constlen) - state->range[i].offset); + /* and even if the start of the const buffer is before + * first_immediate, the end may not be: + */ + size = MIN2(size, (16 * link->constlen) - state->range[i].offset); - if (size == 0) - continue; + if (size == 0) + continue; - /* things should be aligned to vec4: */ - debug_assert((state->range[i].offset % 16) == 0); - debug_assert((size % 16) == 0); - debug_assert((offset % 16) == 0); + /* things should be aligned to vec4: */ + debug_assert((state->range[i].offset % 16) == 0); + debug_assert((size % 16) == 0); + debug_assert((offset % 16) == 0); - /* Look through the UBO map to find our UBO index, and get the VA for - * that UBO. - */ - uint64_t va = 0; - uint32_t ubo_idx = i - 1; - uint32_t ubo_map_base = 0; - for (int j = 0; j < link->ubo_map.num; j++) { - if (ubo_idx >= ubo_map_base && - ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) { - va = buffer_ptr(descriptors_state, &link->ubo_map, j, - ubo_idx - ubo_map_base); - break; - } - ubo_map_base += link->ubo_map.array_size[j]; + /* Look through the UBO map to find our UBO index, and get the VA for + * that UBO. + */ + uint64_t va = 0; + uint32_t ubo_idx = state->range[i].block - 1; + uint32_t ubo_map_base = 0; + for (int j = 0; j < link->ubo_map.num; j++) { + if (ubo_idx >= ubo_map_base && + ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) { + va = buffer_ptr(descriptors_state, &link->ubo_map, j, + ubo_idx - ubo_map_base); + break; } - assert(va); - - tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); - tu_cs_emit_qw(cs, va + offset); + ubo_map_base += link->ubo_map.array_size[j]; } + assert(va); + + tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | + CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); + tu_cs_emit_qw(cs, va + offset); } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index 56972a81b9d..8fff7da0c5e 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -259,9 +259,12 @@ ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant * struct ir3_ubo_analysis_state *state; state = &v->shader->ubo_state; - uint32_t i; - foreach_bit(i, state->enabled & constbuf->enabled_mask) { - struct pipe_constant_buffer *cb = &constbuf->cb[i]; + for (unsigned i = 0; i < state->num_enabled; i++) { + assert(!state->range[i].bindless); + unsigned ubo = state->range[i].block; + if (!(constbuf->enabled_mask & (1 << ubo))) + continue; + struct pipe_constant_buffer *cb = &constbuf->cb[ubo]; uint32_t size = state->range[i].end - state->range[i].start; uint32_t offset = cb->buffer_offset + state->range[i].start; -- 2.30.2