From 2b2524099379b96a6dbeab037a25cbf5d71da7df Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Tue, 18 Aug 2020 15:45:02 -0700
Subject: [PATCH] freedreno/ir3: Replace our custom vec4 UBO intrinsic with
 the shared lowering.

This gets us fewer comparisons in the shaders that we need to optimize
back out, and reduces backend code.

total instructions in shared programs: 11547270 -> 7219930 (-37.48%)
total full in shared programs: 334268 -> 319602 (-4.39%)

Reviewed-by: Kristian H. Kristensen
Reviewed-by: Connor Abbott
Part-of:
---
 src/compiler/nir/nir_intrinsics.py           |  6 --
 src/freedreno/ir3/ir3_compiler_nir.c         |  4 +-
 src/freedreno/ir3/ir3_nir.c                  |  3 +
 .../ir3/ir3_nir_analyze_ubo_ranges.c         |  4 +-
 src/freedreno/ir3/ir3_nir_lower_io_offsets.c | 81 ------------------
 5 files changed, 7 insertions(+), 91 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index ee02b86e502..8d00446a2a2 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -815,12 +815,6 @@ intrinsic("ssbo_atomic_xor_ir3", src_comp=[1, 1, 1, 1], dest_comp=1, i
 intrinsic("ssbo_atomic_exchange_ir3", src_comp=[1, 1, 1, 1], dest_comp=1, indices=[ACCESS])
 intrinsic("ssbo_atomic_comp_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1, indices=[ACCESS])
 
-# IR3-specific instruction for UBO loads using the ldc instruction. The second
-# source is the indirect offset, in units of four dwords. The base is a
-# component offset, in dword units.
-intrinsic("load_ubo_ir3", src_comp=[1, 1], bit_sizes=[32], dest_comp=0, indices=[BASE],
-          flags=[CAN_REORDER, CAN_ELIMINATE])
-
 # System values for freedreno geometry shaders.
 system_value("vs_primitive_stride_ir3", 1)
 system_value("vs_vertex_stride_ir3", 1)
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 04a2dd9cea4..8172e113c1a 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -754,7 +754,7 @@ emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
    struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
    ldc->regs[0]->wrmask = MASK(ncomp);
    ldc->cat6.iim_val = ncomp;
-   ldc->cat6.d = nir_intrinsic_base(intr);
+   ldc->cat6.d = nir_intrinsic_component(intr);
    ldc->cat6.type = TYPE_U32;
 
    ir3_handle_bindless_cat6(ldc, intr->src[0]);
@@ -1647,7 +1647,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    case nir_intrinsic_load_ubo:
       emit_intrinsic_load_ubo(ctx, intr, dst);
       break;
-   case nir_intrinsic_load_ubo_ir3:
+   case nir_intrinsic_load_ubo_vec4:
       emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
       break;
    case nir_intrinsic_load_frag_coord:
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 7b373dff242..bd73beefac9 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -461,6 +461,9 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
    /* UBO offset lowering has to come after we've decided what will
    * be left as load_ubo
    */
+   if (so->shader->compiler->gpu_id >= 600)
+      OPT_V(s, nir_lower_ubo_vec4);
+
    OPT_V(s, ir3_nir_lower_io_offsets, so->shader->compiler->gpu_id);
 
    if (progress)
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index 900768edb03..8a9503feb74 100644
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -327,8 +327,8 @@ instr_is_load_ubo(nir_instr *instr)
    nir_intrinsic_op op =
       nir_instr_as_intrinsic(instr)->intrinsic;
 
-   /* ir3_nir_lower_io_offsets happens after this pass. */
-   assert(op != nir_intrinsic_load_ubo_ir3);
+   /* nir_lower_ubo_vec4 happens after this pass. */
+   assert(op != nir_intrinsic_load_ubo_vec4);
 
    return op == nir_intrinsic_load_ubo;
 }
diff --git a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
index 36c48cf1299..110197d93b9 100644
--- a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
+++ b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
@@ -253,81 +253,6 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
    return true;
 }
 
-static bool
-lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b, int gpu_id)
-{
-   /* We only need to lower offset if using LDC, which takes an offset in
-    * vec4 units and has the start component baked into the instruction.
-    */
-   if (gpu_id < 600)
-      return false;
-
-   /* TODO handle other bitsizes, including non-dword-aligned loads */
-   assert(intrinsic->dest.ssa.bit_size == 32);
-
-   b->cursor = nir_before_instr(&intrinsic->instr);
-
-   nir_intrinsic_instr *new_intrinsic =
-      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_ir3);
-
-   debug_assert(intrinsic->dest.is_ssa);
-   new_intrinsic->src[0] = intrinsic->src[0];
-
-   nir_ssa_def *offset = intrinsic->src[1].ssa;
-   nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -4);
-
-   if (!new_offset)
-      new_offset = nir_ushr(b, offset, nir_imm_int(b, 4));
-
-   new_intrinsic->src[1] = nir_src_for_ssa(new_offset);
-
-   unsigned align_mul = nir_intrinsic_align_mul(intrinsic);
-   unsigned align_offset = nir_intrinsic_align_offset(intrinsic);
-
-   unsigned components = intrinsic->num_components;
-
-   if (align_mul % 16 != 0)
-      components = 4;
-
-   new_intrinsic->num_components = components;
-
-   nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
-                     components, 32, NULL);
-
-   nir_builder_instr_insert(b, &new_intrinsic->instr);
-
-   nir_ssa_def *new_dest;
-   if (align_mul % 16 == 0) {
-      /* We know that the low 4 bits of the offset are constant and equal to
-       * align_offset. Use the component offset.
-       */
-      unsigned component = align_offset / 4;
-      nir_intrinsic_set_base(new_intrinsic, component);
-      new_dest = &new_intrinsic->dest.ssa;
-   } else {
-      /* We have to assume it isn't aligned, and extract the components
-       * dynamically.
-       */
-      nir_intrinsic_set_base(new_intrinsic, 0);
-      nir_ssa_def *component =
-         nir_iand(b, nir_ushr(b, offset, nir_imm_int(b, 2)), nir_imm_int(b, 3));
-      nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
-      for (unsigned i = 0; i < intrinsic->num_components; i++) {
-         nir_ssa_def *idx = nir_iadd(b, nir_imm_int(b, i), component);
-         channels[i] = nir_vector_extract(b, &new_intrinsic->dest.ssa, idx);
-      }
-
-      new_dest = nir_vec(b, channels, intrinsic->num_components);
-   }
-
-   nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa,
-                            nir_src_for_ssa(new_dest));
-
-   nir_instr_remove(&intrinsic->instr);
-
-   return true;
-}
-
 static bool
 lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_id)
 {
@@ -339,12 +264,6 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_
       nir_intrinsic_instr *intr =
          nir_instr_as_intrinsic(instr);
 
-      /* UBO */
-      if (intr->intrinsic == nir_intrinsic_load_ubo) {
-         progress |= lower_offset_for_ubo(intr, b, gpu_id);
-         continue;
-      }
-
       /* SSBO */
       int ir3_intrinsic;
       uint8_t offset_src_idx;
-- 
2.30.2
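
Note for context: nir_lower_ubo_vec4 is the shared NIR pass this patch
switches to. It rewrites load_ubo into load_ubo_vec4, whose offset source is
in vec4 (16-byte) units and whose starting component is carried in the
intrinsic's COMPONENT index -- which is why emit_intrinsic_load_ubo_ldc above
now reads nir_intrinsic_component() where it used to read
nir_intrinsic_base(). The pass is still gated to gpu_id >= 600 because only
those parts have the ldc instruction, per the comment deleted from
ir3_nir_lower_io_offsets.c. The sketch below shows the rough shape of that
lowering for a 32-bit load whose alignment is not known to be a multiple of
16; it paraphrases the shared pass rather than quoting it, and the locals
`load` (the original load_ubo) and `b` (a nir_builder at the load) are
illustrative names, not the pass's actual variables:

   /* Sketch only; assumes a 32-bit load_ubo that does not straddle a
    * vec4 boundary. Scale the byte offset down to vec4 units and load a
    * full vec4 from that slot.
    */
   nir_ssa_def *byte_offset = load->src[1].ssa;
   nir_ssa_def *vec4_offset = nir_ushr(b, byte_offset, nir_imm_int(b, 4));

   nir_intrinsic_instr *vec4_load =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_vec4);
   vec4_load->src[0] = nir_src_for_ssa(load->src[0].ssa);
   vec4_load->src[1] = nir_src_for_ssa(vec4_offset);
   vec4_load->num_components = 4;
   nir_ssa_dest_init(&vec4_load->instr, &vec4_load->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &vec4_load->instr);

   /* The start component is (byte_offset / 4) % 4. It isn't known
    * statically here, so extract the requested channels dynamically.
    */
   nir_ssa_def *start =
      nir_iand(b, nir_ushr(b, byte_offset, nir_imm_int(b, 2)),
               nir_imm_int(b, 3));
   nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS];
   for (unsigned i = 0; i < load->num_components; i++) {
      nir_ssa_def *idx = nir_iadd(b, nir_imm_int(b, i), start);
      chans[i] = nir_vector_extract(b, &vec4_load->dest.ssa, idx);
   }
   nir_ssa_def *result = nir_vec(b, chans, load->num_components);

   nir_ssa_def_rewrite_uses(&load->dest.ssa, nir_src_for_ssa(result));

When the offset is known to be vec4-aligned, the pass instead bakes the start
component into the COMPONENT index and no dynamic extracts are emitted; the
backend then places that component directly into ldc's cat6.d field, as the
ir3_compiler_nir.c hunk shows.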