freedreno/ir3: Replace our custom vec4 UBO intrinsic with the shared lowering.
author Eric Anholt <eric@anholt.net>
Tue, 18 Aug 2020 22:45:02 +0000 (15:45 -0700)
committer Eric Anholt <eric@anholt.net>
Mon, 24 Aug 2020 16:53:36 +0000 (09:53 -0700)
This leaves fewer comparisons in the shaders for us to optimize back
out, and it reduces backend code.

total instructions in shared programs: 11547270 -> 7219930 (-37.48%)
total full in shared programs: 334268 -> 319602 (-4.39%)
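
Schematically, the shared pass turns a byte-addressed load_ubo into
the vec4-addressed load_ubo_vec4 that ldc consumes directly. A rough
sketch (illustrative NIR only; the syntax and SSA numbering are made
up for this note, not taken from a real shader dump):

    /* before: src[1] is a byte offset */
    vec4 32 ssa_3 = intrinsic load_ubo (ssa_0, ssa_2) ()

    /* after nir_lower_ubo_vec4: src[1] is in vec4 (16-byte) units and
     * the starting channel moves into the COMPONENT index */
    vec1 32 ssa_4 = ushr ssa_2, 4
    vec4 32 ssa_5 = intrinsic load_ubo_vec4 (ssa_0, ssa_4) (component=0)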

Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6378>

src/compiler/nir/nir_intrinsics.py
src/freedreno/ir3/ir3_compiler_nir.c
src/freedreno/ir3/ir3_nir.c
src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
src/freedreno/ir3/ir3_nir_lower_io_offsets.c

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index ee02b86e5026777cf8be476a0b60e8cba46b667f..8d00446a2a24c18d70fe37403a8824b52d203008 100644
@@ -815,12 +815,6 @@ intrinsic("ssbo_atomic_xor_ir3",        src_comp=[1, 1, 1, 1],    dest_comp=1, i
 intrinsic("ssbo_atomic_exchange_ir3",   src_comp=[1, 1, 1, 1],    dest_comp=1, indices=[ACCESS])
 intrinsic("ssbo_atomic_comp_swap_ir3",  src_comp=[1, 1, 1, 1, 1], dest_comp=1, indices=[ACCESS])
 
-# IR3-specific instruction for UBO loads using the ldc instruction. The second
-# source is the indirect offset, in units of four dwords. The base is a
-# component offset, in dword units.
-intrinsic("load_ubo_ir3", src_comp=[1, 1], bit_sizes=[32], dest_comp=0, indices=[BASE],
-          flags=[CAN_REORDER, CAN_ELIMINATE])
-
 # System values for freedreno geometry shaders.
 system_value("vs_primitive_stride_ir3", 1)
 system_value("vs_vertex_stride_ir3", 1)
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 04a2dd9cea4f084a2536a7d6e284350c777036de..8172e113c1a141586b4cd030c786ae5fcb109882 100644
@@ -754,7 +754,7 @@ emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
        struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
        ldc->regs[0]->wrmask = MASK(ncomp);
        ldc->cat6.iim_val = ncomp;
-       ldc->cat6.d = nir_intrinsic_base(intr);
+       ldc->cat6.d = nir_intrinsic_component(intr);
        ldc->cat6.type = TYPE_U32;
 
        ir3_handle_bindless_cat6(ldc, intr->src[0]);
@@ -1647,7 +1647,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
        case nir_intrinsic_load_ubo:
                emit_intrinsic_load_ubo(ctx, intr, dst);
                break;
-       case nir_intrinsic_load_ubo_ir3:
+       case nir_intrinsic_load_ubo_vec4:
                emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
                break;
        case nir_intrinsic_load_frag_coord:
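
The base -> component switch is a rename of the same quantity: the
start channel within the addressed vec4. In dword terms (a sketch;
vec4_offset, component, and i are illustrative names for this note,
not variables in this file):

    /* Channel i of the ldc destination reads this dword of the block:
     * src[1] supplies vec4_offset, nir_intrinsic_component() supplies
     * component, and cat6.d encodes the latter. */
    uint32_t dword_index = 4 * vec4_offset + component + i;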
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 7b373dff2422a694b12f681bcd6ffc15172e87f4..bd73beefac9915c0ec6181c66024de9b00b6921c 100644
@@ -461,6 +461,9 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
        /* UBO offset lowering has to come after we've decided what will
         * be left as load_ubo
         */
+       if (so->shader->compiler->gpu_id >= 600)
+               OPT_V(s, nir_lower_ubo_vec4);
+
        OPT_V(s, ir3_nir_lower_io_offsets, so->shader->compiler->gpu_id);
 
        if (progress)
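
Assuming OPT_V is ir3's usual wrapper around NIR_PASS_V, the new call
is equivalent to the plain NIR form:

    /* Only a6xx (gpu_id >= 600) has ldc, so only it wants the vec4
     * addressing; earlier GPUs keep plain load_ubo. */
    if (so->shader->compiler->gpu_id >= 600)
            NIR_PASS_V(s, nir_lower_ubo_vec4);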
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index 900768edb036433a12d915f85e7367677b9374fb..8a9503feb747a7294869a47ad22f41b1d78e16a5 100644
@@ -327,8 +327,8 @@ instr_is_load_ubo(nir_instr *instr)
 
        nir_intrinsic_op op = nir_instr_as_intrinsic(instr)->intrinsic;
 
-       /* ir3_nir_lower_io_offsets happens after this pass. */
-       assert(op != nir_intrinsic_load_ubo_ir3);
+       /* nir_lower_ubo_vec4 happens after this pass. */
+       assert(op != nir_intrinsic_load_ubo_vec4);
 
        return op == nir_intrinsic_load_ubo;
 }
diff --git a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
index 36c48cf1299c8abf05cc2f17c5fd6fdad27fc1c2..110197d93b9bcc0a95d82969cccc2be3ce206281 100644
@@ -253,81 +253,6 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
        return true;
 }
 
-static bool
-lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b, int gpu_id)
-{
-       /* We only need to lower offset if using LDC, which takes an offset in
-        * vec4 units and has the start component baked into the instruction.
-        */
-       if (gpu_id < 600)
-               return false;
-
-       /* TODO handle other bitsizes, including non-dword-aligned loads */
-       assert(intrinsic->dest.ssa.bit_size == 32);
-
-       b->cursor = nir_before_instr(&intrinsic->instr);
-
-       nir_intrinsic_instr *new_intrinsic =
-               nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_ir3);
-
-       debug_assert(intrinsic->dest.is_ssa);
-       new_intrinsic->src[0] = intrinsic->src[0];
-
-       nir_ssa_def *offset = intrinsic->src[1].ssa;
-       nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -4);
-
-       if (!new_offset)
-               new_offset = nir_ushr(b, offset, nir_imm_int(b, 4));
-
-       new_intrinsic->src[1] = nir_src_for_ssa(new_offset);
-
-       unsigned align_mul = nir_intrinsic_align_mul(intrinsic);
-       unsigned align_offset = nir_intrinsic_align_offset(intrinsic);
-
-       unsigned components = intrinsic->num_components;
-
-       if (align_mul % 16 != 0)
-               components = 4;
-
-       new_intrinsic->num_components = components;
-
-       nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
-                                         components, 32, NULL);
-
-       nir_builder_instr_insert(b, &new_intrinsic->instr);
-
-       nir_ssa_def *new_dest;
-       if (align_mul % 16 == 0) {
-               /* We know that the low 4 bits of the offset are constant and equal to
-                * align_offset. Use the component offset.
-                */
-               unsigned component = align_offset / 4;
-               nir_intrinsic_set_base(new_intrinsic, component);
-               new_dest = &new_intrinsic->dest.ssa;
-       } else {
-               /* We have to assume it isn't aligned, and extract the components
-                * dynamically.
-                */
-               nir_intrinsic_set_base(new_intrinsic, 0);
-               nir_ssa_def *component =
-                       nir_iand(b, nir_ushr(b, offset, nir_imm_int(b, 2)), nir_imm_int(b, 3));
-               nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
-               for (unsigned i = 0; i < intrinsic->num_components; i++) {
-                       nir_ssa_def *idx = nir_iadd(b, nir_imm_int(b, i), component);
-                       channels[i] = nir_vector_extract(b, &new_intrinsic->dest.ssa, idx);
-               }
-
-               new_dest = nir_vec(b, channels, intrinsic->num_components);
-       }
-
-       nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa,
-                                                        nir_src_for_ssa(new_dest));
-
-       nir_instr_remove(&intrinsic->instr);
-
-       return true;
-}
-
 static bool
 lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_id)
 {
@@ -339,12 +264,6 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_
 
                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
-               /* UBO */
-               if (intr->intrinsic == nir_intrinsic_load_ubo) {
-                       progress |= lower_offset_for_ubo(intr, b, gpu_id);
-                       continue;
-               }
-
                /* SSBO */
                int ir3_intrinsic;
                uint8_t offset_src_idx;
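
The deleted lower_offset_for_ubo() handled misalignment by hand: when
align_mul was a multiple of 16 it baked the start channel into the
instruction, and otherwise it extracted channels dynamically. That
responsibility moves into the shared nir_lower_ubo_vec4 pass. A
schematic of the dynamic-extract fallback, mirroring the removed code
above rather than quoting the shared pass (byte_offset, load, and
num_components are illustrative names):

    /* Start channel = (byte_offset >> 2) & 3, i.e. dword index mod 4. */
    nir_ssa_def *chan = nir_iand(b, nir_ushr(b, byte_offset, nir_imm_int(b, 2)),
                                 nir_imm_int(b, 3));
    nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
    for (unsigned i = 0; i < num_components; i++) {
            /* Pick channel (i + chan) out of the vec4 load result. */
            channels[i] = nir_vector_extract(b, &load->dest.ssa,
                                             nir_iadd(b, nir_imm_int(b, i), chan));
    }
    nir_ssa_def *result = nir_vec(b, channels, num_components);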