ir3: Fix LDC offset units
author Connor Abbott <cwabbott0@gmail.com>
Wed, 15 Apr 2020 12:18:03 +0000 (14:18 +0200)
committer Marge Bot <eric+marge@anholt.net>
Wed, 15 Apr 2020 22:38:20 +0000 (22:38 +0000)
I had missed that LDC actually uses vec4 units for its offset. This
means that we have to create a new instruction, and lower it in
ir3_nir_lower_io_offsets, similar to the existing SSBO instructions.
Unfortunately, we can't assume that loads are always vec4-aligned, so we
have to use the alignment information that NIR gives us. That information
is currently woefully inadequate, and will have to be improved before we
can get good codegen.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4568>

src/compiler/nir/nir_intrinsics.py
src/freedreno/ir3/disasm-a3xx.c
src/freedreno/ir3/ir3.c
src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_compiler_nir.c
src/freedreno/ir3/ir3_nir_lower_io_offsets.c
src/freedreno/ir3/ir3_print.c

index ac8d39a2b414439ccb4cf58ac9ae8841136af213..b2cb0371efce153c45a8a874f1f79652535725dc 100644 (file)
@@ -802,6 +802,12 @@ intrinsic("ssbo_atomic_xor_ir3",        src_comp=[1, 1, 1, 1],    dest_comp=1)
 intrinsic("ssbo_atomic_exchange_ir3",   src_comp=[1, 1, 1, 1],    dest_comp=1)
 intrinsic("ssbo_atomic_comp_swap_ir3",  src_comp=[1, 1, 1, 1, 1], dest_comp=1)
 
+# IR3-specific instruction for UBO loads using the ldc instruction. The second
+# source is the indirect offset, in units of four dwords. The base is a
+# component offset, in dword units.
+intrinsic("load_ubo_ir3", src_comp=[1, 1], bit_sizes=[32], dest_comp=0, indices=[BASE],
+          flags=[CAN_REORDER, CAN_ELIMINATE])
+
 # System values for freedreno geometry shaders.
 system_value("vs_primitive_stride_ir3", 1)
 system_value("vs_vertex_stride_ir3", 1)
index 674b4d57475b3ba60b50937223f99b8e7ad3f1c3..23c3bc76ab2e6dc527d8cd5b6dae24a4b9ec4297 100644 (file)
@@ -940,6 +940,8 @@ static void print_instr_cat6_a6xx(struct disasm_ctx *ctx, instr_t *instr)
                fprintf(ctx->out, ".%s", cat6->typed ? "typed" : "untyped");
                fprintf(ctx->out, ".%dd", cat6->d + 1);
                fprintf(ctx->out, ".%s", type[cat6->type]);
+       } else {
+               fprintf(ctx->out, ".offset%d", cat6->d);
        }
        fprintf(ctx->out, ".%u", cat6->type_size + 1);
 
index dcd5a5082a6e1d46f18a4b969dd1e8aa47d7427d..9678389e8b556cdf7d2e513bbe2d0d32ecb17419 100644 (file)
@@ -561,7 +561,7 @@ static int emit_cat6_a6xx(struct ir3_instruction *instr, void *ptr,
        }
 
        cat6->type      = instr->cat6.type;
-       cat6->d         = instr->cat6.d - 1;
+       cat6->d         = instr->cat6.d - (instr->opc == OPC_LDC ? 0 : 1);
        cat6->typed     = instr->cat6.typed;
        cat6->type_size = instr->cat6.iim_val - 1;
        cat6->opc       = instr->opc;
index 54d740e75dbff8bfb055afb9c21beb2db2b39d8c..351490aecf72f51c3e072279060c35624bbe8ee2 100644 (file)
@@ -267,7 +267,7 @@ struct ir3_instruction {
                        int src_offset;
                        int dst_offset;
                        int iim_val : 3;      /* for ldgb/stgb, # of components */
-                       unsigned d : 3;
+                       unsigned d : 3;       /* for ldc, component offset */
                        bool typed : 1;
                        unsigned base : 3;
                } cat6;
index 8c676b2f3ce03859ea408ba3bc27281bcf89c18c..d9152416e4d05d50f1c77d75a7854a0a8ae8c39c 100644 (file)
@@ -748,8 +748,8 @@ emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
        struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
        struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
        ldc->regs[0]->wrmask = MASK(ncomp);
-       ldc->cat6.iim_val = intr->num_components;
-       ldc->cat6.d = 1;
+       ldc->cat6.iim_val = ncomp;
+       ldc->cat6.d = nir_intrinsic_base(intr);
        ldc->cat6.type = TYPE_U32;
 
        nir_intrinsic_instr *bindless = ir3_bindless_resource(intr->src[0]);
@@ -768,13 +768,6 @@ static void
 emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                struct ir3_instruction **dst)
 {
-       if (ir3_bindless_resource(intr->src[0])) {
-               /* TODO: We should be using ldc for non-bindless things on a6xx as
-                * well.
-                */
-               emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
-               return;
-       }
        struct ir3_block *b = ctx->block;
        struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
        /* UBO addresses are the first driver params, but subtract 2 here to
@@ -1612,6 +1605,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
        case nir_intrinsic_load_ubo:
                emit_intrinsic_load_ubo(ctx, intr, dst);
                break;
+       case nir_intrinsic_load_ubo_ir3:
+               emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
+               break;
        case nir_intrinsic_load_frag_coord:
                ir3_split_dest(b, dst, get_frag_coord(ctx), 0, 4);
                break;
index ba40a9f4194a726416d93838bf925ba97c3547e8..8e80c40eeb884773815fd9f2b28b06dfd5fd9c54 100644 (file)
@@ -250,6 +250,84 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
        return true;
 }
 
+static bool
+lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b)
+{
+       /* We only need to lower offset if using LDC. Currently, we only use LDC
+        * in the bindless mode. Also, LDC is introduced on A6xx, but currently we
+        * only use bindless in turnip which is A6xx only.
+        *
+        * TODO: We should be using LDC always on A6xx+.
+        */
+       if (!ir3_bindless_resource(intrinsic->src[0]))
+               return false;
+
+       /* TODO handle other bitsizes, including non-dword-aligned loads */
+       assert(intrinsic->dest.ssa.bit_size == 32);
+
+       b->cursor = nir_before_instr(&intrinsic->instr);
+
+       nir_intrinsic_instr *new_intrinsic =
+               nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_ir3);
+
+       debug_assert(intrinsic->dest.is_ssa);
+       new_intrinsic->src[0] = nir_src_for_ssa(intrinsic->src[0].ssa);
+
+       nir_ssa_def *offset = intrinsic->src[1].ssa;
+       nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -4);
+
+       if (!new_offset)
+               new_offset = nir_ushr(b, offset, nir_imm_int(b, 4));
+
+       new_intrinsic->src[1] = nir_src_for_ssa(new_offset);
+
+       unsigned align_mul = nir_intrinsic_align_mul(intrinsic);
+       unsigned align_offset = nir_intrinsic_align_offset(intrinsic);
+
+       unsigned components = intrinsic->num_components;
+
+       if (align_mul % 16 != 0)
+               components = 4;
+
+       new_intrinsic->num_components = components;
+
+       nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
+                                         components, 32, NULL);
+
+       nir_builder_instr_insert(b, &new_intrinsic->instr);
+
+       nir_ssa_def *new_dest;
+       if (align_mul % 16 == 0) {
+               /* We know that the low 4 bits of the offset are constant and equal to
+                * align_offset. Use align_offset / 4 as the static component offset.
+                */
+               unsigned component = align_offset / 4;
+               nir_intrinsic_set_base(new_intrinsic, component);
+               new_dest = &new_intrinsic->dest.ssa;
+       } else {
+               /* We have to assume it isn't aligned, load a full vec4 and extract the
+                * components dynamically. NOTE(review): i + component may exceed 3.
+                */
+               nir_intrinsic_set_base(new_intrinsic, 0);
+               nir_ssa_def *component =
+                       nir_iand(b, nir_ushr(b, offset, nir_imm_int(b, 2)), nir_imm_int(b, 3));
+               nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
+               for (unsigned i = 0; i < intrinsic->num_components; i++) {
+                       nir_ssa_def *idx = nir_iadd(b, nir_imm_int(b, i), component);
+                       channels[i] = nir_vector_extract(b, &new_intrinsic->dest.ssa, idx);
+               }
+
+               new_dest = nir_vec(b, channels, intrinsic->num_components);
+       }
+
+       nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa,
+                                                        nir_src_for_ssa(new_dest));
+
+       nir_instr_remove(&intrinsic->instr);
+
+       return true;
+}
+
 static bool
 lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx)
 {
@@ -261,6 +339,12 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx)
 
                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
+               /* UBO */
+               if (intr->intrinsic == nir_intrinsic_load_ubo) {
+                       progress |= lower_offset_for_ubo(intr, b);
+                       continue;
+               }
+
                /* SSBO */
                int ir3_intrinsic;
                uint8_t offset_src_idx;
index 753a9919ca0fb7786861ef346301b9dea219bb59..1b6908edefbb65012c8140388adc7992ce0e52c8 100644 (file)
@@ -127,6 +127,8 @@ static void print_instr_name(struct ir3_instruction *instr, bool flags)
                        printf(".s");
                if (instr->flags & IR3_INSTR_A1EN)
                        printf(".a1en");
+               if (instr->opc == OPC_LDC)
+                       printf(".offset%d", instr->cat6.d);
                if (instr->flags & IR3_INSTR_B) {
                        printf(".base%d",
                                   is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base);