freedreno/ir3: use lower_wrmasks pass
author     Rob Clark <robdclark@chromium.org>
Wed, 6 May 2020 21:58:28 +0000 (14:58 -0700)
committer  Rob Clark <robdclark@chromium.org>
Thu, 14 May 2020 03:24:53 +0000 (20:24 -0700)
Signed-off-by: Rob Clark <robdclark@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
src/freedreno/ir3/ir3_a4xx.c
src/freedreno/ir3/ir3_a6xx.c
src/freedreno/ir3/ir3_compiler_nir.c
src/freedreno/ir3/ir3_nir.c

index 594fb9cd021d0d9b6df15adc6746e9dd3b54d676..e460cd0b629458276384f1cdb7d59ab6e48257b7 100644 (file)
@@ -73,13 +73,11 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
        struct ir3_block *b = ctx->block;
        struct ir3_instruction *stgb, *src0, *src1, *src2, *byte_offset, *offset;
-       /* TODO handle wrmask properly, see _store_shared().. but I think
-        * it is more a PITA than that, since blob ends up loading the
-        * masked components and writing them back out.
-        */
        unsigned wrmask = nir_intrinsic_write_mask(intr);
        unsigned ncomp = ffs(~wrmask) - 1;
 
+       assert(wrmask == BITFIELD_MASK(intr->num_components));
+
        /* can this be non-const buffer_index?  how do we handle that? */
        int ibo_idx = ir3_ssbo_to_ibo(ctx->so->shader, nir_src_as_uint(intr->src[1]));
 
index d4cb74c39bd48be05661ebad30cd643eda8648b2..e297e34fdf526f771d2ea861e92abe96a94702c9 100644 (file)
@@ -103,13 +103,11 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
        struct ir3_block *b = ctx->block;
        struct ir3_instruction *stib, *val, *offset;
-       /* TODO handle wrmask properly, see _store_shared().. but I think
-        * it is more a PITA than that, since blob ends up loading the
-        * masked components and writing them back out.
-        */
        unsigned wrmask = nir_intrinsic_write_mask(intr);
        unsigned ncomp = ffs(~wrmask) - 1;
 
+       assert(wrmask == BITFIELD_MASK(intr->num_components));
+
        /* src0 is offset, src1 is value:
         */
        val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
index 13e180118c43f67727e1fbe2b133fb81a1a97c63..9e1105bce08ee41efc1e24cb5db202b0ae0bc343 100644 (file)
@@ -878,40 +878,26 @@ emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
        struct ir3_block *b = ctx->block;
        struct ir3_instruction *stl, *offset;
        struct ir3_instruction * const *value;
-       unsigned base, wrmask;
+       unsigned base, wrmask, ncomp;
 
        value  = ir3_get_src(ctx, &intr->src[0]);
        offset = ir3_get_src(ctx, &intr->src[1])[0];
 
        base   = nir_intrinsic_base(intr);
        wrmask = nir_intrinsic_write_mask(intr);
+       ncomp  = ffs(~wrmask) - 1;
 
-       /* Combine groups of consecutive enabled channels in one write
-        * message. We use ffs to find the first enabled channel and then ffs on
-        * the bit-inverse, down-shifted writemask to determine the length of
-        * the block of enabled bits.
-        *
-        * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
-        */
-       while (wrmask) {
-               unsigned first_component = ffs(wrmask) - 1;
-               unsigned length = ffs(~(wrmask >> first_component)) - 1;
-
-               stl = ir3_STL(b, offset, 0,
-                       ir3_create_collect(ctx, &value[first_component], length), 0,
-                       create_immed(b, length), 0);
-               stl->cat6.dst_offset = first_component + base;
-               stl->cat6.type = utype_src(intr->src[0]);
-               stl->barrier_class = IR3_BARRIER_SHARED_W;
-               stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
-
-               array_insert(b, b->keeps, stl);
-
-               /* Clear the bits in the writemask that we just wrote, then try
-                * again to see if more channels are left.
-                */
-               wrmask &= (15 << (first_component + length));
-       }
+       assert(wrmask == BITFIELD_MASK(intr->num_components));
+
+       stl = ir3_STL(b, offset, 0,
+               ir3_create_collect(ctx, value, ncomp), 0,
+               create_immed(b, ncomp), 0);
+       stl->cat6.dst_offset = base;
+       stl->cat6.type = utype_src(intr->src[0]);
+       stl->barrier_class = IR3_BARRIER_SHARED_W;
+       stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+
+       array_insert(b, b->keeps, stl);
 }
 
 /* src[] = { offset }. const_index[] = { base } */
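
Aside, not part of the patch: the per-backend loop removed above implemented the i965-style run-splitting by hand. A minimal standalone sketch of that arithmetic (which nir_lower_wrmasks now performs once, at the NIR level, by splitting the store intrinsic itself) looks like this:

	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	int
	main(void)
	{
		unsigned wrmask = 0xd;	/* 0b1101: components 0, 2 and 3 enabled */

		while (wrmask) {
			/* lowest enabled channel, then length of the contiguous run */
			unsigned first  = ffs(wrmask) - 1;
			unsigned length = ffs(~(wrmask >> first)) - 1;

			printf("store components [%u..%u]\n", first, first + length - 1);

			/* clear the run we just handled and look for another one */
			wrmask &= ~(((1u << length) - 1) << first);
		}

		return 0;	/* prints [0..0] and [2..3] */
	}

With the lowering pass in place, each backend only ever sees a single contiguous run, so the loop collapses to the plain STL/STGB/STIB emit shown above.
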
index b3f784a557e44a450186b179d2584d95e0a3b150..48dc9a340abf10332552cd719c62de048e335543 100644 (file)
@@ -210,6 +210,21 @@ ir3_optimize_loop(nir_shader *s)
        } while (progress);
 }
 
+static bool
+should_split_wrmask(const nir_instr *instr, const void *data)
+{
+       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+       switch (intr->intrinsic) {
+       case nir_intrinsic_store_ssbo:
+       case nir_intrinsic_store_shared:
+       case nir_intrinsic_store_global:
+               return true;
+       default:
+               return false;
+       }
+}
+
 void
 ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
                const struct ir3_shader_key *key)
@@ -274,6 +289,7 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
        }
 
        OPT_V(s, nir_lower_regs_to_ssa);
+       OPT_V(s, nir_lower_wrmasks, should_split_wrmask, s);
 
        if (key) {
                if (s->info.stage == MESA_SHADER_VERTEX) {
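
For reference, the invariant that the asserts added above rely on, once every store reaching the backend has a contiguous write mask, can be checked in isolation. This is a standalone sketch only; BITFIELD_MASK is redefined locally and simplified relative to Mesa's util macro, which also special-cases the 32-bit width:

	#include <assert.h>
	#include <strings.h>	/* ffs() */

	/* simplified stand-in for Mesa's BITFIELD_MASK() */
	#define BITFIELD_MASK(b)  ((1u << (b)) - 1)

	int
	main(void)
	{
		/* every contiguous-from-bit-0 mask a lowered store can carry */
		for (unsigned num_components = 1; num_components <= 4; num_components++) {
			unsigned wrmask = BITFIELD_MASK(num_components);
			unsigned ncomp  = ffs(~wrmask) - 1;

			/* exactly the condition asserted in the store_ssbo/store_shared
			 * emit paths above */
			assert(wrmask == BITFIELD_MASK(ncomp));
			assert(ncomp == num_components);
		}

		return 0;
	}
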