freedreno/ir3: Drop wrmask for ir3 local and global store intrinsics
authorKristian H. Kristensen <hoegsberg@google.com>
Wed, 13 May 2020 20:19:57 +0000 (13:19 -0700)
committerRob Clark <robdclark@chromium.org>
Thu, 14 May 2020 03:24:33 +0000 (20:24 -0700)
These intrinsics are supposed to map to the underlying hardware
instructions, which don't have wrmask. We use them when we lower
store_output in the geometry pipeline and since store_output gets
lowered to temps, we always see full wrmasks there.

src/compiler/nir/nir_intrinsics.py
src/freedreno/ir3/ir3_compiler_nir.c
src/freedreno/ir3/ir3_nir_lower_tess.c

index 611955ffa0270b60c0038ffca2834f58635cc0b8..00098203d2ef63d78511fa37e0d6737a30fd75aa 100644 (file)
@@ -836,7 +836,7 @@ intrinsic("end_patch_ir3")
 # between geometry stages - perhaps it's explicit access to the vertex cache.
 
 # src[] = { value, offset }.
-store("shared_ir3", 2, [BASE, WRMASK, ALIGN_MUL, ALIGN_OFFSET])
+store("shared_ir3", 2, [BASE, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { offset }.
 load("shared_ir3", 1, [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 
@@ -846,7 +846,7 @@ load("shared_ir3", 1, [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 
 # src[] = { value, address(vec2 of hi+lo uint32_t), offset }.
-# const_index[] = { write_mask, align_mul, align_offset }
+# const_index[] = { access, align_mul, align_offset }
-intrinsic("store_global_ir3", [0, 2, 1], indices=[WRMASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
+intrinsic("store_global_ir3", [0, 2, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { address(vec2 of hi+lo uint32_t), offset }.
 # const_index[] = { access, align_mul, align_offset }
 intrinsic("load_global_ir3", [2, 1], dest_comp=0, indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
index 76f2f7525bf3e2df9a179b78e1730877e6bfe5a0..13e180118c43f67727e1fbe2b133fb81a1a97c63 100644 (file)
@@ -939,48 +939,27 @@ emit_intrinsic_load_shared_ir3(struct ir3_context *ctx, nir_intrinsic_instr *int
        ir3_split_dest(b, dst, load, 0, intr->num_components);
 }
 
-/* src[] = { value, offset }. const_index[] = { base, write_mask } */
+/* src[] = { value, offset }. const_index[] = { base } */
 static void
 emit_intrinsic_store_shared_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
        struct ir3_block *b = ctx->block;
        struct ir3_instruction *store, *offset;
        struct ir3_instruction * const *value;
-       unsigned base, wrmask;
 
        value  = ir3_get_src(ctx, &intr->src[0]);
        offset = ir3_get_src(ctx, &intr->src[1])[0];
 
-       base   = nir_intrinsic_base(intr);
-       wrmask = nir_intrinsic_write_mask(intr);
-
-       /* Combine groups of consecutive enabled channels in one write
-        * message. We use ffs to find the first enabled channel and then ffs on
-        * the bit-inverse, down-shifted writemask to determine the length of
-        * the block of enabled bits.
-        *
-        * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
-        */
-       while (wrmask) {
-               unsigned first_component = ffs(wrmask) - 1;
-               unsigned length = ffs(~(wrmask >> first_component)) - 1;
-
-               store = ir3_STLW(b, offset, 0,
-                       ir3_create_collect(ctx, &value[first_component], length), 0,
-                       create_immed(b, length), 0);
-
-               store->cat6.dst_offset = first_component + base;
-               store->cat6.type = utype_src(intr->src[0]);
-               store->barrier_class = IR3_BARRIER_SHARED_W;
-               store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+       store = ir3_STLW(b, offset, 0,
+               ir3_create_collect(ctx, value, intr->num_components), 0,
+               create_immed(b, intr->num_components), 0);
 
-               array_insert(b, b->keeps, store);
+       store->cat6.dst_offset = nir_intrinsic_base(intr);
+       store->cat6.type = utype_src(intr->src[0]);
+       store->barrier_class = IR3_BARRIER_SHARED_W;
+       store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
 
-               /* Clear the bits in the writemask that we just wrote, then try
-                * again to see if more channels are left.
-                */
-               wrmask &= (15 << (first_component + length));
-       }
+       array_insert(b, b->keeps, store);
 }
 
 /*
index 4d8798c285fd436104cf2dbf1f6bf4c6857f83c1..9f4985bc34f4636e3ff99e0536b9ea0d0ccf35bc 100644 (file)
@@ -191,6 +191,13 @@ lower_block_to_explicit_output(nir_block *block, nir_builder *b, struct state *s
                case nir_intrinsic_store_output: {
                        // src[] = { value, offset }.
 
+                       /* nir_lower_io_to_temporaries replaces all access to output
+                        * variables with temp variables and then emits a nir_copy_var at
+                        * the end of the shader.  Thus, we should always get a full wrmask
+                        * here.
+                        */
+                       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
                        b->cursor = nir_instr_remove(&intr->instr);
 
                        nir_ssa_def *vertex_id = build_vertex_id(b, state);
@@ -199,10 +206,8 @@ lower_block_to_explicit_output(nir_block *block, nir_builder *b, struct state *s
                        nir_intrinsic_instr *store =
                                nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_shared_ir3);
 
-                       nir_intrinsic_set_write_mask(store, MASK(intr->num_components));
                        store->src[0] = nir_src_for_ssa(intr->src[0].ssa);
                        store->src[1] = nir_src_for_ssa(offset);
-
                        store->num_components = intr->num_components;
 
                        nir_builder_instr_insert(b, &store->instr);
@@ -431,17 +436,21 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
                        b->cursor = nir_before_instr(&intr->instr);
 
+                       /* nir_lower_io_to_temporaries replaces all access to output
+                        * variables with temp variables and then emits a nir_copy_var at
+                        * the end of the shader.  Thus, we should always get a full wrmask
+                        * here.
+                        */
+                       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
                        nir_ssa_def *value = intr->src[0].ssa;
                        nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
                        nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
                        nir_ssa_def *offset = build_per_vertex_offset(b, state,
                                        intr->src[1].ssa, intr->src[2].ssa, var);
 
-                       nir_intrinsic_instr *store =
-                               replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address,
-                                                                 nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr))));
-
-                       nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intr));
+                       replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address,
+                                       nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr))));
 
                        break;
                }
@@ -503,11 +512,15 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
                                debug_assert(nir_intrinsic_component(intr) == 0);
 
-                               nir_intrinsic_instr *store =
-                                       replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
-                                                       intr->src[0].ssa, address, offset);
+                               /* nir_lower_io_to_temporaries replaces all access to output
+                                * variables with temp variables and then emits a nir_copy_var at
+                                * the end of the shader.  Thus, we should always get a full wrmask
+                                * here.
+                                */
+                               assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
 
-                               nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intr));
+                               replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
+                                               intr->src[0].ssa, address, offset);
                        }
                        break;
                }
@@ -559,7 +572,6 @@ emit_tess_epilouge(nir_builder *b, struct state *state)
        store->src[2] = nir_src_for_ssa(offset);
        nir_builder_instr_insert(b, &store->instr);
        store->num_components = levels[0]->num_components;
-       nir_intrinsic_set_write_mask(store, (1 << levels[0]->num_components) - 1);
 
        if (levels[1]) {
                store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
@@ -570,7 +582,6 @@ emit_tess_epilouge(nir_builder *b, struct state *state)
                store->src[2] = nir_src_for_ssa(offset);
                nir_builder_instr_insert(b, &store->instr);
                store->num_components = levels[1]->num_components;
-               nir_intrinsic_set_write_mask(store, (1 << levels[1]->num_components) - 1);
        }
 
        /* Finally, Insert endpatch instruction: