From 14969aab11effa1500f114314c9b8879821b8b24 Mon Sep 17 00:00:00 2001
From: "Kristian H. Kristensen" <hoegsberg@google.com>
Date: Wed, 13 May 2020 13:19:57 -0700
Subject: [PATCH] freedreno/ir3: Drop wrmask for ir3 local and global store
 intrinsics

These intrinsics are supposed to map directly to the underlying hardware
instructions, which have no write mask (wrmask). We use them when we
lower store_output in the geometry pipeline, and since output variables
are lowered to temporaries first (nir_lower_io_to_temporaries), the
store_output intrinsics we see there always have a full wrmask.
---
 src/compiler/nir/nir_intrinsics.py     |  4 +--
 src/freedreno/ir3/ir3_compiler_nir.c   | 39 ++++++--------------------
 src/freedreno/ir3/ir3_nir_lower_tess.c | 37 +++++++++++++++---------
 3 files changed, 35 insertions(+), 45 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 611955ffa02..00098203d2e 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -836,7 +836,7 @@ intrinsic("end_patch_ir3")
 # between geometry stages - perhaps it's explicit access to the vertex cache.
 
 # src[] = { value, offset }.
-store("shared_ir3", 2, [BASE, WRMASK, ALIGN_MUL, ALIGN_OFFSET])
+store("shared_ir3", 2, [BASE, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { offset }.
 load("shared_ir3", 1, [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 
@@ -846,7 +846,7 @@ load("shared_ir3", 1, [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 
 # src[] = { value, address(vec2 of hi+lo uint32_t), offset }.
-# const_index[] = { write_mask, align_mul, align_offset }
+# const_index[] = { access, align_mul, align_offset }
-intrinsic("store_global_ir3", [0, 2, 1], indices=[WRMASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
+intrinsic("store_global_ir3", [0, 2, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { address(vec2 of hi+lo uint32_t), offset }.
 # const_index[] = { access, align_mul, align_offset }
 intrinsic("load_global_ir3", [2, 1], dest_comp=0, indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 76f2f7525bf..13e180118c4 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -939,48 +939,27 @@ emit_intrinsic_load_shared_ir3(struct ir3_context *ctx, nir_intrinsic_instr *int
 	ir3_split_dest(b, dst, load, 0, intr->num_components);
 }
 
-/* src[] = { value, offset }. const_index[] = { base, write_mask } */
+/* src[] = { value, offset }. const_index[] = { base } */
 static void
 emit_intrinsic_store_shared_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
 	struct ir3_block *b = ctx->block;
 	struct ir3_instruction *store, *offset;
 	struct ir3_instruction * const *value;
-	unsigned base, wrmask;
 
 	value  = ir3_get_src(ctx, &intr->src[0]);
 	offset = ir3_get_src(ctx, &intr->src[1])[0];
 
-	base   = nir_intrinsic_base(intr);
-	wrmask = nir_intrinsic_write_mask(intr);
-
-	/* Combine groups of consecutive enabled channels in one write
-	 * message. We use ffs to find the first enabled channel and then ffs on
-	 * the bit-inverse, down-shifted writemask to determine the length of
-	 * the block of enabled bits.
-	 *
-	 * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
-	 */
-	while (wrmask) {
-		unsigned first_component = ffs(wrmask) - 1;
-		unsigned length = ffs(~(wrmask >> first_component)) - 1;
-
-		store = ir3_STLW(b, offset, 0,
-			ir3_create_collect(ctx, &value[first_component], length), 0,
-			create_immed(b, length), 0);
-
-		store->cat6.dst_offset = first_component + base;
-		store->cat6.type = utype_src(intr->src[0]);
-		store->barrier_class = IR3_BARRIER_SHARED_W;
-		store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+	store = ir3_STLW(b, offset, 0,
+		ir3_create_collect(ctx, value, intr->num_components), 0,
+		create_immed(b, intr->num_components), 0);
 
-		array_insert(b, b->keeps, store);
+	store->cat6.dst_offset = nir_intrinsic_base(intr);
+	store->cat6.type = utype_src(intr->src[0]);
+	store->barrier_class = IR3_BARRIER_SHARED_W;
+	store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
 
-		/* Clear the bits in the writemask that we just wrote, then try
-		 * again to see if more channels are left.
-		 */
-		wrmask &= (15 << (first_component + length));
-	}
+	array_insert(b, b->keeps, store);
 }
 
 /*
diff --git a/src/freedreno/ir3/ir3_nir_lower_tess.c b/src/freedreno/ir3/ir3_nir_lower_tess.c
index 4d8798c285f..9f4985bc34f 100644
--- a/src/freedreno/ir3/ir3_nir_lower_tess.c
+++ b/src/freedreno/ir3/ir3_nir_lower_tess.c
@@ -191,6 +191,13 @@ lower_block_to_explicit_output(nir_block *block, nir_builder *b, struct state *s
 		case nir_intrinsic_store_output: {
 			// src[] = { value, offset }.
 
+			/* nir_lower_io_to_temporaries replaces all access to output
+			 * variables with temp variables and then emits a nir_copy_var at
+			 * the end of the shader.  Thus, we should always get a full wrmask
+			 * here.
+			 */
+			assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
 			b->cursor = nir_instr_remove(&intr->instr);
 
 			nir_ssa_def *vertex_id = build_vertex_id(b, state);
@@ -199,10 +206,8 @@ lower_block_to_explicit_output(nir_block *block, nir_builder *b, struct state *s
 			nir_intrinsic_instr *store =
 				nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_shared_ir3);
 
-			nir_intrinsic_set_write_mask(store, MASK(intr->num_components));
 			store->src[0] = nir_src_for_ssa(intr->src[0].ssa);
 			store->src[1] = nir_src_for_ssa(offset);
-
 			store->num_components = intr->num_components;
 
 			nir_builder_instr_insert(b, &store->instr);
@@ -431,17 +436,21 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
 			b->cursor = nir_before_instr(&intr->instr);
 
+			/* nir_lower_io_to_temporaries replaces all access to output
+			 * variables with temp variables and then emits a nir_copy_var at
+			 * the end of the shader.  Thus, we should always get a full wrmask
+			 * here.
+			 */
+			assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
 			nir_ssa_def *value = intr->src[0].ssa;
 			nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
 			nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
 			nir_ssa_def *offset = build_per_vertex_offset(b, state,
 					intr->src[1].ssa, intr->src[2].ssa, var);
 
-			nir_intrinsic_instr *store =
-				replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address,
-								  nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr))));
-
-			nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intr));
+			replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address,
+					nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr))));
 
 			break;
 		}
@@ -503,11 +512,15 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
 				debug_assert(nir_intrinsic_component(intr) == 0);
 
-				nir_intrinsic_instr *store =
-					replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
-							intr->src[0].ssa, address, offset);
+				/* nir_lower_io_to_temporaries replaces all access to output
+				 * variables with temp variables and then emits a nir_copy_var at
+				 * the end of the shader.  Thus, we should always get a full wrmask
+				 * here.
+				 */
+				assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
 
-				nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intr));
+				replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
+						intr->src[0].ssa, address, offset);
 			}
 			break;
 		}
@@ -559,7 +572,6 @@ emit_tess_epilouge(nir_builder *b, struct state *state)
 	store->src[2] = nir_src_for_ssa(offset);
 	nir_builder_instr_insert(b, &store->instr);
 	store->num_components = levels[0]->num_components;
-	nir_intrinsic_set_write_mask(store, (1 << levels[0]->num_components) - 1);
 
 	if (levels[1]) {
 		store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
@@ -570,7 +582,6 @@ emit_tess_epilouge(nir_builder *b, struct state *state)
 		store->src[2] = nir_src_for_ssa(offset);
 		nir_builder_instr_insert(b, &store->instr);
 		store->num_components = levels[1]->num_components;
-		nir_intrinsic_set_write_mask(store, (1 << levels[1]->num_components) - 1);
 	}
 
 	/* Finally, Insert endpatch instruction:
-- 
2.30.2