diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c
index 1aab7396c3e..2d1ed21d9e3 100644
--- a/src/freedreno/ir3/ir3_context.c
+++ b/src/freedreno/ir3/ir3_context.c
@@ -24,8 +24,6 @@
  * Rob Clark
  */
 
-#include "util/u_math.h"
-
 #include "ir3_compiler.h"
 #include "ir3_context.h"
 #include "ir3_image.h"
@@ -65,6 +63,8 @@ ir3_context_init(struct ir3_compiler *compiler,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 	ctx->block_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
+	ctx->sel_cond_conversions = _mesa_hash_table_create(ctx,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
 
 	/* TODO: maybe generate some sort of bitmask of what key
 	 * lowers vs what shader has (ie. no need to lower
@@ -73,97 +73,90 @@ ir3_context_init(struct ir3_compiler *compiler,
 	 * creating duplicate variants..
 	 */
 
-	if (ir3_key_lowers_nir(&so->key)) {
-		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
-		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
-	} else {
-		/* fast-path for shader key that lowers nothing in NIR: */
-		ctx->s = nir_shader_clone(ctx, so->shader->nir);
-	}
+	ctx->s = nir_shader_clone(ctx, so->shader->nir);
+	ir3_nir_lower_variant(so, ctx->s);
 
 	/* this needs to be the last pass run, so do this here instead of
 	 * in ir3_optimize_nir():
 	 */
-	NIR_PASS_V(ctx->s, nir_lower_bool_to_int32);
-	NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
-	NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
-
-	if (ir3_shader_debug & IR3_DBG_DISASM) {
-		DBG("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}",
-			so->shader->id, so->id, so->type,
-			so->key.color_two_side, so->key.half_precision);
-		nir_print_shader(ctx->s, stdout);
+	bool progress = false;
+	NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs);
+
+	/* we may need cleanup after nir_lower_locals_to_regs: */
+	while (progress) {
+		progress = false;
+		NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+		NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
 	}
 
-	if (shader_debug_enabled(so->type)) {
-		fprintf(stderr, "NIR (final form) for %s shader:\n",
-			_mesa_shader_stage_to_string(so->type));
-		nir_print_shader(ctx->s, stderr);
+	/* We want to lower nir_op_imul as late as possible, to also catch
+	 * those generated by earlier passes (e.g., nir_lower_locals_to_regs).
+	 * However, we want a final swing of a few passes afterwards to have
+	 * a chance at optimizing the result.
+	 */
+	progress = false;
+	NIR_PASS(progress, ctx->s, ir3_nir_lower_imul);
+	while (progress) {
+		progress = false;
+		NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+		NIR_PASS(progress, ctx->s, nir_opt_copy_prop_vars);
+		NIR_PASS(progress, ctx->s, nir_opt_dead_write_vars);
+		NIR_PASS(progress, ctx->s, nir_opt_dce);
+		NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
 	}
 
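Both cleanup loops above follow the same fixed-point idiom: keep re-running the passes until none of them reports progress. A minimal sketch of that idiom in isolation, using the same NIR passes as the hunk (the helper name itself is hypothetical):

	static void
	run_cleanup_until_stable(nir_shader *s)
	{
		bool progress;
		do {
			/* each NIR_PASS() ORs into 'progress' if the pass
			 * changed anything:
			 */
			progress = false;
			NIR_PASS(progress, s, nir_opt_algebraic);
			NIR_PASS(progress, s, nir_opt_constant_folding);
		} while (progress);
	}
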
-	ir3_nir_scan_driver_consts(ctx->s, &so->const_layout);
-
-	so->num_uniforms = ctx->s->num_uniforms;
-	so->num_ubos = ctx->s->info.num_ubos;
+	/* Enable the texture pre-fetch feature only on a4xx and later.  But
+	 * only enable it on generations that have been tested:
+	 */
+	if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600))
+		NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
 
-	ir3_ibo_mapping_init(&so->image_mapping, ctx->s->info.num_textures);
+	NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
 
-	/* Layout of constant registers, each section aligned to vec4.  Note
-	 * that pointer size (ubo, etc) changes depending on generation.
+	/* Super crude heuristic to limit # of tex prefetch in small
+	 * shaders.  This completely ignores loops.. but that's really
+	 * not the worst of its problems.  (A frag shader that has
+	 * loops is probably going to be big enough to not trigger a
+	 * lower threshold.)
 	 *
-	 *    user consts
-	 *    UBO addresses
-	 *    SSBO sizes
-	 *    if (vertex shader) {
-	 *        driver params (IR3_DP_*)
-	 *        if (stream_output.num_outputs > 0)
-	 *           stream-out addresses
-	 *    }
-	 *    immediates
+	 * 1) probably want to do this in terms of ir3 instructions
+	 * 2) probably really want to decide this after scheduling
+	 *    (or at least pre-RA sched) so we have a rough idea about
+	 *    nops, and don't count things that get cp'd away
+	 * 3) blob seems to use higher thresholds with a mix of more
+	 *    SFU instructions.  Which partly makes sense: more SFU
+	 *    instructions probably means you want to get the real
+	 *    shader started sooner, but that would depend on where in
+	 *    the shader the SFU instructions are, which the blob
+	 *    doesn't seem to consider.
 	 *
-	 * Immediates go last mostly because they are inserted in the CP pass
-	 * after the nir -> ir3 frontend.
+	 * This uses more conservative thresholds, assuming a more
+	 * ALU- than SFU-heavy instruction mix.
 	 */
-	unsigned constoff = align(ctx->s->num_uniforms, 4);
-	unsigned ptrsz = ir3_pointer_size(ctx);
-
-	memset(&so->constbase, ~0, sizeof(so->constbase));
-
-	if (so->num_ubos > 0) {
-		so->constbase.ubo = constoff;
-		constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4;
-	}
-
-	if (so->const_layout.ssbo_size.count > 0) {
-		unsigned cnt = so->const_layout.ssbo_size.count;
-		so->constbase.ssbo_sizes = constoff;
-		constoff += align(cnt, 4) / 4;
-	}
+	if (so->type == MESA_SHADER_FRAGMENT) {
+		nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
 
-	if (so->const_layout.image_dims.count > 0) {
-		unsigned cnt = so->const_layout.image_dims.count;
-		so->constbase.image_dims = constoff;
-		constoff += align(cnt, 4) / 4;
-	}
+		unsigned instruction_count = 0;
+		nir_foreach_block (block, fxn) {
+			instruction_count += exec_list_length(&block->instr_list);
+		}
 
-	unsigned num_driver_params = 0;
-	if (so->type == MESA_SHADER_VERTEX) {
-		num_driver_params = IR3_DP_VS_COUNT;
-	} else if (so->type == MESA_SHADER_COMPUTE) {
-		num_driver_params = IR3_DP_CS_COUNT;
+		if (instruction_count < 50) {
+			ctx->prefetch_limit = 2;
+		} else if (instruction_count < 70) {
+			ctx->prefetch_limit = 3;
+		} else {
+			ctx->prefetch_limit = IR3_MAX_SAMPLER_PREFETCH;
+		}
 	}
 
-	so->constbase.driver_param = constoff;
-	constoff += align(num_driver_params, 4) / 4;
-
-	if ((so->type == MESA_SHADER_VERTEX) &&
-			(compiler->gpu_id < 500) &&
-			so->shader->stream_output.num_outputs > 0) {
-		so->constbase.tfbo = constoff;
-		constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+	if (shader_debug_enabled(so->type)) {
+		fprintf(stdout, "NIR (final form) for %s shader %s:\n",
+			ir3_shader_stage(so), so->shader->nir->info.name);
+		nir_print_shader(ctx->s, stdout);
 	}
 
-	so->constbase.immediate = constoff;
+	ir3_ibo_mapping_init(&so->image_mapping, ctx->s->info.num_textures);
 
 	return ctx;
 }
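The prefetch-limit heuristic above reduces to a straight instruction count over the entrypoint. The same counting as a standalone helper (a sketch; the helper name is hypothetical, while nir_shader_get_entrypoint(), nir_foreach_block() and exec_list_length() are the NIR/util APIs the hunk uses):

	static unsigned
	count_nir_instructions(nir_shader *s)
	{
		nir_function_impl *impl = nir_shader_get_entrypoint(s);
		unsigned count = 0;

		/* walk every block of the entrypoint and sum its
		 * instruction list lengths:
		 */
		nir_foreach_block (block, impl) {
			count += exec_list_length(&block->instr_list);
		}
		return count;
	}

So a 60-instruction fragment shader would get prefetch_limit = 3, and anything from 70 instructions up gets IR3_MAX_SAMPLER_PREFETCH.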
@@ -228,7 +221,7 @@ ir3_get_src(struct ir3_context *ctx, nir_src *src)
 			ralloc_array(ctx, struct ir3_instruction *, num_components);
 
 		if (src->reg.indirect)
-			addr = ir3_get_addr(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
+			addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
 					reg->num_components);
 
 		for (unsigned i = 0; i < num_components; i++) {
@@ -242,7 +235,7 @@ ir3_get_src(struct ir3_context *ctx, nir_src *src)
 }
 
 void
-put_dst(struct ir3_context *ctx, nir_dest *dst)
+ir3_put_dst(struct ir3_context *ctx, nir_dest *dst)
 {
 	unsigned bit_size = nir_dest_bit_size(*dst);
 
@@ -251,17 +244,24 @@ put_dst(struct ir3_context *ctx, nir_dest *dst)
 	 * ir3_cp will clean up the extra mov:
 	 */
 	for (unsigned i = 0; i < ctx->last_dst_n; i++) {
+		if (!ctx->last_dst[i])
+			continue;
 		if (ctx->last_dst[i]->regs[0]->flags & IR3_REG_HIGH) {
 			ctx->last_dst[i] =
 				ir3_MOV(ctx->block, ctx->last_dst[i], TYPE_U32);
 		}
 	}
 
-	if (bit_size < 32) {
+	/* Note: 1-bit bools are stored in 32-bit regs */
+	if (bit_size == 16) {
 		for (unsigned i = 0; i < ctx->last_dst_n; i++) {
 			struct ir3_instruction *dst = ctx->last_dst[i];
-			dst->regs[0]->flags |= IR3_REG_HALF;
-			if (ctx->last_dst[i]->opc == OPC_META_FO)
-				dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
+			ir3_set_dst_type(dst, true);
+			ir3_fixup_src_type(dst);
+			if (dst->opc == OPC_META_SPLIT) {
+				ir3_set_dst_type(ssa(dst->regs[1]), true);
+				ir3_fixup_src_type(ssa(dst->regs[1]));
+				dst->regs[1]->flags |= IR3_REG_HALF;
+			}
 		}
 	}
 
@@ -272,7 +272,7 @@ put_dst(struct ir3_context *ctx, nir_dest *dst)
 		struct ir3_instruction *addr = NULL;
 
 		if (dst->reg.indirect)
-			addr = ir3_get_addr(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
+			addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
 					reg->num_components);
 
 		for (unsigned i = 0; i < num_components; i++) {
@@ -290,6 +290,12 @@ put_dst(struct ir3_context *ctx, nir_dest *dst)
 	ctx->last_dst_n = 0;
 }
 
+static unsigned
+dest_flags(struct ir3_instruction *instr)
+{
+	return instr->regs[0]->flags & (IR3_REG_HALF | IR3_REG_HIGH);
+}
+
 struct ir3_instruction *
 ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
 		unsigned arrsz)
@@ -300,10 +306,10 @@ ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
 	if (arrsz == 0)
 		return NULL;
 
-	unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF;
+	unsigned flags = dest_flags(arr[0]);
 
-	collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
-	ir3_reg_create(collect, 0, flags); /* dst */
+	collect = ir3_instr_create2(block, OPC_META_COLLECT, 1 + arrsz);
+	__ssa_dst(collect)->flags |= flags;
 
 	for (unsigned i = 0; i < arrsz; i++) {
 		struct ir3_instruction *elem = arr[i];
@@ -336,15 +342,17 @@ ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
 			elem = ir3_MOV(block, elem, type);
 		}
 
-		compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags);
-		ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem;
+		compile_assert(ctx, dest_flags(elem) == flags);
+		__ssa_src(collect, elem, flags);
 	}
 
+	collect->regs[0]->wrmask = MASK(arrsz);
+
 	return collect;
 }
 
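A usage sketch for the collect helper, assuming `ctx' and four already-compiled scalar values in `comps':

	/* gather four scalars into one vec4 value (OPC_META_COLLECT): */
	struct ir3_instruction *vec4 = ir3_create_collect(ctx, comps, 4);

	/* break it back into per-component values: */
	struct ir3_instruction *comp[4];
	ir3_split_dest(ctx->block, comp, vec4, 0, 4);

With the OPC_META_COLLECT fast path added to ir3_split_dest() below, this round trip simply hands back the original component instructions instead of emitting OPC_META_SPLIT.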
 /* helper for instructions that produce multiple consecutive scalar
- * outputs which need to have a split/fanout meta instruction inserted
+ * outputs which need to have a split meta instruction inserted
  */
 void
 ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
@@ -357,13 +365,24 @@ ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
 		return;
 	}
 
-	unsigned flags = src->regs[0]->flags & (IR3_REG_HALF | IR3_REG_HIGH);
+	if (src->opc == OPC_META_COLLECT) {
+		debug_assert((base + n) < src->regs_count);
+
+		for (int i = 0; i < n; i++) {
+			dst[i] = ssa(src->regs[i + base + 1]);
+		}
+
+		return;
+	}
+
+	unsigned flags = dest_flags(src);
 
 	for (int i = 0, j = 0; i < n; i++) {
-		struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
-		ir3_reg_create(split, 0, IR3_REG_SSA | flags);
-		ir3_reg_create(split, 0, IR3_REG_SSA | flags)->instr = src;
-		split->fo.off = i + base;
+		struct ir3_instruction *split =
+				ir3_instr_create(block, OPC_META_SPLIT);
+		__ssa_dst(split)->flags |= flags;
+		__ssa_src(split, src, flags);
+		split->split.off = i + base;
 
 		if (prev) {
 			split->cp.left = prev;
@@ -378,7 +397,7 @@ ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
 	}
 }
 
-void
+NORETURN void
 ir3_context_error(struct ir3_context *ctx, const char *format, ...)
 {
 	struct hash_table *errors = NULL;
@@ -397,19 +416,15 @@ ir3_context_error(struct ir3_context *ctx, const char *format, ...)
 	nir_print_shader_annotated(ctx->s, stdout, errors);
 	ralloc_free(errors);
 	ctx->error = true;
-	debug_assert(0);
+	unreachable("");
 }
 
 static struct ir3_instruction *
-create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
+create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
 {
 	struct ir3_instruction *instr, *immed;
 
-	/* TODO in at least some cases, the backend could probably be
-	 * made clever enough to propagate IR3_REG_HALF..
-	 */
 	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
-	instr->regs[0]->flags |= IR3_REG_HALF;
 
 	switch(align){
 	case 1:
@@ -417,41 +432,41 @@ create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
 		break;
 	case 2:
 		/* src *= 2 => src <<= 1: */
-		immed = create_immed(block, 1);
-		immed->regs[0]->flags |= IR3_REG_HALF;
-
+		immed = create_immed_typed(block, 1, TYPE_S16);
 		instr = ir3_SHL_B(block, instr, 0, immed, 0);
-		instr->regs[0]->flags |= IR3_REG_HALF;
-		instr->regs[1]->flags |= IR3_REG_HALF;
 		break;
 	case 3:
 		/* src *= 3: */
-		immed = create_immed(block, 3);
-		immed->regs[0]->flags |= IR3_REG_HALF;
-
+		immed = create_immed_typed(block, 3, TYPE_S16);
 		instr = ir3_MULL_U(block, instr, 0, immed, 0);
-		instr->regs[0]->flags |= IR3_REG_HALF;
-		instr->regs[1]->flags |= IR3_REG_HALF;
 		break;
 	case 4:
 		/* src *= 4 => src <<= 2: */
-		immed = create_immed(block, 2);
-		immed->regs[0]->flags |= IR3_REG_HALF;
-
+		immed = create_immed_typed(block, 2, TYPE_S16);
 		instr = ir3_SHL_B(block, instr, 0, immed, 0);
-		instr->regs[0]->flags |= IR3_REG_HALF;
-		instr->regs[1]->flags |= IR3_REG_HALF;
 		break;
 	default:
 		unreachable("bad align");
 		return NULL;
 	}
 
+	instr->regs[0]->flags |= IR3_REG_HALF;
+
 	instr = ir3_MOV(block, instr, TYPE_S16);
 	instr->regs[0]->num = regid(REG_A0, 0);
-	instr->regs[0]->flags |= IR3_REG_HALF;
-	instr->regs[1]->flags |= IR3_REG_HALF;
+	instr->regs[0]->flags &= ~IR3_REG_SSA;
+
+	return instr;
+}
+
+static struct ir3_instruction *
+create_addr1(struct ir3_block *block, unsigned const_val)
+{
+	struct ir3_instruction *immed =
+			create_immed_typed(block, const_val, TYPE_S16);
+	struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_S16);
+	instr->regs[0]->num = regid(REG_A0, 1);
+	instr->regs[0]->flags &= ~IR3_REG_SSA;
 
 	return instr;
 }
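Worked example of the align handling above: a relative access into a vec4 array needs a0.x = idx * 4, which create_addr0() emits as idx << 2; a vec3 array has no power-of-two stride, so the *3 case uses mull.u instead. A hypothetical caller, with `idx' an already-compiled ir3 value:

	/* a0.x = idx << 2; cached per-src by ir3_get_addr0() below: */
	struct ir3_instruction *addr = ir3_get_addr0(ctx, idx, 4);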
@@ -459,25 +474,45 @@
  * sequences for each use of a given NIR level src as address
  */
 struct ir3_instruction *
-ir3_get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
+ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align)
 {
 	struct ir3_instruction *addr;
 	unsigned idx = align - 1;
 
-	compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
+	compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht));
 
-	if (!ctx->addr_ht[idx]) {
-		ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
+	if (!ctx->addr0_ht[idx]) {
+		ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx,
 				_mesa_hash_pointer, _mesa_key_pointer_equal);
 	} else {
 		struct hash_entry *entry;
-		entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
+		entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src);
 		if (entry)
 			return entry->data;
 	}
 
-	addr = create_addr(ctx->block, src, align);
-	_mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
+	addr = create_addr0(ctx->block, src, align);
+	_mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr);
+
+	return addr;
+}
+
+/* Similar to ir3_get_addr0, but for a1.x. */
+struct ir3_instruction *
+ir3_get_addr1(struct ir3_context *ctx, unsigned const_val)
+{
+	struct ir3_instruction *addr;
+
+	if (!ctx->addr1_ht) {
+		ctx->addr1_ht = _mesa_hash_table_u64_create(ctx);
+	} else {
+		addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val);
+		if (addr)
+			return addr;
+	}
+
+	addr = create_addr1(ctx->block, const_val);
+	_mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr);
 
 	return addr;
 }
@@ -494,6 +529,7 @@ ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
 
 	/* condition always goes in predicate register: */
 	cond->regs[0]->num = regid(REG_P0, 0);
+	cond->regs[0]->flags &= ~IR3_REG_SSA;
 
 	return cond;
 }
@@ -517,13 +553,17 @@ ir3_declare_array(struct ir3_context *ctx, nir_register *reg)
 	arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
 	compile_assert(ctx, arr->length > 0);
 	arr->r = reg;
+	arr->half = reg->bit_size <= 16;
+	// HACK one-bit bools still end up as 32b:
+	if (reg->bit_size == 1)
+		arr->half = false;
 	list_addtail(&arr->node, &ctx->ir->array_list);
 }
 
 struct ir3_array *
 ir3_get_array(struct ir3_context *ctx, nir_register *reg)
 {
-	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+	foreach_array (arr, &ctx->ir->array_list) {
 		if (arr->r == reg)
 			return arr;
 	}
@@ -539,15 +579,23 @@ ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *mov;
 	struct ir3_register *src;
+	unsigned flags = 0;
 
 	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
+	if (arr->half) {
+		mov->cat1.src_type = TYPE_U16;
+		mov->cat1.dst_type = TYPE_U16;
+		flags |= IR3_REG_HALF;
+	} else {
+		mov->cat1.src_type = TYPE_U32;
+		mov->cat1.dst_type = TYPE_U32;
+	}
+
 	mov->barrier_class = IR3_BARRIER_ARRAY_R;
 	mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
-	ir3_reg_create(mov, 0, 0);
+	__ssa_dst(mov)->flags |= flags;
 	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
-			COND(address, IR3_REG_RELATIV));
+			COND(address, IR3_REG_RELATIV) | flags);
 	src->instr = arr->last_write;
 	src->size = arr->length;
 	src->array.id = arr->id;
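The bit_size handling above, restated as a predicate (a sketch; the helper name is hypothetical): 16-bit NIR registers get half regs and TYPE_U16 moves, while 1-bit booleans stay in full 32-bit regs on ir3.

	static bool
	array_is_half(const nir_register *reg)
	{
		/* one-bit bools still end up as 32b, per the HACK above: */
		return reg->bit_size <= 16 && reg->bit_size != 1;
	}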
@@ -567,11 +615,15 @@ ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *mov;
 	struct ir3_register *dst;
+	unsigned flags = 0;
 
 	/* if not relative store, don't create an extra mov, since that
 	 * ends up being difficult for cp to remove.
+	 *
+	 * Also, don't skip the mov if the src is meta (like fanout/split),
+	 * since that creates a situation that RA can't really handle properly.
 	 */
-	if (!address) {
+	if (!address && !is_meta(src)) {
 		dst = src->regs[0];
 
 		src->barrier_class |= IR3_BARRIER_ARRAY_W;
@@ -591,17 +643,24 @@ ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	}
 
 	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
+	if (arr->half) {
+		mov->cat1.src_type = TYPE_U16;
+		mov->cat1.dst_type = TYPE_U16;
+		flags |= IR3_REG_HALF;
+	} else {
+		mov->cat1.src_type = TYPE_U32;
+		mov->cat1.dst_type = TYPE_U32;
+	}
 	mov->barrier_class = IR3_BARRIER_ARRAY_W;
 	mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
 	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			flags |
 			COND(address, IR3_REG_RELATIV));
 	dst->instr = arr->last_write;
 	dst->size = arr->length;
 	dst->array.id = arr->id;
 	dst->array.offset = n;
-	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
+	ir3_reg_create(mov, 0, IR3_REG_SSA | flags)->instr = src;
 
 	if (address)
 		ir3_instr_set_address(mov, address);
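Putting the array helpers together, a hypothetical dynamically indexed store, where `idx' and `val' are already-compiled ir3 values and `arr' comes from ir3_get_array():

	/* a0.x = idx (align 1: the index is in scalar components): */
	struct ir3_instruction *addr = ir3_get_addr0(ctx, idx, 1);

	/* arr[a0.x + 0] = val, emitted as a relative array store: */
	ir3_create_array_store(ctx, arr, 0, val, addr);

Because `address' is non-NULL, the store keeps the extra mov, marks the dst register IR3_REG_RELATIV, and records the a0.x dependency via ir3_instr_set_address().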