From edde00f5f1a1a6b3f9827af0475f6ff097705c1f Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@gmail.com>
Date: Mon, 17 Apr 2017 11:04:00 -0400
Subject: [PATCH] freedreno/ir3: SSBO/atomic support

TODO cwabbott pointed out a write-after-read hazzard, which effects both
this and arrays.  A write needs to depend on *all* reads since the last
write, not just the last read.

Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 src/gallium/drivers/freedreno/ir3/ir3.c       |  74 ++++++-
 src/gallium/drivers/freedreno/ir3/ir3.h       |  37 +++-
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  | 195 +++++++++++++++++-
 src/gallium/drivers/freedreno/ir3/ir3_cp.c    |   6 +
 .../drivers/freedreno/ir3/ir3_legalize.c      |   7 +-
 .../drivers/freedreno/ir3/ir3_shader.h        |   3 +
 6 files changed, 309 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index ff2c342c357..d703f4e7f38 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -475,6 +475,13 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 	struct ir3_register *dst, *src1, *src2;
 	instr_cat6_t *cat6 = ptr;
 
+	cat6->type     = instr->cat6.type;
+	cat6->opc      = instr->opc;
+	cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+	cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat6->g        = !!(instr->flags & IR3_INSTR_G);
+	cat6->opc_cat  = 6;
+
 	/* the "dst" for a store instruction is (from the perspective
 	 * of data flow in the shader, ie. register use/def, etc) in
 	 * fact a register that is read by the instruction, rather
@@ -500,7 +507,65 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 	 * indicate to use the src_off encoding even if offset is zero
 	 * (but then what to do about dst_off?)
 	 */
-	if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) {
+	if ((instr->opc == OPC_LDGB) || is_atomic(instr->opc)) {
+		struct ir3_register *src3 = instr->regs[3];
+		instr_cat6ldgb_t *ldgb = ptr;
+
+		/* maybe these two bits both determine the instruction encoding? */
+		cat6->src_off = false;
+
+		ldgb->d = 4 - 1;      /* always .4d ? */
+		ldgb->typed = false;  /* TODO true for images */
+		ldgb->type_size = instr->cat6.iim_val - 1;
+
+		ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+		/* first src is src_ssbo: */
+		iassert(src1->flags & IR3_REG_IMMED);
+		ldgb->src_ssbo = src1->uim_val;
+
+		/* then next two are src1/src2: */
+		ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+		ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+		ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+		ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+		if (is_atomic(instr->opc)) {
+			struct ir3_register *src4 = instr->regs[4];
+			ldgb->src3 = reg(src4, info, instr->repeat, 0);
+			ldgb->pad0 = 0x1;
+			ldgb->pad3 = 0x3;
+		} else {
+			ldgb->pad0 = 0x0;
+			ldgb->pad3 = 0x2;
+		}
+
+		return 0;
+	} else if (instr->opc == OPC_STGB) {
+		struct ir3_register *src3 = instr->regs[4];
+		instr_cat6stgb_t *stgb = ptr;
+
+		/* maybe these two bits both determine the instruction encoding? */
+		cat6->src_off = true;
+		stgb->pad3 = 0x2;
+
+		stgb->d = 4 - 1;    /* always .4d ? */
+		stgb->typed = false;
+		stgb->type_size = instr->cat6.iim_val - 1;
+
+		/* first src is dst_ssbo: */
+		iassert(dst->flags & IR3_REG_IMMED);
+		stgb->dst_ssbo = dst->uim_val;
+
+		/* then src1/src2/src3: */
+		stgb->src1 = reg(src1, info, instr->repeat, 0);
+		stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+		stgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+		stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+		stgb->src3_im = !!(src3->flags & IR3_REG_IMMED);
+
+		return 0;
+	} else if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) {
 		instr_cat6a_t *cat6a = ptr;
 
 		cat6->src_off = true;
@@ -536,13 +601,6 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 		cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
 	}
 
-	cat6->type     = instr->cat6.type;
-	cat6->opc      = instr->opc;
-	cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-	cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
-	cat6->g        = !!(instr->flags & IR3_INSTR_G);
-	cat6->opc_cat  = 6;
-
 	return 0;
 }
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 8d75ec168cb..beb125c5e97 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -226,7 +226,7 @@ struct ir3_instruction {
 			type_t type;
 			int src_offset;
 			int dst_offset;
-			int iim_val;
+			int iim_val;          /* for ldgb/stgb, # of components */
 		} cat6;
 		/* for meta-instructions, just used to hold extra data
 		 * before instruction scheduling, etc
@@ -602,6 +602,7 @@ is_store(struct ir3_instruction *instr)
 	 */
 	switch (instr->opc) {
 	case OPC_STG:
+	case OPC_STGB:
 	case OPC_STP:
 	case OPC_STL:
 	case OPC_STLW:
@@ -617,6 +618,7 @@ static inline bool is_load(struct ir3_instruction *instr)
 {
 	switch (instr->opc) {
 	case OPC_LDG:
+	case OPC_LDGB:
 	case OPC_LDL:
 	case OPC_LDP:
 	case OPC_L2G:
@@ -931,7 +933,7 @@ int ir3_ra(struct ir3 *ir3, enum shader_t type,
 		bool frag_coord, bool frag_face);
 
 /* legalize: */
-void ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary);
+void ir3_legalize(struct ir3 *ir, bool *has_samp, bool *has_ssbo, int *max_bary);
 
 /* ************************************************************************* */
 /* instruction helpers */
@@ -1025,6 +1027,24 @@ ir3_##name(struct ir3_block *block,                                      \
 	return instr;                                                        \
 }
 
+#define INSTR4(name)                                                     \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+		struct ir3_instruction *a, unsigned aflags,                      \
+		struct ir3_instruction *b, unsigned bflags,                      \
+		struct ir3_instruction *c, unsigned cflags,                      \
+		struct ir3_instruction *d, unsigned dflags)                      \
+{                                                                        \
+	struct ir3_instruction *instr =                                      \
+		ir3_instr_create2(block, OPC_##name, 5);                         \
+	ir3_reg_create(instr, 0, 0);   /* dst */                             \
+	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
+	ir3_reg_create(instr, 0, IR3_REG_SSA | bflags)->instr = b;           \
+	ir3_reg_create(instr, 0, IR3_REG_SSA | cflags)->instr = c;           \
+	ir3_reg_create(instr, 0, IR3_REG_SSA | dflags)->instr = d;           \
+	return instr;                                                        \
+}
+
 /* cat0 instructions: */
 INSTR0(BR);
 INSTR0(JUMP);
@@ -1142,6 +1162,19 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
 INSTR2(LDLV)
 INSTR2(LDG)
 INSTR3(STG)
+INSTR3(LDGB);
+INSTR4(STGB);
+INSTR4(ATOMIC_ADD);
+INSTR4(ATOMIC_SUB);
+INSTR4(ATOMIC_XCHG);
+INSTR4(ATOMIC_INC);
+INSTR4(ATOMIC_DEC);
+INSTR4(ATOMIC_CMPXCHG);
+INSTR4(ATOMIC_MIN);
+INSTR4(ATOMIC_MAX);
+INSTR4(ATOMIC_AND);
+INSTR4(ATOMIC_OR);
+INSTR4(ATOMIC_XOR);
 
 /* ************************************************************************* */
 /* split this out or find some helper to use.. like main/bitset.h.. */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 22619e852c2..a164675ed24 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -71,6 +71,20 @@ struct ir3_compile {
 	/* For vertex shaders, keep track of the system values sources */
 	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
 
+	/* For SSBO's and atomics, we need to preserve order, such
+	 * that reads don't overtake writes, and the order of writes
+	 * is preserved.  Atomics are considered as a write.
+	 *
+	 * To do this, we track last write and last access, in a
+	 * similar way to ir3_array.  But since we don't know whether
+	 * the same SSBO is bound to multiple slots, so we simply
+	 * track this globally rather than per-SSBO.
+	 *
+	 * TODO should we track this per block instead?  I guess it
+	 * shouldn't matter much?
+	 */
+	struct ir3_instruction *last_write, *last_access;
+
 	/* mapping from nir_register to defining instruction: */
 	struct hash_table *def_ht;
 
@@ -430,7 +444,7 @@ create_uniform_indirect(struct ir3_compile *ctx, int n,
 }
 
 static struct ir3_instruction *
-create_collect(struct ir3_block *block, struct ir3_instruction **arr,
+create_collect(struct ir3_block *block, struct ir3_instruction *const *arr,
 		unsigned arrsz)
 {
 	struct ir3_instruction *collect;
@@ -1136,6 +1150,165 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	}
 }
 
+static void
+mark_ssbo_read(struct ir3_compile *ctx, struct ir3_instruction *instr)
+{
+	instr->regs[0]->instr = ctx->last_write;
+	instr->regs[0]->flags |= IR3_REG_SSA;
+	ctx->last_access = instr;
+}
+
+static void
+mark_ssbo_write(struct ir3_compile *ctx, struct ir3_instruction *instr)
+{
+	instr->regs[0]->instr = ctx->last_access;
+	instr->regs[0]->flags |= IR3_REG_SSA;
+	ctx->last_write = ctx->last_access = instr;
+}
+
+static void
+emit_intrinsic_load_ssbo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *ldgb, *src0, *src1, *offset;
+	nir_const_value *const_offset;
+
+	/* can this be non-const buffer_index?  how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[0]);
+	compile_assert(ctx, const_offset);
+
+	offset = get_src(ctx, &intr->src[1])[0];
+
+	/* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
+	src0 = create_collect(b, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
+	src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+
+	ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0,
+			src0, 0, src1, 0);
+	ldgb->regs[0]->wrmask = (1 << intr->num_components) - 1;
+	ldgb->cat6.iim_val = intr->num_components;
+	ldgb->cat6.type = TYPE_U32;
+	mark_ssbo_read(ctx, ldgb);
+
+	split_dest(b, dst, ldgb, 0, intr->num_components);
+}
+
+/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
+static void
+emit_intrinsic_store_ssbo(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *stgb, *src0, *src1, *src2, *offset;
+	nir_const_value *const_offset;
+	unsigned ncomp = ffs(~intr->const_index[0]) - 1;
+
+	/* can this be non-const buffer_index?  how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[1]);
+	compile_assert(ctx, const_offset);
+
+	offset = get_src(ctx, &intr->src[2])[0];
+
+	/* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
+	 * nir already *= 4:
+	 */
+	src0 = create_collect(b, get_src(ctx, &intr->src[0]), ncomp);
+	src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+	src2 = create_collect(b, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
+
+	stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0,
+			src0, 0, src1, 0, src2, 0);
+	stgb->cat6.iim_val = ncomp;
+	stgb->cat6.type = TYPE_U32;
+	mark_ssbo_write(ctx, stgb);
+
+	array_insert(b, b->keeps, stgb);
+}
+
+static void
+emit_intrinsic_atomic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset;
+	nir_const_value *const_offset;
+	type_t type = TYPE_U32;
+
+	/* can this be non-const buffer_index?  how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[0]);
+	compile_assert(ctx, const_offset);
+	ssbo = create_immed(b, const_offset->u32[0]);
+
+	offset = get_src(ctx, &intr->src[1])[0];
+
+	/* src0 is data (or uvec2(data, compare)
+	 * src1 is offset
+	 * src2 is uvec2(offset*4, 0)
+	 *
+	 * Note that nir already multiplies the offset by four
+	 */
+	src0 = get_src(ctx, &intr->src[2])[0];
+	src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+	src2 = create_collect(b, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_ssbo_atomic_add:
+		atomic = ir3_ATOMIC_ADD(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_imin:
+		atomic = ir3_ATOMIC_MIN(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		type = TYPE_S32;
+		break;
+	case nir_intrinsic_ssbo_atomic_umin:
+		atomic = ir3_ATOMIC_MIN(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_imax:
+		atomic = ir3_ATOMIC_MAX(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		type = TYPE_S32;
+		break;
+	case nir_intrinsic_ssbo_atomic_umax:
+		atomic = ir3_ATOMIC_MAX(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_and:
+		atomic = ir3_ATOMIC_AND(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_or:
+		atomic = ir3_ATOMIC_OR(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_xor:
+		atomic = ir3_ATOMIC_XOR(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_exchange:
+		atomic = ir3_ATOMIC_XCHG(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_comp_swap:
+		/* for cmpxchg, src0 is [ui]vec2(data, compare): */
+		src0 = create_collect(b, (struct ir3_instruction*[]){
+			src0,
+			get_src(ctx, &intr->src[3])[0],
+		}, 2);
+		atomic = ir3_ATOMIC_CMPXCHG(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	default:
+		unreachable("boo");
+	}
+
+	atomic->cat6.iim_val = 1;
+	atomic->cat6.type = type;
+	mark_ssbo_write(ctx, atomic);
+
+	/* even if nothing consume the result, we can't DCE the instruction: */
+	array_insert(b, b->keeps, atomic);
+}
+
 static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
 		struct ir3_instruction *instr)
 {
@@ -1225,6 +1398,24 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	case nir_intrinsic_store_var:
 		emit_intrinsic_store_var(ctx, intr);
 		break;
+	case nir_intrinsic_load_ssbo:
+		emit_intrinsic_load_ssbo(ctx, intr, dst);
+		break;
+	case nir_intrinsic_store_ssbo:
+		emit_intrinsic_store_ssbo(ctx, intr);
+		break;
+	case nir_intrinsic_ssbo_atomic_add:
+	case nir_intrinsic_ssbo_atomic_imin:
+	case nir_intrinsic_ssbo_atomic_umin:
+	case nir_intrinsic_ssbo_atomic_imax:
+	case nir_intrinsic_ssbo_atomic_umax:
+	case nir_intrinsic_ssbo_atomic_and:
+	case nir_intrinsic_ssbo_atomic_or:
+	case nir_intrinsic_ssbo_atomic_xor:
+	case nir_intrinsic_ssbo_atomic_exchange:
+	case nir_intrinsic_ssbo_atomic_comp_swap:
+		emit_intrinsic_atomic(ctx, intr);
+		break;
 	case nir_intrinsic_store_output:
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[1]);
@@ -2541,7 +2732,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	/* We need to do legalize after (for frag shader's) the "bary.f"
 	 * offsets (inloc) have been assigned.
 	 */
-	ir3_legalize(ir, &so->has_samp, &max_bary);
+	ir3_legalize(ir, &so->has_samp, &so->has_ssbo, &max_bary);
 
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("AFTER LEGALIZE:\n");
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 7bb858df4d4..8c907eb5a53 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -193,6 +193,12 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
 			 */
 			if (is_store(instr) && (n == 1))
 				return false;
+
+			/* disallow CP into anything but the SSBO slot argument for
+			 * atomics:
+			 */
+			if (is_atomic(instr->opc) && (n != 0))
+				return false;
 		}
 
 		break;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index 6acea011d5c..fffa76504da 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -43,6 +43,7 @@
 
 struct ir3_legalize_ctx {
 	bool has_samp;
+	bool has_ssbo;
 	int max_bary;
 };
 
@@ -192,6 +193,9 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 				regmask_set(&needs_sy, n->regs[0]);
 		}
 
+		if ((n->opc == OPC_LDGB) || (n->opc == OPC_STGB) || is_atomic(n->opc))
+			ctx->has_ssbo = true;
+
 		/* both tex/sfu appear to not always immediately consume
 		 * their src register(s):
 		 */
@@ -388,7 +392,7 @@ mark_convergence_points(struct ir3 *ir)
 }
 
 void
-ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary)
+ir3_legalize(struct ir3 *ir, bool *has_samp, bool *has_ssbo, int *max_bary)
 {
 	struct ir3_legalize_ctx ctx = {
 			.max_bary = -1,
@@ -399,6 +403,7 @@ ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary)
 	}
 
 	*has_samp = ctx.has_samp;
+	*has_ssbo = ctx.has_ssbo;
 	*max_bary = ctx.max_bary;
 
 	do {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index e5dcb739783..a06dd04904c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -249,6 +249,9 @@ struct ir3_shader_variant {
 	/* do we have one or more texture sample instructions: */
 	bool has_samp;
 
+	/* do we have one or more SSBO instructions: */
+	bool has_ssbo;
+
 	/* do we have kill instructions: */
 	bool has_kill;
 
-- 
2.30.2