From 3f7239ca0ef279be3e1618770a1c2b9112236234 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 22 Feb 2014 09:46:39 -0500
Subject: [PATCH] freedreno/a3xx/compiler: half-precision output

Using generic shaders caused a measurable fps drop, which was isolated to
use of full precision (vs half precision) output.  This is an attempt to
regain that lost performance by using half precision solid/blit shaders
(when the output format is not float32).

Note: for the built-in shaders, I would not expect them to be register
starved.  And in fact it is the solid frag shader that seems to have the
biggest impact.  So I suspect you get double the pixel pipe units (or
half the cycles) when the output is half precision.  So there may be
some gain to using half precision output for application shaders as
well, even though the rest of register usage is still full precision.
But for half precision to work for more complex shaders, we need to deal
with some constraints, like cat2 needing the same precision for its two src
registers.  So for now it is not enabled by default except for the
built-in shaders.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/a3xx/fd3_compiler.c     |  21 +++-
 src/gallium/drivers/freedreno/a3xx/fd3_draw.c |   5 +
 src/gallium/drivers/freedreno/a3xx/fd3_gmem.c |   3 +
 .../drivers/freedreno/a3xx/fd3_program.c      |   6 +-
 src/gallium/drivers/freedreno/a3xx/ir3.h      |   3 +-
 src/gallium/drivers/freedreno/a3xx/ir3_ra.c   | 102 ++++++++++++++++--
 6 files changed, 130 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
index f52003a47ee..818d5611dd9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
@@ -48,6 +48,25 @@
 #include "instr-a3xx.h"
 #include "ir3.h"
 
+/* NOTE on half/full precision:
+ * Currently, the front end (ie. basically this file) does everything in
+ * full precision (with the exception of trans_arl() which doesn't work
+ * currently.. we reject anything with relative addressing and fall back
+ * to old compiler).
+ *
+ * In the RA step, if half_precision, it will assign the output to hr0.x
+ * but use full precision everywhere else.
+ *
+ * Eventually we'll need a better way to communicate type information
+ * to RA so that it can more properly assign both half and full precision
+ * registers.  (And presumably double precision pairs for a4xx?)  This
+ * would let us make more use of half precision registers, while still
+ * keeping things like tex coords in full precision registers.
+ *
+ * Since the RA is dealing with patching instruction types for half
+ * precision output, we can ignore that in the front end and just always
+ * create full precision instructions.
+ */
 
 struct fd3_compile_context {
 	const struct tgsi_token *tokens;
@@ -2030,7 +2049,7 @@ fd3_compile_shader(struct fd3_shader_variant *so,
 		ir3_dump_instr_list(ctx.block->head);
 	}
 
-	ret = ir3_block_ra(ctx.block, so->type);
+	ret = ir3_block_ra(ctx.block, so->type, key.half_precision);
 	if (ret)
 		goto out;
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index 7b071b2cd5d..f822aa728fe 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -103,6 +103,9 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
 			/* do binning pass first: */
 			.binning_pass = true,
 			.color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
+			// TODO set .half_precision based on render target format,
+			// ie. float16 and smaller use half, float32 use full..
+			.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
 	};
 	draw_impl(ctx, info, ctx->binning_ring,
 			dirty & ~(FD_DIRTY_BLEND), key);
@@ -126,6 +129,7 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
 	struct fd_ringbuffer *ring = ctx->binning_ring;
 	struct fd3_shader_key key = {
 			.binning_pass = true,
+			.half_precision = true,
 	};
 
 	fd3_emit_state(ctx, ring, &ctx->solid_prog, dirty, key);
@@ -166,6 +170,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 	unsigned dirty = ctx->dirty;
 	unsigned ce, i;
 	struct fd3_shader_key key = {
+			.half_precision = true,
 	};
 
 	dirty &= FD_DIRTY_VIEWPORT | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index d1aa8cf1208..dde71ba97b9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -44,6 +44,9 @@
 #include "fd3_zsa.h"
 
 static const struct fd3_shader_key key = {
+		// XXX should set this based on render target format!  We don't
+		// want half_precision if float32 render target!!!
+		.half_precision = true,
 };
 
 static void
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 0a7500f1611..34d4dd3330b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -101,7 +101,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
 	v->type = so->type;
 
 	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("dump tgsi: type=%d", so->type);
+		DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type,
+			key.binning_pass, key.color_two_side, key.half_precision);
 		tgsi_dump(tokens, 0);
 	}
 
@@ -138,7 +139,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
 		fixup_vp_regfootprint(v);
 
 	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("disassemble: type=%d", v->type);
+		DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+			key.binning_pass, key.color_two_side, key.half_precision);
 		disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
 	}
 
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h
index 894db175076..9327fbdca72 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3.h
+++ b/src/gallium/drivers/freedreno/a3xx/ir3.h
@@ -379,7 +379,8 @@ void ir3_block_cp(struct ir3_block *block);
 void ir3_block_sched(struct ir3_block *block);
 
 /* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type);
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+		bool half_precision);
 
 
 #ifndef ARRAY_SIZE
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
index 06a86ff3b2d..1b3d0e3e1e5 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
@@ -53,10 +53,19 @@
 struct ir3_ra_ctx {
 	struct ir3_block *block;
 	enum shader_t type;
+	bool half_precision;
 	int cnt;
 	bool error;
 };
 
+/* sorta ugly way to retrofit half-precision support.. rather than
+ * passing extra param around, just OR in a high bit.  All the low
+ * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
+ * will continue to work as long as you don't underflow (and that
+ * would go badly anyways).
+ */
+#define REG_HALF  0x8000
+
 struct ir3_ra_assignment {
 	int8_t  off;        /* offset of instruction dst within range */
 	uint8_t num;        /* number of components for the range */
@@ -91,7 +100,7 @@ static int output_base(struct ir3_ra_ctx *ctx)
 	 * see how because the blob driver always uses r0.x (ie.
 	 * all zeros)
 	 */
-	if (ctx->type == SHADER_FRAGMENT)
+	if ((ctx->type == SHADER_FRAGMENT) && !ctx->half_precision)
 		return 2;
 	return 0;
 }
@@ -348,12 +357,88 @@ static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
 	return (struct ra_assign_visitor *)v;
 }
 
+static type_t half_type(type_t type) /* 32-bit type -> matching 16-bit type */
+{
+	switch (type) {
+	case TYPE_F32: return TYPE_F16;
+	case TYPE_U32: return TYPE_U16;
+	case TYPE_S32: return TYPE_S16;
+	/* already a 16-bit type (instr may already be fixed up): pass through */
+	case TYPE_F16:
+	case TYPE_U16:
+	case TYPE_S16:
+		return type;
+	default: /* any other type is unexpected here */
+		assert(0);
+		return ~0; /* only reached when assert() is compiled out */
+	}
+}
+
+/* patch instr type/opcode to match a half precision dst register: */
+static void fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+	switch (instr->category) {
+	case 1: /* move instructions */
+		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+		break;
+	case 3: /* swap each 32-bit opcode for its 16-bit variant */
+		switch (instr->opc) {
+		case OPC_MAD_F32:
+			instr->opc = OPC_MAD_F16;
+			break;
+		case OPC_SEL_B32:
+			instr->opc = OPC_SEL_B16;
+			break;
+		case OPC_SEL_S32:
+			instr->opc = OPC_SEL_S16;
+			break;
+		case OPC_SEL_F32:
+			instr->opc = OPC_SEL_F16;
+			break;
+		case OPC_SAD_S32:
+			instr->opc = OPC_SAD_S16;
+			break;
+		/* instructions may already be fixed up: */
+		case OPC_MAD_F16:
+		case OPC_SEL_B16:
+		case OPC_SEL_S16:
+		case OPC_SEL_F16:
+		case OPC_SAD_S16:
+			break;
+		default: /* cat3 opcode with no 16-bit mapping listed above */
+			assert(0);
+			break;
+		}
+		break;
+	case 5: /* patch the cat5 type field */
+		instr->cat5.type = half_type(instr->cat5.type);
+		break;
+	} /* any other category is left unchanged */
+}
+/* patch instr for a half precision src register (only cat1 is handled): */
+static void fixup_half_instr_src(struct ir3_instruction *instr)
+{
+	switch (instr->category) {
+	case 1: /* move instructions */
+		instr->cat1.src_type = half_type(instr->cat1.src_type);
+		break;
+	} /* any other category is left unchanged */
+}
+
 static void ra_assign_reg(struct ir3_visitor *v,
 		struct ir3_instruction *instr, struct ir3_register *reg)
 {
 	struct ra_assign_visitor *a = ra_assign_visitor(v);
 	reg->flags &= ~IR3_REG_SSA;
-	reg->num = a->num;
+	reg->num = a->num & ~REG_HALF; /* strip the REG_HALF marker bit */
+	if (a->num & REG_HALF) {
+		reg->flags |= IR3_REG_HALF; /* record half precision as a reg flag */
+		/* regs[0] is the dst: patch the dst side, otherwise the src side: */
+		if (reg == instr->regs[0])
+			fixup_half_instr_dst(instr);
+		else
+			fixup_half_instr_src(instr);
+	}
 }
 
 static void ra_assign_dst_shader_input(struct ir3_visitor *v,
@@ -429,8 +514,8 @@ static void ra_assign(struct ir3_ra_ctx *ctx,
 
 	/* if we've already visited this instruction, bail now: */
 	if (ir3_instr_check_mark(assigner)) {
-		debug_assert(assigner->regs[0]->num == num);
-		if (assigner->regs[0]->num != num) {
+		debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
+		if (assigner->regs[0]->num != (num & ~REG_HALF)) {
 			/* impossible situation, should have been resolved
 			 * at an earlier stage by inserting extra mov's:
 			 */
@@ -593,6 +678,9 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		base = alloc_block(ctx, NULL, block->noutputs + off);
 
+		if (ctx->half_precision)
+			base |= REG_HALF;
+
 		for (i = 0; i < block->noutputs; i++)
 			if (block->outputs[i])
 				ra_assign(ctx, block->outputs[i], base + i + off);
@@ -600,7 +688,7 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		if (ctx->type == SHADER_FRAGMENT) {
 			for (i = 0; i < block->ninputs; i++)
 				if (block->inputs[i])
-					ra_assign(ctx, block->inputs[i], base + i);
+					ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + i);
 		} else {
 			for (i = 0; i < block->ninputs; i++)
 				if (block->inputs[i])
@@ -623,11 +711,13 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 	return 0;
 }
 
-int ir3_block_ra(struct ir3_block *block, enum shader_t type)
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+		bool half_precision)
 {
 	struct ir3_ra_ctx ctx = {
 			.block = block,
 			.type = type,
+			.half_precision = half_precision,
 	};
 	ir3_shader_clear_mark(block->shader);
 	return block_ra(&ctx, block);
-- 
2.30.2