From 6667dde098c4d9f30720024e76e35963eec2c511 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@gmail.com>
Date: Wed, 5 Dec 2018 15:07:51 -0500
Subject: [PATCH] freedreno/ir3: don't treat all inputs/outputs as vec4

This was a hold-over from the early TGSI days, and mostly not needed
with NIR.  This avoids burning an entire 4 consecutive scalar regs
for vec3 outputs, for example.  This fixes a few places where we were
doing worse than we should on register usage.

Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 src/freedreno/ir3/ir3_compiler_nir.c | 50 ++++++++++++++++++++--------
 src/freedreno/ir3/ir3_ra.c           |  2 ++
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 936c3277fff..167d6ae7836 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -2649,11 +2649,9 @@ setup_input(struct ir3_context *ctx, nir_variable *in)
 	struct ir3_shader_variant *so = ctx->so;
 	unsigned ncomp = glsl_get_components(in->type);
 	unsigned n = in->data.driver_location;
+	unsigned frac = in->data.location_frac;
 	unsigned slot = in->data.location;
 
-	/* let's pretend things other than vec4 don't exist: */
-	ncomp = MAX2(ncomp, 4);
-
 	/* skip unread inputs, we could end up with (for example), unsplit
 	 * matrix/etc inputs in the case they are not read, so just silently
 	 * skip these.
@@ -2661,17 +2659,15 @@ setup_input(struct ir3_context *ctx, nir_variable *in)
 	if (ncomp > 4)
 		return;
 
-	compile_assert(ctx, ncomp == 4);
-
 	so->inputs[n].slot = slot;
-	so->inputs[n].compmask = (1 << ncomp) - 1;
+	so->inputs[n].compmask = (1 << (ncomp + frac)) - 1;
 	so->inputs_count = MAX2(so->inputs_count, n + 1);
 	so->inputs[n].interpolate = in->data.interpolation;
 
 	if (ctx->so->type == MESA_SHADER_FRAGMENT) {
 		for (int i = 0; i < ncomp; i++) {
 			struct ir3_instruction *instr = NULL;
-			unsigned idx = (n * 4) + i;
+			unsigned idx = (n * 4) + i + frac;
 
 			if (slot == VARYING_SLOT_POS) {
 				so->inputs[n].bary = false;
@@ -2726,7 +2722,7 @@ setup_input(struct ir3_context *ctx, nir_variable *in)
 		}
 	} else if (ctx->so->type == MESA_SHADER_VERTEX) {
 		for (int i = 0; i < ncomp; i++) {
-			unsigned idx = (n * 4) + i;
+			unsigned idx = (n * 4) + i + frac;
 			compile_assert(ctx, idx < ctx->ir->ninputs);
 			ctx->ir->inputs[idx] = create_input(ctx, idx);
 		}
@@ -2745,13 +2741,10 @@ setup_output(struct ir3_context *ctx, nir_variable *out)
 	struct ir3_shader_variant *so = ctx->so;
 	unsigned ncomp = glsl_get_components(out->type);
 	unsigned n = out->data.driver_location;
+	unsigned frac = out->data.location_frac;
 	unsigned slot = out->data.location;
 	unsigned comp = 0;
 
-	/* let's pretend things other than vec4 don't exist: */
-	ncomp = MAX2(ncomp, 4);
-	compile_assert(ctx, ncomp == 4);
-
 	if (ctx->so->type == MESA_SHADER_FRAGMENT) {
 		switch (slot) {
 		case FRAG_RESULT_DEPTH:
@@ -2803,10 +2796,25 @@ setup_output(struct ir3_context *ctx, nir_variable *out)
 	so->outputs_count = MAX2(so->outputs_count, n + 1);
 
 	for (int i = 0; i < ncomp; i++) {
-		unsigned idx = (n * 4) + i;
+		unsigned idx = (n * 4) + i + frac;
 		compile_assert(ctx, idx < ctx->ir->noutputs);
 		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
 	}
+
+	/* if varying packing doesn't happen, we could end up in a situation
+	 * with "holes" in the output, and since the per-generation code that
+	 * sets up varying linkage registers doesn't expect to have more than
+	 * one varying per vec4 slot, pad the holes.
+	 *
+	 * Note that this should probably generate a performance warning of
+	 * some sort.
+	 */
+	for (int i = 0; i < frac; i++) {
+		unsigned idx = (n * 4) + i;
+		if (!ctx->ir->outputs[idx]) {
+			ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
+		}
+	}
 }
 
 static int
@@ -3126,7 +3134,21 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	/* fixup input/outputs: */
 	for (i = 0; i < so->outputs_count; i++) {
-		so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
+		/* sometimes we get outputs that don't write the .x coord, like:
+		 *
+		 *   decl_var shader_out INTERP_MODE_NONE float Color (VARYING_SLOT_VAR9.z, 1, 0)
+		 *
+		 * Presumably the result of varying packing and then eliminating
+		 * some unneeded varyings?  Just skip ahead to the first valid
+		 * component of the output.
+		 */
+		for (unsigned j = 0; j < 4; j++) {
+			struct ir3_instruction *instr = ir->outputs[(i*4) + j];
+			if (instr) {
+				so->outputs[i].regid = instr->regs[0]->num;
+				break;
+			}
+		}
 	}
 
 	/* Note that some or all channels of an input may be unused: */
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index b202c141378..f951acd5eef 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -917,6 +917,8 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 	/* need to fix things up to keep outputs live: */
 	for (unsigned i = 0; i < ir->noutputs; i++) {
 		struct ir3_instruction *instr = ir->outputs[i];
+		if (!instr)
+			continue;
 		unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
 		ctx->use[name] = ctx->instr_cnt;
 	}
-- 
2.30.2