From: Rob Clark <robdclark@chromium.org>
Date: Fri, 25 Oct 2019 22:37:56 +0000 (-0700)
Subject: freedreno/ir3: re-work shader inputs/outputs
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=bdf6b7018cedf95b554e21953d5a1935d3067ce7;p=mesa.git

freedreno/ir3: re-work shader inputs/outputs

Allow inputs/outputs to be vecN (i.e. whatever their actual size is), and
use split to get scalar components of inputs, and collect to gather up
scalar components of outputs.

The main motivation is to simplify RA, by only having to consider split/
collect to figure out where values need to land in consecutive scalar
registers, rather than having to also deal with left/right neighbors.

Because of varying packing, and the resulting fractional location
(location_frac), to implement load_input/store_output, it is still
convenient to have a table of scalar inputs/outputs.  We move this to
the compile ctx (since it is only needed for nir->ir3).

Signed-off-by: Rob Clark <robdclark@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---

diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 76ee44d80f5..b89e9b316a5 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -45,18 +45,12 @@ void * ir3_alloc(struct ir3 *shader, int sz)
 	return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
 }
 
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
-		gl_shader_stage type, unsigned nin, unsigned nout)
+struct ir3 * ir3_create(struct ir3_compiler *compiler, gl_shader_stage type)
 {
 	struct ir3 *shader = rzalloc(NULL, struct ir3);
 
 	shader->compiler = compiler;
 	shader->type = type;
-	shader->ninputs = nin;
-	shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin);
-
-	shader->noutputs = nout;
-	shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
 
 	list_inithead(&shader->block_list);
 	list_inithead(&shader->array_list);
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 3e4fa34aa0f..afff38b9b60 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -268,11 +268,19 @@ struct ir3_instruction {
 		struct {
 			int off;              /* component/offset */
 		} split;
+		struct {
+			/* for output collects, this maps back to the entry in the
+			 * ir3_shader_variant::outputs table.
+			 */
+			int outidx;
+		} collect;
 		struct {
 			unsigned samp, tex;
 			unsigned input_offset;
 		} prefetch;
 		struct {
+			/* maps back to entry in ir3_shader_variant::inputs table: */
+			int inidx;
 			/* for sysvals, identifies the sysval type.  Mostly so we can
 			 * identify the special cases where a sysval should not be DCE'd
 			 * (currently, just pre-fs texture fetch)
@@ -425,9 +433,8 @@ struct ir3 {
 	struct ir3_compiler *compiler;
 	gl_shader_stage type;
 
-	unsigned ninputs, noutputs;
-	struct ir3_instruction **inputs;
-	struct ir3_instruction **outputs;
+	DECLARE_ARRAY(struct ir3_instruction *, inputs);
+	DECLARE_ARRAY(struct ir3_instruction *, outputs);
 
 	/* Track bary.f (and ldlv) instructions.. this is needed in
 	 * scheduling to ensure that all varying fetches happen before
@@ -537,8 +544,7 @@ block_id(struct ir3_block *block)
 #endif
 }
 
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
-		gl_shader_stage type, unsigned nin, unsigned nout);
+struct ir3 * ir3_create(struct ir3_compiler *compiler, gl_shader_stage type);
 void ir3_destroy(struct ir3 *shader);
 void * ir3_assemble(struct ir3 *shader,
 		struct ir3_info *info, uint32_t gpu_id);
@@ -1065,14 +1071,14 @@ static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
 
 /* iterators for shader inputs: */
 #define foreach_input_n(__ininstr, __cnt, __ir) \
-	for (unsigned __cnt = 0; __cnt < (__ir)->ninputs; __cnt++) \
+	for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \
 		if ((__ininstr = (__ir)->inputs[__cnt]))
 #define foreach_input(__ininstr, __ir) \
 	foreach_input_n(__ininstr, __i, __ir)
 
 /* iterators for shader outputs: */
 #define foreach_output_n(__outinstr, __cnt, __ir) \
-	for (unsigned __cnt = 0; __cnt < (__ir)->noutputs; __cnt++) \
+	for (unsigned __cnt = 0; __cnt < (__ir)->outputs_count; __cnt++) \
 		if ((__outinstr = (__ir)->outputs[__cnt]))
 #define foreach_output(__outinstr, __ir) \
 	foreach_output_n(__outinstr, __i, __ir)
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index e0a9d05fda3..58d515b1e22 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -70,13 +70,9 @@ create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask)
 	in->input.sysval = ~0;
 	__ssa_dst(in)->wrmask = compmask;
 
-	return in;
-}
+	array_insert(ctx->ir, ctx->ir->inputs, in);
 
-static struct ir3_instruction *
-create_input(struct ir3_context *ctx, unsigned n)
-{
-	return create_input_compmask(ctx, n, 0x1);
+	return in;
 }
 
 static struct ir3_instruction *
@@ -1198,21 +1194,17 @@ static void add_sysval_input_compmask(struct ir3_context *ctx,
 		struct ir3_instruction *instr)
 {
 	struct ir3_shader_variant *so = ctx->so;
-	unsigned r = regid(so->inputs_count, 0);
 	unsigned n = so->inputs_count++;
 
 	assert(instr->opc == OPC_META_INPUT);
+	instr->input.inidx = n;
 	instr->input.sysval = slot;
 
 	so->inputs[n].sysval = true;
 	so->inputs[n].slot = slot;
 	so->inputs[n].compmask = compmask;
-	so->inputs[n].regid = r;
 	so->inputs[n].interpolate = INTERP_MODE_FLAT;
 	so->total_in++;
-
-	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
-	ctx->ir->inputs[r] = instr;
 }
 
 static struct ir3_instruction *
@@ -1521,17 +1513,17 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 			idx += nir_src_as_uint(intr->src[0]);
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i + comp;
-				dst[i] = ctx->ir->inputs[n];
-				compile_assert(ctx, ctx->ir->inputs[n]);
+				dst[i] = ctx->inputs[n];
+				compile_assert(ctx, ctx->inputs[n]);
 			}
 		} else {
 			src = ir3_get_src(ctx, &intr->src[0]);
 			struct ir3_instruction *collect =
-					ir3_create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs);
+					ir3_create_collect(ctx, ctx->ir->inputs, ctx->ninputs);
 			struct ir3_instruction *addr = ir3_get_addr(ctx, src[0], 4);
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i + comp;
-				dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+				dst[i] = create_indirect_load(ctx, ctx->ninputs,
 						n, addr, collect);
 			}
 		}
@@ -1632,7 +1624,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 		src = ir3_get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i + comp;
-			ctx->ir->outputs[n] = src[i];
+			ctx->outputs[n] = src[i];
 		}
 		break;
 	case nir_intrinsic_load_base_vertex:
@@ -2715,15 +2707,27 @@ setup_input(struct ir3_context *ctx, nir_variable *in)
 				instr = create_frag_input(ctx, so->inputs[n].use_ldlv, idx);
 			}
 
-			compile_assert(ctx, idx < ctx->ir->ninputs);
+			compile_assert(ctx, idx < ctx->ninputs);
 
-			ctx->ir->inputs[idx] = instr;
+			ctx->inputs[idx] = instr;
 		}
 	} else if (ctx->so->type == MESA_SHADER_VERTEX) {
+		/* We shouldn't have fractional input for VS input.. that only shows
+		 * up with varying packing
+		 */
+		assert(frac == 0);
+
+		struct ir3_instruction *input = create_input_compmask(ctx, 0, (1 << ncomp) - 1);
+		struct ir3_instruction *components[ncomp];
+
+		input->input.inidx = n;
+
+		ir3_split_dest(ctx->block, components, input, 0, ncomp);
+
 		for (int i = 0; i < ncomp; i++) {
 			unsigned idx = (n * 4) + i + frac;
-			compile_assert(ctx, idx < ctx->ir->ninputs);
-			ctx->ir->inputs[idx] = create_input(ctx, idx);
+			compile_assert(ctx, idx < ctx->ninputs);
+			ctx->inputs[idx] = components[i];
 		}
 	} else {
 		ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
@@ -2904,8 +2908,8 @@ setup_output(struct ir3_context *ctx, nir_variable *out)
 
 	for (int i = 0; i < ncomp; i++) {
 		unsigned idx = (n * 4) + i + frac;
-		compile_assert(ctx, idx < ctx->ir->noutputs);
-		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
+		compile_assert(ctx, idx < ctx->noutputs);
+		ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
 	}
 
 	/* if varying packing doesn't happen, we could end up in a situation
@@ -2918,8 +2922,8 @@ setup_output(struct ir3_context *ctx, nir_variable *out)
 	 */
 	for (int i = 0; i < frac; i++) {
 		unsigned idx = (n * 4) + i;
-		if (!ctx->ir->outputs[idx]) {
-			ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
+		if (!ctx->outputs[idx]) {
+			ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
 		}
 	}
 }
@@ -2934,33 +2938,18 @@ max_drvloc(struct exec_list *vars)
 	return drvloc;
 }
 
-static const unsigned max_sysvals[] = {
-	[MESA_SHADER_VERTEX]  = 16,
-	[MESA_SHADER_TESS_CTRL] = 16,
-	[MESA_SHADER_TESS_EVAL] = 16,
-	[MESA_SHADER_GEOMETRY] = 16,
-	[MESA_SHADER_FRAGMENT] = 24,  // TODO
-	[MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need?
-	[MESA_SHADER_KERNEL]  = 16, // TODO how many do we actually need?
-};
-
 static void
 emit_instructions(struct ir3_context *ctx)
 {
-	unsigned ninputs, noutputs;
 	nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
 
-	ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
-	noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
+	ctx->ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
+	ctx->noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
 
-	/* we need to leave room for sysvals:
-	 */
-	ninputs += max_sysvals[ctx->so->type];
-	if (ctx->so->type == MESA_SHADER_VERTEX ||
-			ctx->so->type == MESA_SHADER_TESS_EVAL)
-		noutputs += 8; /* gs or tess header + primitive_id */
+	ctx->inputs  = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
+	ctx->outputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
 
-	ctx->ir = ir3_create(ctx->compiler, ctx->so->type, ninputs, noutputs);
+	ctx->ir = ir3_create(ctx->compiler, ctx->so->type);
 
 	/* Create inputs in first block: */
 	ctx->block = get_block(ctx, nir_start_block(fxn));
@@ -3039,40 +3028,6 @@ emit_instructions(struct ir3_context *ctx)
 		setup_output(ctx, var);
 	}
 
-	/* Set up the shared system values as outputs for the vertex and tess eval
-	 * shaders so they don't clobber them for the next shader in the pipeline.
-	 */
-	if (ctx->so->type == MESA_SHADER_VERTEX ||
-			(has_gs && ctx->so->type == MESA_SHADER_TESS_EVAL)) {
-		struct ir3_shader_variant *so = ctx->so;
-		if (ctx->primitive_id) {
-			unsigned n = so->outputs_count++;
-			so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
-			so->outputs[n].regid = regid(n, 0);
-			ctx->ir->outputs[n * 4] = ctx->primitive_id;
-
-			compile_assert(ctx, n * 4 < ctx->ir->noutputs);
-		}
-
-		if (ctx->gs_header) {
-			unsigned n = so->outputs_count++;
-			so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
-			so->outputs[n].regid = regid(n, 0);
-			ctx->ir->outputs[n * 4] = ctx->gs_header;
-
-			compile_assert(ctx, n * 4 < ctx->ir->noutputs);
-		}
-
-		if (ctx->tcs_header) {
-			unsigned n = so->outputs_count++;
-			so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
-			so->outputs[n].regid = regid(n, 0);
-			ctx->ir->outputs[n * 4] = ctx->tcs_header;
-
-			compile_assert(ctx, n * 4 < ctx->ir->noutputs);
-		}
-	}
-
 	/* Find # of samplers: */
 	nir_foreach_variable(var, &ctx->s->uniforms) {
 		ctx->so->num_samp += glsl_type_get_sampler_count(var->type);
@@ -3092,28 +3047,6 @@ emit_instructions(struct ir3_context *ctx)
 	emit_function(ctx, fxn);
 }
 
-/* from NIR perspective, we actually have varying inputs.  But the varying
- * inputs, from an IR standpoint, are just bary.f/ldlv instructions.  The
- * only actual inputs are the sysvals.
- */
-static void
-fixup_frag_inputs(struct ir3_context *ctx)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	struct ir3 *ir = ctx->ir;
-	unsigned i = 0;
-
-	/* sysvals should appear at the end of the inputs, drop everything else: */
-	while ((i < so->inputs_count) && !so->inputs[i].sysval)
-		i++;
-
-	/* at IR level, inputs are always blocks of 4 scalars: */
-	i *= 4;
-
-	ir->inputs = &ir->inputs[i];
-	ir->ninputs -= i;
-}
-
 /* Fixup tex sampler state for astc/srgb workaround instructions.  We
  * need to assign the tex state indexes for these after we know the
  * max tex index.
@@ -3155,23 +3088,44 @@ fixup_binning_pass(struct ir3_context *ctx)
 	struct ir3 *ir = ctx->ir;
 	unsigned i, j;
 
+	/* first pass, remove unused outputs from the IR level outputs: */
+	for (i = 0, j = 0; i < ir->outputs_count; i++) {
+		struct ir3_instruction *out = ir->outputs[i];
+		assert(out->opc == OPC_META_COLLECT);
+		unsigned outidx = out->collect.outidx;
+		unsigned slot = so->outputs[outidx].slot;
+
+		/* throw away everything but first position/psize */
+		if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
+			ir->outputs[j] = ir->outputs[i];
+			j++;
+		}
+	}
+	ir->outputs_count = j;
+
+	/* second pass, cleanup the unused slots in ir3_shader_variant::outputs
+	 * table:
+	 */
 	for (i = 0, j = 0; i < so->outputs_count; i++) {
 		unsigned slot = so->outputs[i].slot;
 
 		/* throw away everything but first position/psize */
 		if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
-			if (i != j) {
-				so->outputs[j] = so->outputs[i];
-				ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
-				ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
-				ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
-				ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
+			so->outputs[j] = so->outputs[i];
+
+			/* fixup outidx to point to new output table entry: */
+			struct ir3_instruction *out;
+			foreach_output(out, ir) {
+				if (out->collect.outidx == i) {
+					out->collect.outidx = j;
+					break;
+				}
 			}
+
 			j++;
 		}
 	}
 	so->outputs_count = j;
-	ir->noutputs = j * 4;
 }
 
 static void
@@ -3215,8 +3169,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 {
 	struct ir3_context *ctx;
 	struct ir3 *ir;
-	struct ir3_instruction **inputs;
-	unsigned i;
 	int ret = 0, max_bary;
 
 	assert(!so->ir);
@@ -3238,12 +3190,81 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	ir = so->ir = ctx->ir;
 
-	/* keep track of the inputs from TGSI perspective.. */
-	inputs = ir->inputs;
+	assert((ctx->noutputs % 4) == 0);
 
-	/* but fixup actual inputs for frag shader: */
-	if (so->type == MESA_SHADER_FRAGMENT)
-		fixup_frag_inputs(ctx);
+	/* Setup IR level outputs, which are "collects" that gather
+	 * the scalar components of outputs.
+	 */
+	for (unsigned i = 0; i < ctx->noutputs; i += 4) {
+		unsigned ncomp = 0;
+		/* figure out the # of components written:
+		 *
+		 * TODO do we need to handle holes, ie. if .x and .z
+		 * components written, but .y component not written?
+		 */
+		for (unsigned j = 0; j < 4; j++) {
+			if (!ctx->outputs[i + j])
+				break;
+			ncomp++;
+		}
+
+		/* Note that in some stages, like TCS, store_output is
+		 * lowered to memory writes, so no components of the
+		 * are "written" from the PoV of traditional store-
+		 * output instructions:
+		 */
+		if (!ncomp)
+			continue;
+
+		struct ir3_instruction *out =
+			ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
+
+		int outidx = i / 4;
+		assert(outidx < so->outputs_count);
+
+		/* stash index into so->outputs[] so we can map the
+		 * output back to slot/etc later:
+		 */
+		out->collect.outidx = outidx;
+
+		array_insert(ir, ir->outputs, out);
+	}
+
+	/* Set up the shared sysvals (primitive_id, gs/tcs header) as outputs of
+	 * the VS (or TES with GS) so they aren't clobbered for the next stage.
+	 *
+	 * TODO this could probably be done more cleanly in a nir pass.
+	 */
+	if (ctx->so->type == MESA_SHADER_VERTEX ||
+			(ctx->so->key.has_gs && ctx->so->type == MESA_SHADER_TESS_EVAL)) {
+		if (ctx->primitive_id) {
+			unsigned n = so->outputs_count++;
+			so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
+
+			struct ir3_instruction *out =
+				ir3_create_collect(ctx, &ctx->primitive_id, 1);
+			out->collect.outidx = n;
+			array_insert(ir, ir->outputs, out);
+		}
+
+		if (ctx->gs_header) {
+			unsigned n = so->outputs_count++;
+			so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
+			struct ir3_instruction *out =
+				ir3_create_collect(ctx, &ctx->gs_header, 1);
+			out->collect.outidx = n;
+			array_insert(ir, ir->outputs, out);
+		}
+
+		if (ctx->tcs_header) {
+			unsigned n = so->outputs_count++;
+			so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
+			struct ir3_instruction *out =
+				ir3_create_collect(ctx, &ctx->tcs_header, 1);
+			out->collect.outidx = n;
+			array_insert(ir, ir->outputs, out);
+		}
+	}
 
 	/* at this point, for binning pass, throw away unneeded outputs: */
 	if (so->binning_pass && (ctx->compiler->gpu_id < 600))
@@ -3267,8 +3288,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	 */
 	if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
 			so->type == MESA_SHADER_VERTEX) {
-		for (int i = 0; i < ir->ninputs; i++) {
-			struct ir3_instruction *in = ir->inputs[i];
+		for (int i = 0; i < ctx->ninputs; i++) {
+			struct ir3_instruction *in = ctx->inputs[i];
 
 			if (!in)
 				continue;
@@ -3287,20 +3308,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		}
 	}
 
-	/* Insert mov if there's same instruction for each output.
-	 * eg. dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow
-	 */
-	for (int i = ir->noutputs - 1; i >= 0; i--) {
-		if (!ir->outputs[i])
-			continue;
-		for (unsigned j = 0; j < i; j++) {
-			if (ir->outputs[i] == ir->outputs[j]) {
-				ir->outputs[i] =
-					ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32);
-			}
-		}
-	}
-
 	ir3_debug_print(ir, "BEFORE GROUPING");
 
 	ir3_sched_add_deps(ir);
@@ -3342,8 +3349,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 			so->binning_pass;
 
 	if (pre_assign_inputs) {
-		for (unsigned i = 0; i < ir->ninputs; i++) {
-			struct ir3_instruction *instr = ir->inputs[i];
+		for (unsigned i = 0; i < ctx->ninputs; i++) {
+			struct ir3_instruction *instr = ctx->inputs[i];
 
 			if (!instr)
 				continue;
@@ -3355,7 +3362,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 			instr->regs[0]->num = regid;
 		}
 
-		ret = ir3_ra(so, ir->inputs, ir->ninputs);
+		ret = ir3_ra(so, ctx->inputs, ctx->ninputs);
 	} else if (ctx->tcs_header) {
 		/* We need to have these values in the same registers between VS and TCS
 		 * since the VS chains to TCS and doesn't get the sysvals redelivered.
@@ -3406,48 +3413,36 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	if (so->type == MESA_SHADER_FRAGMENT)
 		pack_inlocs(ctx);
 
-	/* fixup input/outputs: */
-	for (i = 0; i < so->outputs_count; i++) {
-		/* sometimes we get outputs that don't write the .x coord, like:
-		 *
-		 *   decl_var shader_out INTERP_MODE_NONE float Color (VARYING_SLOT_VAR9.z, 1, 0)
-		 *
-		 * Presumably the result of varying packing and then eliminating
-		 * some unneeded varyings?  Just skip head to the first valid
-		 * component of the output.
-		 */
-		for (unsigned j = 0; j < 4; j++) {
-			struct ir3_instruction *instr = ir->outputs[(i*4) + j];
-			if (instr) {
-				so->outputs[i].regid = instr->regs[0]->num;
-				so->outputs[i].half  = !!(instr->regs[0]->flags & IR3_REG_HALF);
-				break;
-			}
-		}
-	}
+	/*
+	 * Fixup inputs/outputs to point to the actual registers assigned:
+	 *
+	 * 1) initialize to r63.x (invalid/unused)
+	 * 2) iterate IR level inputs/outputs and update the variants
+	 *    inputs/outputs table based on the assigned registers for
+	 *    the remaining inputs/outputs.
+	 */
 
-	/* Note that some or all channels of an input may be unused: */
-	for (i = 0; i < so->inputs_count; i++) {
-		unsigned j, reg = regid(63,0);
-		bool half = false;
-		for (j = 0; j < 4; j++) {
-			struct ir3_instruction *in = inputs[(i*4) + j];
+	for (unsigned i = 0; i < so->inputs_count; i++)
+		so->inputs[i].regid = regid(63, 0);
+	for (unsigned i = 0; i < so->outputs_count; i++)
+		so->outputs[i].regid = regid(63, 0);
 
-			if (!in)
-				continue;
+	struct ir3_instruction *out;
+	foreach_output(out, ir) {
+		assert(out->opc == OPC_META_COLLECT);
+		unsigned outidx = out->collect.outidx;
 
-			if (in->flags & IR3_INSTR_UNUSED)
-				continue;
+		so->outputs[outidx].regid = out->regs[0]->num;
+		so->outputs[outidx].half  = !!(out->regs[0]->flags & IR3_REG_HALF);
+	}
 
-			reg = in->regs[0]->num - j;
-			if (half) {
-				compile_assert(ctx, in->regs[0]->flags & IR3_REG_HALF);
-			} else {
-				half = !!(in->regs[0]->flags & IR3_REG_HALF);
-			}
-		}
-		so->inputs[i].regid = reg;
-		so->inputs[i].half  = half;
+	struct ir3_instruction *in;
+	foreach_input(in, ir) {
+		assert(in->opc == OPC_META_INPUT);
+		unsigned inidx = in->input.inidx;
+
+		so->inputs[inidx].regid = in->regs[0]->num;
+		so->inputs[inidx].half  = !!(in->regs[0]->flags & IR3_REG_HALF);
 	}
 
 	if (ctx->astc_srgb)
diff --git a/src/freedreno/ir3/ir3_context.h b/src/freedreno/ir3/ir3_context.h
index bb283a76326..1ce5c6776ca 100644
--- a/src/freedreno/ir3/ir3_context.h
+++ b/src/freedreno/ir3/ir3_context.h
@@ -52,6 +52,18 @@ struct ir3_context {
 	struct ir3 *ir;
 	struct ir3_shader_variant *so;
 
+	/* Tables of scalar inputs/outputs.  Because of the way varying packing
+	 * works, we could have inputs w/ fractional location, which is a bit
+	 * awkward to deal with unless we keep track of the split scalar in/
+	 * out components.
+	 *
+	 * These *only* have inputs/outputs that are touched by load_*input and
+	 * store_output.
+	 */
+	unsigned ninputs, noutputs;
+	struct ir3_instruction **inputs;
+	struct ir3_instruction **outputs;
+
 	struct ir3_block *block;      /* the current block */
 	struct ir3_block *in_block;   /* block created for shader inputs */
 
diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c
index a0b853ca159..f86397565f0 100644
--- a/src/freedreno/ir3/ir3_group.c
+++ b/src/freedreno/ir3/ir3_group.c
@@ -39,41 +39,6 @@ struct group_ops {
 	void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr);
 };
 
-static struct ir3_instruction *arr_get(void *arr, int idx)
-{
-	return ((struct ir3_instruction **)arr)[idx];
-}
-static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr)
-{
-	((struct ir3_instruction **)arr)[idx] =
-			ir3_MOV(instr->block, instr, TYPE_F32);
-}
-static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
-{
-	/* so, we can't insert a mov in front of a meta:in.. and the downstream
-	 * instruction already has a pointer to 'instr'.  So we cheat a bit and
-	 * morph the meta:in instruction into a mov and insert a new meta:in
-	 * in front.
-	 */
-	struct ir3_instruction *in;
-
-	debug_assert(instr->regs_count == 1);
-
-	in = ir3_instr_create(instr->block, OPC_META_INPUT);
-	in->input.sysval = instr->input.sysval;
-	__ssa_dst(in);
-
-	/* create src reg for meta:in and fixup to now be a mov: */
-	__ssa_src(instr, in, 0);
-	instr->opc = OPC_MOV;
-	instr->cat1.src_type = TYPE_F32;
-	instr->cat1.dst_type = TYPE_F32;
-
-	((struct ir3_instruction **)arr)[idx] = in;
-}
-static struct group_ops arr_ops_out = { arr_get, arr_insert_mov_out };
-static struct group_ops arr_ops_in = { arr_get, arr_insert_mov_in };
-
 static struct ir3_instruction *instr_get(void *arr, int idx)
 {
 	return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
@@ -192,61 +157,11 @@ instr_find_neighbors(struct ir3_instruction *instr)
 		instr_find_neighbors(src);
 }
 
-/* a bit of sadness.. we can't have "holes" in inputs from PoV of
- * register assignment, they still need to be grouped together.  So
- * we need to insert dummy/padding instruction for grouping, and
- * then take it back out again before anyone notices.
- */
-static void
-pad_and_group_input(struct ir3_instruction **input, unsigned n)
-{
-	int i, mask = 0;
-	struct ir3_block *block = NULL;
-
-	for (i = n - 1; i >= 0; i--) {
-		struct ir3_instruction *instr = input[i];
-		if (instr) {
-			block = instr->block;
-		} else if (block) {
-			instr = ir3_NOP(block);
-			__ssa_dst(instr);          /* dummy dst */
-			input[i] = instr;
-			mask |= (1 << i);
-		}
-	}
-
-	group_n(&arr_ops_in, input, n);
-
-	for (i = 0; i < n; i++) {
-		if (mask & (1 << i))
-			input[i] = NULL;
-	}
-}
-
 static void
 find_neighbors(struct ir3 *ir)
 {
 	unsigned i;
 
-	/* shader inputs/outputs themselves must be contiguous as well:
-	 *
-	 * NOTE: group inputs first, since we only insert mov's
-	 * *before* the conflicted instr (and that would go badly
-	 * for inputs).  By doing inputs first, we should never
-	 * have a conflict on inputs.. pushing any conflict to
-	 * resolve to the outputs, for stuff like:
-	 *
-	 *     MOV OUT[n], IN[m].wzyx
-	 *
-	 * NOTE: we assume here inputs/outputs are grouped in vec4.
-	 * This logic won't quite cut it if we don't align smaller
-	 * on vec4 boundaries
-	 */
-	for (i = 0; i < ir->ninputs; i += 4)
-		pad_and_group_input(&ir->inputs[i], 4);
-	for (i = 0; i < ir->noutputs; i += 4)
-		group_n(&arr_ops_out, &ir->outputs[i], 4);
-
 	struct ir3_instruction *out;
 	foreach_output(out, ir)
 		instr_find_neighbors(out);