panfrost/midgard: Set minimal swizzle on texture input
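
Texture ops previously hardcoded a full XYZW input swizzle. Mark the input
register as full and set a minimal XYXX in_reg_swizzle instead, so only the
xy coordinates are read (with bias/LOD routed through w via a register
select). The diff below also carries the supporting changes this sits on
top of: txb/txl support, the outmod_int -> outmod_int_wrap split,
implicit-zero unary ops (iabs/inot), a factored-out emit_varying_read with
component offsets, simplified vertex varying stores, dead-branch culling,
and .pos propagation.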
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
index c519193a56a98c52fc0b73b6552df48be55e75c2..4bddea40fdb1328b2deaf9e45a3b22b556f38efe 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
@@ -83,6 +83,9 @@ midgard_block_add_successor(midgard_block *block, midgard_block *successor)
 
 #define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
 #define SWIZZLE_XYZW SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W)
+#define SWIZZLE_XYXX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X)
+#define SWIZZLE_XXXX SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X)
+#define SWIZZLE_WWWW SWIZZLE(COMPONENT_W, COMPONENT_W, COMPONENT_W, COMPONENT_W)
 
 #define M_LOAD_STORE(name, rname, uname) \
        static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
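
For reference, a quick standalone sketch of what these selectors pack down to,
assuming the SWIZZLE macro defined earlier in this file places one 2-bit
component selector per destination lane (COMPONENT_X..COMPONENT_W being 0..3,
lane 0 in the low bits; the >> (2 * component) trick in emit_varying_read
below depends on that layout):

    /* Illustration only -- not part of the patch */
    #include <stdio.h>

    #define COMPONENT_X 0
    #define COMPONENT_Y 1
    #define COMPONENT_Z 2
    #define COMPONENT_W 3

    /* Assumed layout: lane i reads the component in bits [2i, 2i+1] */
    #define SWIZZLE(A, B, C, D) \
            (((A) << 0) | ((B) << 2) | ((C) << 4) | ((D) << 6))

    int main(void)
    {
            /* XYXX = 0x04: lanes z/w replicate x -- the "minimal
             * swizzle" this patch feeds to texture inputs */
            printf("XYXX = 0x%02x\n", SWIZZLE(COMPONENT_X, COMPONENT_Y,
                                              COMPONENT_X, COMPONENT_X));
            return 0;
    }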
@@ -351,9 +354,11 @@ optimise_nir(nir_shader *nir)
 
         NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
         NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
+        NIR_PASS(progress, nir, nir_lower_idiv);
 
         nir_lower_tex_options lower_tex_options = {
-                .lower_rect = true
+                .lower_rect = true,
+                .lower_txp = ~0
         };
 
         NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
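
A note on .lower_txp: it is a bitmask over sampler dims, so ~0 asks
nir_lower_tex to lower projective texturing for every sampler type, meaning
the backend never has to handle nir_texop_txp. Conceptually (a hedged sketch
of the lowering, not the literal NIR pass):

    /* textureProj(s, p)  ==>  texture(s, p.xy / p.w) */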
@@ -539,14 +544,14 @@ emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned co
                 .unit = for_branch ? UNIT_SMUL : UNIT_SADD,
 
                 .ssa_args = {
-
                         .src0 = condition,
                         .src1 = condition,
                         .dest = SSA_FIXED_REGISTER(31),
                 },
+
                 .alu = {
                         .op = midgard_alu_op_iand,
-                        .outmod = midgard_outmod_int,
+                        .outmod = midgard_outmod_int_wrap,
                         .reg_mode = midgard_reg_mode_32,
                         .dest_override = midgard_dest_override_none,
                         .mask = (0x3 << 6), /* w */
@@ -585,7 +590,7 @@ emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp)
                 },
                 .alu = {
                         .op = midgard_alu_op_iand,
-                        .outmod = midgard_outmod_int,
+                        .outmod = midgard_outmod_int_wrap,
                         .reg_mode = midgard_reg_mode_32,
                         .dest_override = midgard_dest_override_none,
                         .mask = expand_writemask((1 << nr_comp) - 1),
@@ -616,7 +621,7 @@ emit_indirect_offset(compiler_context *ctx, nir_src *src)
                 },
                 .alu = {
                         .op = midgard_alu_op_imov,
-                        .outmod = midgard_outmod_int,
+                        .outmod = midgard_outmod_int_wrap,
                         .reg_mode = midgard_reg_mode_32,
                         .dest_override = midgard_dest_override_none,
                         .mask = (0x3 << 6), /* w */
@@ -682,7 +687,10 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
                 ALU_CASE(iadd, iadd);
                 ALU_CASE(isub, isub);
                 ALU_CASE(imul, imul);
-                ALU_CASE(iabs, iabs);
+
+                /* Zero shoved as second-arg */
+                ALU_CASE(iabs, iabsdiff);
+
                 ALU_CASE(mov, imov);
 
                 ALU_CASE(feq32, feq);
@@ -727,10 +735,11 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
                 ALU_CASE(fsin, fsin);
                 ALU_CASE(fcos, fcos);
 
+                /* Second op implicit #0 */
+                ALU_CASE(inot, inor);
                 ALU_CASE(iand, iand);
                 ALU_CASE(ior, ior);
                 ALU_CASE(ixor, ixor);
-                ALU_CASE(inot, inand);
                 ALU_CASE(ishl, ishl);
                 ALU_CASE(ishr, iasr);
                 ALU_CASE(ushr, ilsr);
@@ -816,12 +825,14 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
         }
 
         /* Midgard can perform certain modifiers on output of an ALU op */
-        midgard_outmod outmod =
-                midgard_is_integer_out_op(op) ? midgard_outmod_int :
-                instr->dest.saturate ? midgard_outmod_sat : midgard_outmod_none;
+        unsigned outmod;
 
-        if (instr->op == nir_op_fsat)
-                outmod = midgard_outmod_sat;
+        if (midgard_is_integer_out_op(op)) {
+                outmod = midgard_outmod_int_wrap;
+        } else {
+                bool sat = instr->dest.saturate || instr->op == nir_op_fsat;
+                outmod = sat ? midgard_outmod_sat : midgard_outmod_none;
+        }
 
         /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */
 
@@ -927,7 +938,8 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
                 }
 
                 ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx);
-        } else if (instr->op == nir_op_f2b32 || instr->op == nir_op_i2b32) {
+        } else if (nr_inputs == 1 && !quirk_flipped_r24) {
+                /* Lots of instructions need a 0 plonked in */
                 ins.ssa_args.inline_constant = false;
                 ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
                 ins.has_constants = true;
@@ -998,6 +1010,42 @@ emit_uniform_read(compiler_context *ctx, unsigned dest, unsigned offset, nir_src
         }
 }
 
+static void
+emit_varying_read(
+                compiler_context *ctx,
+                unsigned dest, unsigned offset,
+                unsigned nr_comp, unsigned component,
+                nir_src *indirect_offset)
+{
+        /* XXX: Half-floats? */
+        /* TODO: swizzle, mask */
+
+        midgard_instruction ins = m_ld_vary_32(dest, offset);
+        ins.load_store.mask = (1 << nr_comp) - 1;
+        ins.load_store.swizzle = SWIZZLE_XYZW >> (2 * component);
+
+        midgard_varying_parameter p = {
+                .is_varying = 1,
+                .interpolation = midgard_interp_default,
+                .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0
+        };
+
+        unsigned u;
+        memcpy(&u, &p, sizeof(p));
+        ins.load_store.varying_parameters = u;
+
+        if (indirect_offset) {
+                /* We need to add in the dynamic index, moved to r27.w */
+                emit_indirect_offset(ctx, indirect_offset);
+                ins.load_store.unknown = 0x79e; /* xxx: what is this? */
+        } else {
+                /* Just a direct load */
+                ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */
+        }
+
+        emit_mir_instruction(ctx, ins);
+}
+
 static void
 emit_sysval_read(compiler_context *ctx, nir_intrinsic_instr *instr)
 {
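
Why the >> (2 * component) in emit_varying_read works, under the
2-bits-per-lane encoding sketched earlier: shifting drops the low selectors,
sliding every remaining one down a lane, so a varying read at component 1
starts from Y:

    /* SWIZZLE_XYZW      = 0b11100100 (0xE4)
     * SWIZZLE_XYZW >> 2 = 0b00111001 (0x39) -> Y, Z, W in lanes 0-2;
     * lane 3 decodes as X again, but mask = (1 << nr_comp) - 1 keeps
     * the extra lanes from being written */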
@@ -1120,38 +1168,15 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                         offset += nir_src_as_uint(instr->src[0]);
                 }
 
+                /* We may need to apply a sub-vec4 component offset */
+                int component = instr->intrinsic == nir_intrinsic_load_input ?
+                        nir_intrinsic_component(instr) : 0;
                 reg = nir_dest_index(ctx, &instr->dest);
 
                 if (instr->intrinsic == nir_intrinsic_load_uniform && !ctx->is_blend) {
                         emit_uniform_read(ctx, reg, ctx->sysval_count + offset, !direct ? &instr->src[0] : NULL);
                 } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) {
-                        /* XXX: Half-floats? */
-                        /* TODO: swizzle, mask */
-
-                        midgard_instruction ins = m_ld_vary_32(reg, offset);
-                        ins.load_store.mask = (1 << nr_comp) - 1;
-
-                        midgard_varying_parameter p = {
-                                .is_varying = 1,
-                                .interpolation = midgard_interp_default,
-                                .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0
-                        };
-
-                        unsigned u;
-                        memcpy(&u, &p, sizeof(p));
-                        ins.load_store.varying_parameters = u;
-
-                        if (direct) {
-                                /* We have the offset totally ready */
-                                ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */
-                        } else {
-                                /* We have it partially ready, but we need to
-                                 * add in the dynamic index, moved to r27.w */
-                                emit_indirect_offset(ctx, &instr->src[0]);
-                                ins.load_store.unknown = 0x79e; /* xxx: what is this? */
-                        }
-
-                        emit_mir_instruction(ctx, ins);
+                        emit_varying_read(ctx, reg, offset, nr_comp, component, !direct ? &instr->src[0] : NULL);
                 } else if (ctx->is_blend) {
                         /* For blend shaders, load the input color, which is
                          * preloaded to r0 */
@@ -1221,44 +1246,23 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                         ctx->fragment_output = reg;
                 } else if (ctx->stage == MESA_SHADER_VERTEX) {
                         /* Varyings are written into one of two special
-                         * varying register, r26 or r27. The register itself is selected as the register
-                         * in the st_vary instruction, minus the base of 26. E.g. write into r27 and then call st_vary(1)
-                         *
-                         * Normally emitting fmov's is frowned upon,
-                         * but due to unique constraints of
-                         * REGISTER_VARYING, fmov emission + a
-                         * dedicated cleanup pass is the only way to
-                         * guarantee correctness when considering some
-                         * (common) edge cases XXX: FIXME */
-
-                        /* If this varying corresponds to a constant (why?!),
-                         * emit that now since it won't get picked up by
-                         * hoisting (since there is no corresponding move
-                         * emitted otherwise) */
-
-                        void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, reg + 1);
-
-                        if (constant_value) {
-                                /* Special case: emit the varying write
-                                 * directly to r26 (looks funny in asm but it's
-                                 * fine) and emit the store _now_. Possibly
-                                 * slightly slower, but this is a really stupid
-                                 * special case anyway (why on earth would you
-                                 * have a constant varying? Your own fault for
-                                 * slightly worse perf :P) */
-
-                                midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(26));
-                                attach_constants(ctx, &ins, constant_value, reg + 1);
-                                emit_mir_instruction(ctx, ins);
+                         * varying registers, r26 or r27. The register itself is
+                         * selected as the register in the st_vary instruction,
+                         * minus the base of 26. E.g. write into r27 and then
+                         * call st_vary(1) */
 
-                                midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset);
-                                st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
-                                emit_mir_instruction(ctx, st);
-                        } else {
-                                /* Do not emit the varying yet -- instead, just mark down that we need to later */
+                        midgard_instruction ins = v_fmov(reg, blank_alu_src, SSA_FIXED_REGISTER(26));
+                        emit_mir_instruction(ctx, ins);
 
-                                _mesa_hash_table_u64_insert(ctx->ssa_varyings, reg + 1, (void *) ((uintptr_t) (offset + 1)));
-                        }
+                        /* We should have been vectorized. That also lets us
+                         * ignore the mask, because the mask component on
+                         * st_vary is (as far as I can tell) ignored [the blob
+                         * sets it to zero] */
+                        assert(nir_intrinsic_component(instr) == 0);
+
+                        midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset);
+                        st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
+                        emit_mir_instruction(ctx, st);
                 } else {
                         DBG("Unknown store\n");
                         assert(0);
@@ -1309,13 +1313,26 @@ midgard_tex_format(enum glsl_sampler_dim dim)
         }
 }
 
+static unsigned
+midgard_tex_op(nir_texop op)
+{
+        switch (op) {
+                case nir_texop_tex:
+                case nir_texop_txb:
+                        return TEXTURE_OP_NORMAL;
+                case nir_texop_txl:
+                        return TEXTURE_OP_LOD;
+                default:
+                        unreachable("Unhanlded texture op");
+        }
+}
+
 static void
 emit_tex(compiler_context *ctx, nir_tex_instr *instr)
 {
         /* TODO */
         //assert (!instr->sampler);
         //assert (!instr->texture_array_size);
-        assert (instr->op == nir_texop_tex);
 
         /* Allocate registers via a round robin scheme to alternate between the two registers */
         int reg = ctx->texture_op_count & 1;
@@ -1330,18 +1347,17 @@ emit_tex(compiler_context *ctx, nir_tex_instr *instr)
         int sampler_index = texture_index;
 
         for (unsigned i = 0; i < instr->num_srcs; ++i) {
+                int reg = SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg);
+                int index = nir_src_index(ctx, &instr->src[i].src);
+                midgard_vector_alu_src alu_src = blank_alu_src;
+
                 switch (instr->src[i].src_type) {
                 case nir_tex_src_coord: {
-                        int index = nir_src_index(ctx, &instr->src[i].src);
-
-                        midgard_vector_alu_src alu_src = blank_alu_src;
-
-                        int reg = SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg);
-
                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                                 /* For cubemaps, we need to load coords into
                                  * special r27, and then use a special ld/st op
-                                 * to copy into the texture register */
+                                 * to select the face and copy the xy into the
+                                 * texture register */
 
                                 alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X);
 
@@ -1350,7 +1366,7 @@ emit_tex(compiler_context *ctx, nir_tex_instr *instr)
 
                                 midgard_instruction st = m_st_cubemap_coords(reg, 0);
                                 st.load_store.unknown = 0x24; /* XXX: What is this? */
-                                st.load_store.mask = 0x3; /* xy? */
+                                st.load_store.mask = 0x3; /* xy */
                                 st.load_store.swizzle = alu_src.swizzle;
                                 emit_mir_instruction(ctx, st);
 
@@ -1358,12 +1374,26 @@ emit_tex(compiler_context *ctx, nir_tex_instr *instr)
                                 alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X);
 
                                 midgard_instruction ins = v_fmov(index, alu_src, reg);
+                                ins.alu.mask = expand_writemask(0x3); /* xy */
                                 emit_mir_instruction(ctx, ins);
                         }
 
                         break;
                 }
 
+                case nir_tex_src_bias:
+                case nir_tex_src_lod: {
+                        /* To keep RA simple, we put the bias/LOD into the w
+                         * component of the input source, which is otherwise in xy */
+
+                        alu_src.swizzle = SWIZZLE_XXXX;
+
+                        midgard_instruction ins = v_fmov(index, alu_src, reg);
+                        ins.alu.mask = expand_writemask(1 << COMPONENT_W);
+                        emit_mir_instruction(ctx, ins);
+                        break;
+                }
+
                 default: {
                         DBG("Unknown source type\n");
                         //assert(0);
@@ -1376,26 +1406,22 @@ emit_tex(compiler_context *ctx, nir_tex_instr *instr)
         midgard_instruction ins = {
                 .type = TAG_TEXTURE_4,
                 .texture = {
-                        .op = TEXTURE_OP_NORMAL,
+                        .op = midgard_tex_op(instr->op),
                         .format = midgard_tex_format(instr->sampler_dim),
                         .texture_handle = texture_index,
                         .sampler_handle = sampler_index,
 
-                        /* TODO: Don't force xyzw */
-                        .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+                        /* TODO: Regalloc it in */
+                        .swizzle = SWIZZLE_XYZW,
                         .mask = 0xF,
 
                         /* TODO: half */
-                        //.in_reg_full = 1,
+                        .in_reg_full = 1,
+                        .in_reg_swizzle = SWIZZLE_XYXX,
                         .out_full = 1,
 
-                        .filter = 1,
-
                         /* Always 1 */
                         .unknown7 = 1,
-
-                        /* Assume we can continue; hint it out later */
-                        .cont = 1,
                 }
         };
 
@@ -1403,15 +1429,24 @@ emit_tex(compiler_context *ctx, nir_tex_instr *instr)
         ins.texture.in_reg_select = in_reg;
         ins.texture.out_reg_select = out_reg;
 
-        /* TODO: Dynamic swizzle input selection, half-swizzles? */
-        if (instr->sampler_dim == GLSL_SAMPLER_DIM_3D) {
-                ins.texture.in_reg_swizzle_right = COMPONENT_X;
-                ins.texture.in_reg_swizzle_left = COMPONENT_Y;
-                //ins.texture.in_reg_swizzle_third = COMPONENT_Z;
-        } else {
-                ins.texture.in_reg_swizzle_left = COMPONENT_X;
-                ins.texture.in_reg_swizzle_right = COMPONENT_Y;
-                //ins.texture.in_reg_swizzle_third = COMPONENT_X;
+        /* Set up bias/LOD if necessary. Only register mode is supported right
+         * now. TODO: Immediate mode for performance gains */
+
+        if (instr->op == nir_texop_txb || instr->op == nir_texop_txl) {
+                ins.texture.lod_register = true;
+
+                midgard_tex_register_select sel = {
+                        .select = in_reg,
+                        .full = 1,
+
+                        /* w */
+                        .component_lo = 1,
+                        .component_hi = 1
+                };
+
+                uint8_t packed;
+                memcpy(&packed, &sel, sizeof(packed));
+                ins.texture.bias = packed;
         }
 
         emit_mir_instruction(ctx, ins);
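
On the register select above: the /* w */ annotation suggests component_hi
and component_lo concatenate into a 2-bit lane index, so (1 << 1) | 1 = 3 =
COMPONENT_W, with .full = 1 selecting 32-bit lanes (an assumption read off
this usage; the authoritative layout is the midgard_tex_register_select
definition in midgard.h). That lines up with the v_fmov earlier, which
parked the bias/LOD in w via expand_writemask(1 << COMPONENT_W).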
@@ -1683,6 +1718,10 @@ embedded_to_inline_constant(compiler_context *ctx)
 static void
 map_ssa_to_alias(compiler_context *ctx, int *ref)
 {
+        /* Sign is used quite deliberately to mark unused sources */
+        if (*ref < 0)
+                return;
+
         unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1);
 
         if (alias) {
@@ -1721,6 +1760,35 @@ midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block)
         return progress;
 }
 
+/* Dead code elimination for branches at the end of a block - only one branch
+ * per block is legal semantically */
+
+static void
+midgard_opt_cull_dead_branch(compiler_context *ctx, midgard_block *block)
+{
+        bool branched = false;
+
+        mir_foreach_instr_in_block_safe(block, ins) {
+                if (!midgard_is_branch_unit(ins->unit)) continue;
+
+                /* We ignore prepacked branches since the fragment epilogue is
+                 * just generally special */
+                if (ins->prepacked_branch) continue;
+
+                /* Discards are similarly special and may not correspond to the
+                 * end of a block */
+
+                if (ins->branch.target_type == TARGET_DISCARD) continue;
+
+                if (branched) {
+                        /* We already branched, so this is dead */
+                        mir_remove_instruction(ins);
+                }
+
+                branched = true;
+        }
+}
+
 static bool
 mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
 {
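
What midgard_opt_cull_dead_branch cleans up, in pseudo-MIR (hypothetical
syntax, for illustration):

    /* br.cond  block_3   <- first branch: the block ends here
     * br       block_5   <- left over from nested control flow;
     *                       unreachable, so the pass removes it */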
@@ -1736,6 +1804,30 @@ mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
         return false;
 }
 
+static bool
+mir_nontrivial_source2_mod(midgard_instruction *ins)
+{
+        unsigned mask = squeeze_writemask(ins->alu.mask);
+        bool is_int = midgard_is_integer_op(ins->alu.op);
+
+        midgard_vector_alu_src src2 =
+                vector_alu_from_unsigned(ins->alu.src2);
+
+        return mir_nontrivial_mod(src2, is_int, mask);
+}
+
+static bool
+mir_nontrivial_outmod(midgard_instruction *ins)
+{
+        bool is_int = midgard_is_integer_op(ins->alu.op);
+        unsigned mod = ins->alu.outmod;
+
+        if (is_int)
+                return mod != midgard_outmod_int_wrap;
+        else
+                return mod != midgard_outmod_none;
+}
+
 static bool
 midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
 {
@@ -1759,15 +1851,8 @@ midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
                 if (ins->ssa_args.inline_constant) continue;
                 if (ins->has_constants) continue;
 
-                /* Also, if the move has side effects, we're helpless */
-
-                midgard_vector_alu_src src =
-                        vector_alu_from_unsigned(ins->alu.src2);
-                unsigned mask = squeeze_writemask(ins->alu.mask);
-                bool is_int = midgard_is_integer_op(ins->alu.op);
-
-                if (mir_nontrivial_mod(src, is_int, mask)) continue;
-                if (ins->alu.outmod != midgard_outmod_none) continue;
+                if (mir_nontrivial_source2_mod(ins)) continue;
+                if (mir_nontrivial_outmod(ins)) continue;
 
                 /* We're clear -- rewrite */
                 mir_rewrite_index_src(ctx, to, from);
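
The two predicates split out above encode when a move is a pure copy;
roughly (pseudo-MIR, hypothetical syntax):

    /* fmov      b, a    -> propagatable: users of b can read a
     * fmov.sat  b, a    -> nontrivial outmod, keep the move
     * fmov      b, -a   -> nontrivial source modifier, keep the move
     * For integer moves the "trivial" outmod is int_wrap, not none,
     * which is what mir_nontrivial_outmod now accounts for. */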
@@ -1778,6 +1863,68 @@ midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
         return progress;
 }
 
+/* fmov.pos is an idiom for fpos. Propagate the .pos up to the source, so that
+ * the move can then be propagated away entirely */
+
+static bool
+mir_compose_float_outmod(midgard_outmod_float *outmod, midgard_outmod_float comp)
+{
+        /* Nothing to do */
+        if (comp == midgard_outmod_none)
+                return true;
+
+        if (*outmod == midgard_outmod_none) {
+                *outmod = comp;
+                return true;
+        }
+
+        /* TODO: Compose rules */
+        return false;
+}
+
+static bool
+midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block)
+{
+        bool progress = false;
+
+        mir_foreach_instr_in_block_safe(block, ins) {
+                if (ins->type != TAG_ALU_4) continue;
+                if (ins->alu.op != midgard_alu_op_fmov) continue;
+                if (ins->alu.outmod != midgard_outmod_pos) continue;
+
+                /* TODO: Registers? */
+                unsigned src = ins->ssa_args.src1;
+                if (src >= ctx->func->impl->ssa_alloc) continue;
+                assert(!mir_has_multiple_writes(ctx, src));
+
+                /* There might be a source modifier, too */
+                if (mir_nontrivial_source2_mod(ins)) continue;
+
+                /* Backpropagate the modifier */
+                mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) {
+                        if (v->type != TAG_ALU_4) continue;
+                        if (v->ssa_args.dest != src) continue;
+
+                        /* Can we even take a float outmod? */
+                        if (midgard_is_integer_out_op(v->alu.op)) continue;
+
+                        midgard_outmod_float temp = v->alu.outmod;
+                        progress |= mir_compose_float_outmod(&temp, ins->alu.outmod);
+
+                        /* Throw in the towel.. */
+                        if (!progress) break;
+
+                        /* Otherwise, transfer the modifier */
+                        v->alu.outmod = temp;
+                        ins->alu.outmod = midgard_outmod_none;
+
+                        break;
+                }
+        }
+
+        return progress;
+}
+
 static bool
 midgard_opt_copy_prop_tex(compiler_context *ctx, midgard_block *block)
 {
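
The transformation in midgard_opt_pos_propagate, sketched in pseudo-MIR
(hypothetical syntax):

    /* Before:   fadd      a, x, y
     *           fmov.pos  b, a
     * After:    fadd.pos  a, x, y
     *           fmov      b, a   <- now trivial, so the copyprop pass
     *                               above can eliminate it */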
@@ -1878,40 +2025,6 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
         }
 }
 
-/* Emit varying stores late */
-
-static void
-midgard_emit_store(compiler_context *ctx, midgard_block *block) {
-        /* Iterate in reverse to get the final write, rather than the first */
-
-        mir_foreach_instr_in_block_safe_rev(block, ins) {
-                /* Check if what we just wrote needs a store */
-                int idx = ins->ssa_args.dest;
-                uintptr_t varying = ((uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_varyings, idx + 1));
-
-                if (!varying) continue;
-
-                varying -= 1;
-
-                /* We need to store to the appropriate varying, so emit the
-                 * move/store */
-
-                /* TODO: Integrate with special purpose RA (and scheduler?) */
-                bool high_varying_register = false;
-
-                midgard_instruction mov = v_fmov(idx, blank_alu_src, SSA_FIXED_REGISTER(REGISTER_VARYING_BASE + high_varying_register));
-
-                midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(high_varying_register), varying);
-                st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
-
-                mir_insert_instruction_before(mir_next_op(ins), st);
-                mir_insert_instruction_before(mir_next_op(ins), mov);
-
-                /* We no longer need to store this varying */
-                _mesa_hash_table_u64_remove(ctx->ssa_varyings, idx + 1);
-        }
-}
-
 /* If there are leftovers after the below pass, emit actual fmov
  * instructions for the slow-but-correct path */
 
@@ -2030,7 +2143,7 @@ emit_blend_epilogue(compiler_context *ctx)
                         .op = midgard_alu_op_imov,
                         .reg_mode = midgard_reg_mode_8,
                         .dest_override = midgard_dest_override_none,
-                        .outmod = midgard_outmod_int,
+                        .outmod = midgard_outmod_int_wrap,
                         .mask = 0xFF,
                         .src1 = vector_alu_srco_unsigned(blank_alu_src),
                         .src2 = vector_alu_srco_unsigned(blank_alu_src),
@@ -2077,7 +2190,6 @@ emit_block(compiler_context *ctx, nir_block *block)
         /* Perform heavylifting for aliasing */
         actualise_ssa_to_alias(ctx);
 
-        midgard_emit_store(ctx, this_block);
         midgard_pair_load_store(ctx, this_block);
 
         /* Append fragment shader epilogue (value writeout) */
@@ -2287,16 +2399,9 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
         /* TODO: Decide this at runtime */
         ctx->uniform_cutoff = 8;
 
-        /* Assign var locations early, so the epilogue can use them if necessary */
-
-        nir_assign_var_locations(&nir->outputs, &nir->num_outputs, glsl_type_size);
-        nir_assign_var_locations(&nir->inputs, &nir->num_inputs, glsl_type_size);
-        nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, glsl_type_size);
-
         /* Initialize at a global (not block) level hash tables */
 
         ctx->ssa_constants = _mesa_hash_table_u64_create(NULL);
-        ctx->ssa_varyings = _mesa_hash_table_u64_create(NULL);
         ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL);
         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
         ctx->sysval_to_id = _mesa_hash_table_u64_create(NULL);
@@ -2307,16 +2412,22 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
         struct exec_list *varyings =
                 ctx->stage == MESA_SHADER_VERTEX ? &nir->outputs : &nir->inputs;
 
+        unsigned max_varying = 0;
         nir_foreach_variable(var, varyings) {
                 unsigned loc = var->data.driver_location;
                 unsigned sz = glsl_type_size(var->type, FALSE);
 
-                for (int c = 0; c < sz; ++c) {
-                        program->varyings[loc + c] = var->data.location;
+                for (int c = loc; c < (loc + sz); ++c) {
+                        program->varyings[c] = var->data.location;
+                        max_varying = MAX2(max_varying, c);
                 }
         }
 
-        /* Lower gl_Position pre-optimisation */
+        /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
+         * (so we don't accidentally duplicate the epilogue since mesa/st has
+         * messed with our I/O quite a bit already) */
+
+        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
 
         if (ctx->stage == MESA_SHADER_VERTEX)
                 NIR_PASS_V(nir, nir_lower_viewport_transform);
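
A worked example for the max_varying loop above, assuming glsl_type_size
counts vec4 slots: a mat2 varying (sz = 2) at driver_location 1 fills
program->varyings[1] and [2], so max_varying ends up 2 and the
varying_count computed in the next hunk becomes 3, independent of
nir->num_outputs, which the old code relied on.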
@@ -2349,7 +2460,7 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
         memcpy(program->sysvals, ctx->sysvals, sizeof(ctx->sysvals[0]) * ctx->sysval_count);
 
         program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0;
-        program->varying_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_outputs : ((ctx->stage == MESA_SHADER_FRAGMENT) ? nir->num_inputs : 0);
+        program->varying_count = max_varying + 1; /* Fencepost off-by-one */
 
         nir_foreach_function(func, nir) {
                 if (!func->impl)
@@ -2375,12 +2486,20 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
                 progress = false;
 
                 mir_foreach_block(ctx, block) {
+                        progress |= midgard_opt_pos_propagate(ctx, block);
                         progress |= midgard_opt_copy_prop(ctx, block);
                         progress |= midgard_opt_copy_prop_tex(ctx, block);
                         progress |= midgard_opt_dead_code_eliminate(ctx, block);
                 }
         } while (progress);
 
+        /* Nested control-flow can result in dead branches at the end of the
+         * block. This messes with our analysis and is just dead code, so cull
+         * them */
+        mir_foreach_block(ctx, block) {
+                midgard_opt_cull_dead_branch(ctx, block);
+        }
+
         /* Schedule! */
         schedule_program(ctx);
 
@@ -2536,7 +2655,7 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
          * last is an ALU, then it's also 1... */
 
         mir_foreach_block(ctx, block) {
-                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
+                mir_foreach_bundle_in_block(block, bundle) {
                         int lookahead = 1;
 
                         if (current_bundle + 1 < bundle_count) {