panfrost/midgard: Refactor schedule/emit pipeline
author Alyssa Rosenzweig <alyssa@rosenzweig.io>
Wed, 22 May 2019 04:33:21 +0000 (04:33 +0000)
committer Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Tue, 4 Jun 2019 20:14:50 +0000 (20:14 +0000)
First, this moves the scheduler and emitter out of midgard_compile.c
into their own dedicated files.

More interestingly, this slims down midgard_bundle to be essentially an
array of _pointers_ to midgard_instructions (plus some bundling
metadata), rather than the instructions and packing themselves. The
difference is critical, as it means that (within reason, i.e. as long as
it doesn't affect the schedule) midgard_instructions can now be modified
_after_ scheduling, with the changes reflected in the final binary.
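
As a sketch of the difference (hypothetical, simplified shape; the real
definitions are in midgard/compiler.h):

        /* Assume midgard_instruction is the MIR instruction type from
         * compiler.h. Before, a bundle embedded copies:
         *
         *         midgard_instruction instructions[5];
         *
         * so mutating the MIR instruction after scheduling never
         * reached the copy packed in the bundle. After, a bundle holds
         * pointers into the MIR list, and the emitter dereferences the
         * same objects that later passes mutate: */

        typedef struct midgard_bundle {
                int instruction_count;
                midgard_instruction *instructions[5];

                /* ...plus bundling metadata: tag, control, padding,
                 * embedded constants... */
        } midgard_bundle;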

On a more philosophical level, this removes an IR. Previously, the IR
before scheduling (MIR) was separate from the IR after scheduling
(post-schedule MIR), requiring a separate set of utilities to traverse,
using different idioms. There was no good reason for this, and it
restricted our flexibility with the RA. So unify all the things!

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Ryan Houdek <Sonicadvance1@gmail.com>
src/gallium/drivers/panfrost/meson.build
src/gallium/drivers/panfrost/midgard/compiler.h
src/gallium/drivers/panfrost/midgard/helpers.h
src/gallium/drivers/panfrost/midgard/midgard_compile.c
src/gallium/drivers/panfrost/midgard/midgard_emit.c [new file with mode: 0644]
src/gallium/drivers/panfrost/midgard/midgard_schedule.c [new file with mode: 0644]

index fb92954854a1c7520d8a351bc9e47c06d9a90aed..5adf24282c4c483ade40a6389de73c4699872d45 100644
@@ -28,6 +28,8 @@ files_panfrost = files(
 
   'midgard/midgard_compile.c',
   'midgard/midgard_print.c',
+  'midgard/midgard_schedule.c',
+  'midgard/midgard_emit.c',
   'midgard/midgard_ra.c',
   'midgard/midgard_liveness.c',
   'midgard/midgard_ops.c',
index d3d64d37c491c30c9e3afb8ac64122845eeb6f4a..96760d964b0c5092cef39d3cf756b7851ab65332 100644
@@ -169,7 +169,7 @@ typedef struct midgard_bundle {
 
         /* Instructions contained by the bundle */
         int instruction_count;
-        midgard_instruction instructions[5];
+        midgard_instruction *instructions[5];
 
         /* Bundle-wide ALU configuration */
         int padding;
@@ -177,13 +177,6 @@ typedef struct midgard_bundle {
         bool has_embedded_constants;
         float constants[4];
         bool has_blend_constant;
-
-        uint16_t register_words[8];
-        int register_words_count;
-
-        uint64_t body_words[8];
-        size_t body_size[8];
-        int body_words_count;
 } midgard_bundle;
 
 typedef struct compiler_context {
@@ -422,4 +415,11 @@ struct ra_graph* allocate_registers(compiler_context *ctx);
 void install_registers(compiler_context *ctx, struct ra_graph *g);
 bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src);
 
+/* Final emission */
+
+void emit_binary_bundle(
+                compiler_context *ctx,
+                midgard_bundle *bundle,
+                struct util_dynarray *emission,
+                int next_tag);
 #endif
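
A hedged sketch of how the newly exported entry point is driven (the
real emission loop lives in midgard_compile.c; lookahead_tag() here is
a hypothetical stand-in for the actual next-tag computation):

        /* Sketch: walk a block's scheduled bundles and emit each one,
         * passing the following bundle's tag so the hardware prefetch
         * field can be filled in */
        static void
        emit_block_sketch(compiler_context *ctx, midgard_block *block,
                          struct util_dynarray *emission, int next_block_tag)
        {
                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
                        int next_tag = lookahead_tag(block, bundle, next_block_tag);
                        emit_binary_bundle(ctx, bundle, emission, next_tag);
                }
        }
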
index cf3a63e7587c35a059754925c25597da0140710e..9adc5b35195fb805eaa3bd86ce4f997c289e4251 100644
@@ -22,6 +22,7 @@
 #ifndef __MDG_HELPERS_H
 #define __MDG_HELPERS_H
 
+#include "util/macros.h"
 #include <string.h>
 
 #define OP_IS_STORE_VARY(op) (\
 #define TAG_ALU_12 0xA
 #define TAG_ALU_16 0xB
 
+static inline int
+quadword_size(int tag)
+{
+        switch (tag) {
+        case TAG_ALU_4:
+        case TAG_LOAD_STORE_4:
+        case TAG_TEXTURE_4:
+                return 1;
+        case TAG_ALU_8:
+                return 2;
+        case TAG_ALU_12:
+                return 3;
+        case TAG_ALU_16:
+                return 4;
+        default:
+                unreachable("Unknown tag");
+        }
+}
+
 /* Special register aliases */
 
 #define MAX_WORK_REGISTERS 16
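
Since a bundle's tag encodes its size in 16-byte quadwords, block sizes
fall out by summing quadword_size() over the scheduled bundles -- a
minimal sketch of the idiom, mirroring what schedule_block does in
midgard_schedule.c below:

        /* Sketch: total size of a scheduled block, in quadwords */
        static unsigned
        block_quadwords(midgard_block *block)
        {
                unsigned count = 0;

                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle)
                        count += quadword_size(bundle->tag);

                return count;
        }
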
index 6e650db2b1c115c5d3e71545f1976792909cc342..fab50d671a8995a36c45e07b28ec0eb98d08d7d9 100644
@@ -1484,109 +1484,6 @@ emit_instr(compiler_context *ctx, struct nir_instr *instr)
         }
 }
 
-/* Midgard IR only knows vector ALU types, but we sometimes need to actually
- * use scalar ALU instructions, for functional or performance reasons. To do
- * this, we just demote vector ALU payloads to scalar. */
-
-static int
-component_from_mask(unsigned mask)
-{
-        for (int c = 0; c < 4; ++c) {
-                if (mask & (3 << (2 * c)))
-                        return c;
-        }
-
-        assert(0);
-        return 0;
-}
-
-static bool
-is_single_component_mask(unsigned mask)
-{
-        int components = 0;
-
-        for (int c = 0; c < 4; ++c)
-                if (mask & (3 << (2 * c)))
-                        components++;
-
-        return components == 1;
-}
-
-/* Create a mask of accessed components from a swizzle to figure out vector
- * dependencies */
-
-static unsigned
-swizzle_to_access_mask(unsigned swizzle)
-{
-        unsigned component_mask = 0;
-
-        for (int i = 0; i < 4; ++i) {
-                unsigned c = (swizzle >> (2 * i)) & 3;
-                component_mask |= (1 << c);
-        }
-
-        return component_mask;
-}
-
-static unsigned
-vector_to_scalar_source(unsigned u, bool is_int)
-{
-        midgard_vector_alu_src v;
-        memcpy(&v, &u, sizeof(v));
-
-        /* TODO: Integers */
-
-        midgard_scalar_alu_src s = {
-                .full = !v.half,
-                .component = (v.swizzle & 3) << 1
-        };
-
-        if (is_int) {
-                /* TODO */
-        } else {
-                s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS;
-                s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG;
-        }
-
-        unsigned o;
-        memcpy(&o, &s, sizeof(s));
-
-        return o & ((1 << 6) - 1);
-}
-
-static midgard_scalar_alu
-vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)
-{
-        bool is_int = midgard_is_integer_op(v.op);
-
-        /* The output component is from the mask */
-        midgard_scalar_alu s = {
-                .op = v.op,
-                .src1 = vector_to_scalar_source(v.src1, is_int),
-                .src2 = vector_to_scalar_source(v.src2, is_int),
-                .unknown = 0,
-                .outmod = v.outmod,
-                .output_full = 1, /* TODO: Half */
-                .output_component = component_from_mask(v.mask) << 1,
-        };
-
-        /* Inline constant is passed along rather than trying to extract it
-         * from v */
-
-        if (ins->ssa_args.inline_constant) {
-                uint16_t imm = 0;
-                int lower_11 = ins->inline_constant & ((1 << 12) - 1);
-                imm |= (lower_11 >> 9) & 3;
-                imm |= (lower_11 >> 6) & 4;
-                imm |= (lower_11 >> 2) & 0x38;
-                imm |= (lower_11 & 63) << 6;
-
-                s.src2 = imm;
-        }
-
-        return s;
-}
-
 /* Midgard prefetches instruction types, so during emission we need to
  * lookahead too. Unless this is the last instruction, in which case we return
  * 1. Or if this is the second-to-last and the last is an ALU, then it's also 1... */
@@ -1594,599 +1491,6 @@ vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)
 #define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 ||  \
                     tag == TAG_ALU_12 || tag == TAG_ALU_16)
 
-#define EMIT_AND_COUNT(type, val) util_dynarray_append(emission, type, val); \
-                                 bytes_emitted += sizeof(type)
-
-static void
-emit_binary_vector_instruction(midgard_instruction *ains,
-                               uint16_t *register_words, int *register_words_count,
-                               uint64_t *body_words, size_t *body_size, int *body_words_count,
-                               size_t *bytes_emitted)
-{
-        memcpy(&register_words[(*register_words_count)++], &ains->registers, sizeof(ains->registers));
-        *bytes_emitted += sizeof(midgard_reg_info);
-
-        body_size[*body_words_count] = sizeof(midgard_vector_alu);
-        memcpy(&body_words[(*body_words_count)++], &ains->alu, sizeof(ains->alu));
-        *bytes_emitted += sizeof(midgard_vector_alu);
-}
-
-/* Checks for an SSA data hazard between two adjacent instructions, keeping in
- * mind that we are a vector architecture and we can write to different
- * components simultaneously */
-
-static bool
-can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
-{
-        /* Each instruction reads some registers and writes to a register. See
-         * where the first writes */
-
-        /* Figure out where exactly we wrote to */
-        int source = first->ssa_args.dest;
-        int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF;
-
-        /* As long as the second doesn't read from the first, we're okay */
-        if (second->ssa_args.src0 == source) {
-                if (first->type == TAG_ALU_4) {
-                        /* Figure out which components we just read from */
-
-                        int q = second->alu.src1;
-                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
-
-                        /* Check if there are components in common, and fail if so */
-                        if (swizzle_to_access_mask(m->swizzle) & source_mask)
-                                return false;
-                } else
-                        return false;
-
-        }
-
-        if (second->ssa_args.src1 == source)
-                return false;
-
-        /* Otherwise, it's safe in that regard. Another data hazard is both
-         * writing to the same place, of course */
-
-        if (second->ssa_args.dest == source) {
-                /* ...but only if the components overlap */
-                int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF;
-
-                if (dest_mask & source_mask)
-                        return false;
-        }
-
-        /* ...That's it */
-        return true;
-}
-
-static bool
-midgard_has_hazard(
-                midgard_instruction **segment, unsigned segment_size,
-                midgard_instruction *ains)
-{
-        for (int s = 0; s < segment_size; ++s)
-                if (!can_run_concurrent_ssa(segment[s], ains))
-                        return true;
-
-        return false;
-
-
-}
-
-/* Schedules, but does not emit, a single basic block. After scheduling, the
- * final tag and size of the block are known, which are necessary for branching
- * */
-
-static midgard_bundle
-schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
-{
-        int instructions_emitted = 0, instructions_consumed = -1;
-        midgard_bundle bundle = { 0 };
-
-        uint8_t tag = ins->type;
-
-        /* Default to the instruction's tag */
-        bundle.tag = tag;
-
-        switch (ins->type) {
-        case TAG_ALU_4: {
-                uint32_t control = 0;
-                size_t bytes_emitted = sizeof(control);
-
-                /* TODO: Constant combining */
-                int index = 0, last_unit = 0;
-
-                /* Previous instructions, for the purpose of parallelism */
-                midgard_instruction *segment[4] = {0};
-                int segment_size = 0;
-
-                instructions_emitted = -1;
-                midgard_instruction *pins = ins;
-
-                for (;;) {
-                        midgard_instruction *ains = pins;
-
-                        /* Advance instruction pointer */
-                        if (index) {
-                                ains = mir_next_op(pins);
-                                pins = ains;
-                        }
-
-                        /* Out-of-work condition */
-                        if ((struct list_head *) ains == &block->instructions)
-                                break;
-
-                        /* Ensure that the chain can continue */
-                        if (ains->type != TAG_ALU_4) break;
-
-                        /* If there's already something in the bundle and we
-                         * have weird scheduler constraints, break now */
-                        if (ains->precede_break && index) break;
-
-                        /* According to the presentation "The ARM
-                         * Mali-T880 Mobile GPU" from HotChips 27,
-                         * there are two pipeline stages. Branching
-                         * position determined experimentally. Lines
-                         * are executed in parallel:
-                         *
-                         * [ VMUL ] [ SADD ]
-                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
-                         *
-                         * Verify that there are no ordering dependencies here.
-                         *
-                         * TODO: Allow for parallelism!!!
-                         */
-
-                        /* Pick a unit for it if it doesn't force a particular unit */
-
-                        int unit = ains->unit;
-
-                        if (!unit) {
-                                int op = ains->alu.op;
-                                int units = alu_opcode_props[op].props;
-
-                                bool vectorable = units & UNITS_ANY_VECTOR;
-                                bool scalarable = units & UNITS_SCALAR;
-                                bool could_scalar = is_single_component_mask(ains->alu.mask);
-                                bool vector = vectorable && !(could_scalar && scalarable);
-
-                                if (!vector)
-                                        assert(units & UNITS_SCALAR);
-
-                                if (vector) {
-                                        if (last_unit >= UNIT_VADD) {
-                                                if (units & UNIT_VLUT)
-                                                        unit = UNIT_VLUT;
-                                                else
-                                                        break;
-                                        } else {
-                                                if ((units & UNIT_VMUL) && !(control & UNIT_VMUL))
-                                                        unit = UNIT_VMUL;
-                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
-                                                        unit = UNIT_VADD;
-                                                else if (units & UNIT_VLUT)
-                                                        unit = UNIT_VLUT;
-                                                else
-                                                        break;
-                                        }
-                                } else {
-                                        if (last_unit >= UNIT_VADD) {
-                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
-                                                        unit = UNIT_SMUL;
-                                                else if (units & UNIT_VLUT)
-                                                        unit = UNIT_VLUT;
-                                                else
-                                                        break;
-                                        } else {
-                                                if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
-                                                        unit = UNIT_SADD;
-                                                else if (units & UNIT_SMUL)
-                                                        unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
-                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
-                                                        unit = UNIT_VADD;
-                                                else
-                                                        break;
-                                        }
-                                }
-
-                                assert(unit & units);
-                        }
-
-                        /* Late unit check, this time for encoding (not parallelism) */
-                        if (unit <= last_unit) break;
-
-                        /* Clear the segment */
-                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
-                                segment_size = 0;
-
-                        if (midgard_has_hazard(segment, segment_size, ains))
-                                break;
-
-                        /* We're good to go -- emit the instruction */
-                        ains->unit = unit;
-
-                        segment[segment_size++] = ains;
-
-                        /* Only one set of embedded constants per
-                         * bundle possible; if we have more, we must
-                         * break the chain early, unfortunately */
-
-                        if (ains->has_constants) {
-                                if (bundle.has_embedded_constants) {
-                                        /* The blend constant needs to be
-                                         * alone, since it conflicts with
-                                         * everything by definition*/
-
-                                        if (ains->has_blend_constant || bundle.has_blend_constant)
-                                                break;
-
-                                        /* ...but if there are already
-                                         * constants but these are the
-                                         * *same* constants, we let it
-                                         * through */
-
-                                        if (memcmp(bundle.constants, ains->constants, sizeof(bundle.constants)))
-                                                break;
-                                } else {
-                                        bundle.has_embedded_constants = true;
-                                        memcpy(bundle.constants, ains->constants, sizeof(bundle.constants));
-
-                                        /* If this is a blend shader special constant, track it for patching */
-                                        bundle.has_blend_constant |= ains->has_blend_constant;
-                                }
-                        }
-
-                        if (ains->unit & UNITS_ANY_VECTOR) {
-                                emit_binary_vector_instruction(ains, bundle.register_words,
-                                                               &bundle.register_words_count, bundle.body_words,
-                                                               bundle.body_size, &bundle.body_words_count, &bytes_emitted);
-                        } else if (ains->compact_branch) {
-                                /* All of r0 has to be written out
-                                 * along with the branch writeout.
-                                 * (slow!) */
-
-                                if (ains->writeout) {
-                                        if (index == 0) {
-                                                midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
-                                                ins.unit = UNIT_VMUL;
-
-                                                control |= ins.unit;
-
-                                                emit_binary_vector_instruction(&ins, bundle.register_words,
-                                                                               &bundle.register_words_count, bundle.body_words,
-                                                                               bundle.body_size, &bundle.body_words_count, &bytes_emitted);
-                                        } else {
-                                                /* Analyse the group to see if r0 is written in full, on-time, without hanging dependencies*/
-                                                bool written_late = false;
-                                                bool components[4] = { 0 };
-                                                uint16_t register_dep_mask = 0;
-                                                uint16_t written_mask = 0;
-
-                                                midgard_instruction *qins = ins;
-                                                for (int t = 0; t < index; ++t) {
-                                                        if (qins->registers.out_reg != 0) {
-                                                                /* Mark down writes */
-
-                                                                written_mask |= (1 << qins->registers.out_reg);
-                                                        } else {
-                                                                /* Mark down the register dependencies for errata check */
-
-                                                                if (qins->registers.src1_reg < 16)
-                                                                        register_dep_mask |= (1 << qins->registers.src1_reg);
-
-                                                                if (qins->registers.src2_reg < 16)
-                                                                        register_dep_mask |= (1 << qins->registers.src2_reg);
-
-                                                                int mask = qins->alu.mask;
-
-                                                                for (int c = 0; c < 4; ++c)
-                                                                        if (mask & (0x3 << (2 * c)))
-                                                                                components[c] = true;
-
-                                                                /* ..but if the writeout is too late, we have to break up anyway... for some reason */
-
-                                                                if (qins->unit == UNIT_VLUT)
-                                                                        written_late = true;
-                                                        }
-
-                                                        /* Advance instruction pointer */
-                                                        qins = mir_next_op(qins);
-                                                }
-
-
-                                                /* Register dependencies of r0 must be out of fragment writeout bundle */
-                                                if (register_dep_mask & written_mask)
-                                                        break;
-
-                                                if (written_late)
-                                                        break;
-
-                                                /* If even a single component is not written, break it up (conservative check). */
-                                                bool breakup = false;
-
-                                                for (int c = 0; c < 4; ++c)
-                                                        if (!components[c])
-                                                                breakup = true;
-
-                                                if (breakup)
-                                                        break;
-
-                                                /* Otherwise, we're free to proceed */
-                                        }
-                                }
-
-                                if (ains->unit == ALU_ENAB_BRANCH) {
-                                        bundle.body_size[bundle.body_words_count] = sizeof(midgard_branch_extended);
-                                        memcpy(&bundle.body_words[bundle.body_words_count++], &ains->branch_extended, sizeof(midgard_branch_extended));
-                                        bytes_emitted += sizeof(midgard_branch_extended);
-                                } else {
-                                        bundle.body_size[bundle.body_words_count] = sizeof(ains->br_compact);
-                                        memcpy(&bundle.body_words[bundle.body_words_count++], &ains->br_compact, sizeof(ains->br_compact));
-                                        bytes_emitted += sizeof(ains->br_compact);
-                                }
-                        } else {
-                                memcpy(&bundle.register_words[bundle.register_words_count++], &ains->registers, sizeof(ains->registers));
-                                bytes_emitted += sizeof(midgard_reg_info);
-
-                                bundle.body_size[bundle.body_words_count] = sizeof(midgard_scalar_alu);
-                                bundle.body_words_count++;
-                                bytes_emitted += sizeof(midgard_scalar_alu);
-                        }
-
-                        /* Defer marking until after writing to allow for break */
-                        control |= ains->unit;
-                        last_unit = ains->unit;
-                        ++instructions_emitted;
-                        ++index;
-                }
-
-                /* Bubble up the number of instructions for skipping */
-                instructions_consumed = index - 1;
-
-                int padding = 0;
-
-                /* Pad ALU op to nearest word */
-
-                if (bytes_emitted & 15) {
-                        padding = 16 - (bytes_emitted & 15);
-                        bytes_emitted += padding;
-                }
-
-                /* Constants must always be quadwords */
-                if (bundle.has_embedded_constants)
-                        bytes_emitted += 16;
-
-                /* Size ALU instruction for tag */
-                bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
-                bundle.padding = padding;
-                bundle.control = bundle.tag | control;
-
-                break;
-        }
-
-        case TAG_LOAD_STORE_4: {
-                /* Load store instructions have two words at once. If
-                 * we only have one queued up, we need to NOP pad.
-                 * Otherwise, we store both in succession to save space
-                 * and cycles -- letting them go in parallel -- skip
-                 * the next. The usefulness of this optimisation is
-                 * greatly dependent on the quality of the instruction
-                 * scheduler.
-                 */
-
-                midgard_instruction *next_op = mir_next_op(ins);
-
-                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
-                        /* As the two operate concurrently, make sure
-                         * they are not dependent */
-
-                        if (can_run_concurrent_ssa(ins, next_op) || true) {
-                                /* Skip ahead, since it's redundant with the pair */
-                                instructions_consumed = 1 + (instructions_emitted++);
-                        }
-                }
-
-                break;
-        }
-
-        default:
-                /* Texture ops default to single-op-per-bundle scheduling */
-                break;
-        }
-
-        /* Copy the instructions into the bundle */
-        bundle.instruction_count = instructions_emitted + 1;
-
-        int used_idx = 0;
-
-        midgard_instruction *uins = ins;
-        for (int i = 0; used_idx < bundle.instruction_count; ++i) {
-                bundle.instructions[used_idx++] = *uins;
-                uins = mir_next_op(uins);
-        }
-
-        *skip = (instructions_consumed == -1) ? instructions_emitted : instructions_consumed;
-
-        return bundle;
-}
-
-static int
-quadword_size(int tag)
-{
-        switch (tag) {
-        case TAG_ALU_4:
-                return 1;
-
-        case TAG_ALU_8:
-                return 2;
-
-        case TAG_ALU_12:
-                return 3;
-
-        case TAG_ALU_16:
-                return 4;
-
-        case TAG_LOAD_STORE_4:
-                return 1;
-
-        case TAG_TEXTURE_4:
-                return 1;
-
-        default:
-                assert(0);
-                return 0;
-        }
-}
-
-/* Schedule a single block by iterating its instruction to create bundles.
- * While we go, tally about the bundle sizes to compute the block size. */
-
-static void
-schedule_block(compiler_context *ctx, midgard_block *block)
-{
-        util_dynarray_init(&block->bundles, NULL);
-
-        block->quadword_count = 0;
-
-        mir_foreach_instr_in_block(block, ins) {
-                int skip;
-                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
-                util_dynarray_append(&block->bundles, midgard_bundle, bundle);
-
-                if (bundle.has_blend_constant) {
-                        /* TODO: Multiblock? */
-                        int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
-                        ctx->blend_constant_offset = quadwords_within_block * 0x10;
-                }
-
-                while(skip--)
-                        ins = mir_next_op(ins);
-
-                block->quadword_count += quadword_size(bundle.tag);
-        }
-
-        block->is_scheduled = true;
-}
-
-static void
-schedule_program(compiler_context *ctx)
-{
-        /* We run RA prior to scheduling */
-        struct ra_graph *g = allocate_registers(ctx);
-        install_registers(ctx, g);
-
-        mir_foreach_block(ctx, block) {
-                schedule_block(ctx, block);
-        }
-}
-
-/* After everything is scheduled, emit whole bundles at a time */
-
-static void
-emit_binary_bundle(compiler_context *ctx, midgard_bundle *bundle, struct util_dynarray *emission, int next_tag)
-{
-        int lookahead = next_tag << 4;
-
-        switch (bundle->tag) {
-        case TAG_ALU_4:
-        case TAG_ALU_8:
-        case TAG_ALU_12:
-        case TAG_ALU_16: {
-                /* Actually emit each component */
-                util_dynarray_append(emission, uint32_t, bundle->control | lookahead);
-
-                for (int i = 0; i < bundle->register_words_count; ++i)
-                        util_dynarray_append(emission, uint16_t, bundle->register_words[i]);
-
-                /* Emit body words based on the instructions bundled */
-                for (int i = 0; i < bundle->instruction_count; ++i) {
-                        midgard_instruction *ins = &bundle->instructions[i];
-
-                        if (ins->unit & UNITS_ANY_VECTOR) {
-                                memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins->alu, sizeof(midgard_vector_alu));
-                        } else if (ins->compact_branch) {
-                                /* Dummy move, XXX DRY */
-                                if ((i == 0) && ins->writeout) {
-                                        midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
-                                        memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins.alu, sizeof(midgard_vector_alu));
-                                }
-
-                                if (ins->unit == ALU_ENAB_BR_COMPACT) {
-                                        memcpy(util_dynarray_grow(emission, sizeof(ins->br_compact)), &ins->br_compact, sizeof(ins->br_compact));
-                                } else {
-                                        memcpy(util_dynarray_grow(emission, sizeof(ins->branch_extended)), &ins->branch_extended, sizeof(ins->branch_extended));
-                                }
-                        } else {
-                                /* Scalar */
-                                midgard_scalar_alu scalarised = vector_to_scalar_alu(ins->alu, ins);
-                                memcpy(util_dynarray_grow(emission, sizeof(scalarised)), &scalarised, sizeof(scalarised));
-                        }
-                }
-
-                /* Emit padding (all zero) */
-                memset(util_dynarray_grow(emission, bundle->padding), 0, bundle->padding);
-
-                /* Tack on constants */
-
-                if (bundle->has_embedded_constants) {
-                        util_dynarray_append(emission, float, bundle->constants[0]);
-                        util_dynarray_append(emission, float, bundle->constants[1]);
-                        util_dynarray_append(emission, float, bundle->constants[2]);
-                        util_dynarray_append(emission, float, bundle->constants[3]);
-                }
-
-                break;
-        }
-
-        case TAG_LOAD_STORE_4: {
-                /* One or two composing instructions */
-
-                uint64_t current64, next64 = LDST_NOP;
-
-                memcpy(&current64, &bundle->instructions[0].load_store, sizeof(current64));
-
-                if (bundle->instruction_count == 2)
-                        memcpy(&next64, &bundle->instructions[1].load_store, sizeof(next64));
-
-                midgard_load_store instruction = {
-                        .type = bundle->tag,
-                        .next_type = next_tag,
-                        .word1 = current64,
-                        .word2 = next64
-                };
-
-                util_dynarray_append(emission, midgard_load_store, instruction);
-
-                break;
-        }
-
-        case TAG_TEXTURE_4: {
-                /* Texture instructions are easy, since there is no
-                 * pipelining nor VLIW to worry about. We may need to set the .last flag */
-
-                midgard_instruction *ins = &bundle->instructions[0];
-
-                ins->texture.type = TAG_TEXTURE_4;
-                ins->texture.next_type = next_tag;
-
-                ctx->texture_op_count--;
-
-                if (!ctx->texture_op_count) {
-                        ins->texture.cont = 0;
-                        ins->texture.last = 1;
-                }
-
-                util_dynarray_append(emission, midgard_texture_word, ins->texture);
-                break;
-        }
-
-        default:
-                DBG("Unknown midgard instruction type\n");
-                assert(0);
-                break;
-        }
-}
-
 
 /* ALU instructions can inline or embed constants, which decreases register
  * pressure and saves space. */
@@ -3102,7 +2406,7 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
         mir_foreach_block(ctx, block) {
                 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
                         for (int c = 0; c < bundle->instruction_count; ++c) {
-                                midgard_instruction *ins = &bundle->instructions[c];
+                                midgard_instruction *ins = bundle->instructions[c];
 
                                 if (!midgard_is_branch_unit(ins->unit)) continue;
 
@@ -3117,10 +2421,13 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
                                 /* Determine the block we're jumping to */
                                 int target_number = ins->branch.target_block;
 
-                                /* Report the destination tag. Discards don't need this */
+                                /* Report the destination tag */
                                 int dest_tag = is_discard ? 0 : midgard_get_first_tag_from_block(ctx, target_number);
 
-                                /* Count up the number of quadwords we're jumping over. That is, the number of quadwords in each of the blocks between (br_block_idx, target_number) */
+                                /* Count up the number of quadwords we're
+                                 * jumping over = number of quadwords until
+                                 * (br_block_idx, target_number) */
+
                                 int quadword_offset = 0;
 
                                 if (is_discard) {
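
As the lookahead comment in this file notes, Midgard prefetches
instruction types, so each ALU control word carries the *next* bundle's
tag. A sketch restating what emit_binary_bundle does (not a new API):

        /* Sketch: pack the prefetch field. next_tag is the following
         * bundle's tag, or 1 at the end of the program, per the
         * lookahead comment above */
        static uint32_t
        alu_control_word(const midgard_bundle *bundle, int next_tag)
        {
                return bundle->control | (next_tag << 4);
        }
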
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_emit.c b/src/gallium/drivers/panfrost/midgard/midgard_emit.c
new file mode 100644
index 0000000..ffa0873
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler.h"
+#include "midgard_ops.h"
+
+/* Midgard IR only knows vector ALU types, but we sometimes need to actually
+ * use scalar ALU instructions, for functional or performance reasons. To do
+ * this, we just demote vector ALU payloads to scalar. */
+
+static int
+component_from_mask(unsigned mask)
+{
+        for (int c = 0; c < 4; ++c) {
+                if (mask & (3 << (2 * c)))
+                        return c;
+        }
+
+        assert(0);
+        return 0;
+}
+
+static unsigned
+vector_to_scalar_source(unsigned u, bool is_int)
+{
+        midgard_vector_alu_src v;
+        memcpy(&v, &u, sizeof(v));
+
+        /* TODO: Integers */
+
+        midgard_scalar_alu_src s = {
+                .full = !v.half,
+                .component = (v.swizzle & 3) << 1
+        };
+
+        if (is_int) {
+                /* TODO */
+        } else {
+                s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS;
+                s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG;
+        }
+
+        unsigned o;
+        memcpy(&o, &s, sizeof(s));
+
+        return o & ((1 << 6) - 1);
+}
+
+static midgard_scalar_alu
+vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)
+{
+        bool is_int = midgard_is_integer_op(v.op);
+
+        /* The output component is from the mask */
+        midgard_scalar_alu s = {
+                .op = v.op,
+                .src1 = vector_to_scalar_source(v.src1, is_int),
+                .src2 = vector_to_scalar_source(v.src2, is_int),
+                .unknown = 0,
+                .outmod = v.outmod,
+                .output_full = 1, /* TODO: Half */
+                .output_component = component_from_mask(v.mask) << 1,
+        };
+
+        /* Inline constant is passed along rather than trying to extract it
+         * from v */
+
+        if (ins->ssa_args.inline_constant) {
+                uint16_t imm = 0;
+                int lower_11 = ins->inline_constant & ((1 << 12) - 1);
+                imm |= (lower_11 >> 9) & 3;
+                imm |= (lower_11 >> 6) & 4;
+                imm |= (lower_11 >> 2) & 0x38;
+                imm |= (lower_11 & 63) << 6;
+
+                s.src2 = imm;
+        }
+
+        return s;
+}
+
+static void
+emit_alu_bundle(compiler_context *ctx,
+                midgard_bundle *bundle,
+                struct util_dynarray *emission,
+                unsigned lookahead)
+{
+        /* Emit the control word */
+        util_dynarray_append(emission, uint32_t, bundle->control | lookahead);
+
+        /* Next up, emit register words */
+        for (unsigned i = 0; i < bundle->instruction_count; ++i) {
+                midgard_instruction *ins = bundle->instructions[i];
+
+                /* Check if this instruction has registers */
+                if (ins->compact_branch || ins->prepacked_branch) continue;
+
+                /* Otherwise, just emit the registers */
+                uint16_t reg_word = 0;
+                memcpy(&reg_word, &ins->registers, sizeof(uint16_t));
+                util_dynarray_append(emission, uint16_t, reg_word);
+        }
+
+        /* Now, we emit the body itself */
+        for (unsigned i = 0; i < bundle->instruction_count; ++i) {
+                midgard_instruction *ins = bundle->instructions[i];
+
+                /* Where is this body */
+                unsigned size = 0;
+                void *source = NULL;
+
+                /* In case we demote to a scalar */
+                midgard_scalar_alu scalarized;
+
+                if (ins->unit & UNITS_ANY_VECTOR) {
+                        size = sizeof(midgard_vector_alu);
+                        source = &ins->alu;
+                } else if (ins->unit == ALU_ENAB_BR_COMPACT) {
+                        size = sizeof(midgard_branch_cond);
+                        source = &ins->br_compact;
+                } else if (ins->compact_branch) { /* misnomer */
+                        size = sizeof(midgard_branch_extended);
+                        source = &ins->branch_extended;
+                } else {
+                        size = sizeof(midgard_scalar_alu);
+                        scalarized = vector_to_scalar_alu(ins->alu, ins);
+                        source = &scalarized;
+                }
+
+                memcpy(util_dynarray_grow(emission, size), source, size);
+        }
+
+        /* Emit padding (all zero) */
+        memset(util_dynarray_grow(emission, bundle->padding), 0, bundle->padding);
+
+        /* Tack on constants */
+
+        if (bundle->has_embedded_constants) {
+                util_dynarray_append(emission, float, bundle->constants[0]);
+                util_dynarray_append(emission, float, bundle->constants[1]);
+                util_dynarray_append(emission, float, bundle->constants[2]);
+                util_dynarray_append(emission, float, bundle->constants[3]);
+        }
+}
+
+/* After everything is scheduled, emit whole bundles at a time */
+
+void
+emit_binary_bundle(compiler_context *ctx,
+                midgard_bundle *bundle,
+                struct util_dynarray *emission,
+                int next_tag)
+{
+        int lookahead = next_tag << 4;
+
+        switch (bundle->tag) {
+        case TAG_ALU_4:
+        case TAG_ALU_8:
+        case TAG_ALU_12:
+        case TAG_ALU_16:
+                emit_alu_bundle(ctx, bundle, emission, lookahead);
+                break;
+
+        case TAG_LOAD_STORE_4: {
+                /* One or two composing instructions */
+
+                uint64_t current64, next64 = LDST_NOP;
+
+                memcpy(&current64, &bundle->instructions[0]->load_store, sizeof(current64));
+
+                if (bundle->instruction_count == 2)
+                        memcpy(&next64, &bundle->instructions[1]->load_store, sizeof(next64));
+
+                midgard_load_store instruction = {
+                        .type = bundle->tag,
+                        .next_type = next_tag,
+                        .word1 = current64,
+                        .word2 = next64
+                };
+
+                util_dynarray_append(emission, midgard_load_store, instruction);
+
+                break;
+        }
+
+        case TAG_TEXTURE_4: {
+                /* Texture instructions are easy, since there is no pipelining
+                 * nor VLIW to worry about. We may need to set .last flag */
+
+                midgard_instruction *ins = bundle->instructions[0];
+
+                ins->texture.type = TAG_TEXTURE_4;
+                ins->texture.next_type = next_tag;
+
+                ctx->texture_op_count--;
+
+                if (!ctx->texture_op_count) {
+                        ins->texture.cont = 0;
+                        ins->texture.last = 1;
+                }
+
+                util_dynarray_append(emission, midgard_texture_word, ins->texture);
+                break;
+        }
+
+        default:
+                unreachable("Unknown midgard instruction type\n");
+        }
+}
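
The inline-constant shuffle in vector_to_scalar_alu above is dense, so
here it is pulled out as a self-contained sketch to make the bit
movement checkable in isolation (same arithmetic as the diff; the
helper name is made up):

        #include <stdint.h>

        /* Sketch: scatter an inline constant's low bits into the scalar
         * src2 field layout. Per the shifts: source bits 10:9 land at
         * 1:0, bit 8 at 2, bits 7:5 at 5:3, and bits 5:0 at 11:6.
         * (Note the mask keeps 12 bits despite the lower_11 name.) */
        static uint16_t
        scatter_inline_constant(int constant)
        {
                uint16_t imm = 0;
                int lower_11 = constant & ((1 << 12) - 1);

                imm |= (lower_11 >> 9) & 3;
                imm |= (lower_11 >> 6) & 4;
                imm |= (lower_11 >> 2) & 0x38;
                imm |= (lower_11 & 63) << 6;

                return imm;
        }
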
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_schedule.c b/src/gallium/drivers/panfrost/midgard/midgard_schedule.c
new file mode 100644
index 0000000..385b8bc
--- /dev/null
@@ -0,0 +1,479 @@
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler.h"
+#include "midgard_ops.h"
+#include "util/u_memory.h"
+
+/* Create a mask of accessed components from a swizzle to figure out vector
+ * dependencies */
+
+static unsigned
+swizzle_to_access_mask(unsigned swizzle)
+{
+        unsigned component_mask = 0;
+
+        for (int i = 0; i < 4; ++i) {
+                unsigned c = (swizzle >> (2 * i)) & 3;
+                component_mask |= (1 << c);
+        }
+
+        return component_mask;
+}
+
+/* Does the mask cover more than a scalar? */
+
+static bool
+is_single_component_mask(unsigned mask)
+{
+        int components = 0;
+
+        for (int c = 0; c < 4; ++c)
+                if (mask & (3 << (2 * c)))
+                        components++;
+
+        return components == 1;
+}
+
+/* Checks for an SSA data hazard between two adjacent instructions, keeping in
+ * mind that we are a vector architecture and we can write to different
+ * components simultaneously */
+
+static bool
+can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
+{
+        /* Each instruction reads some registers and writes to a register. See
+         * where the first writes */
+
+        /* Figure out where exactly we wrote to */
+        int source = first->ssa_args.dest;
+        int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF;
+
+        /* As long as the second doesn't read from the first, we're okay */
+        if (second->ssa_args.src0 == source) {
+                if (first->type == TAG_ALU_4) {
+                        /* Figure out which components we just read from */
+
+                        int q = second->alu.src1;
+                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
+
+                        /* Check if there are components in common, and fail if so */
+                        if (swizzle_to_access_mask(m->swizzle) & source_mask)
+                                return false;
+                } else
+                        return false;
+
+        }
+
+        if (second->ssa_args.src1 == source)
+                return false;
+
+        /* Otherwise, it's safe in that regard. Another data hazard is both
+         * writing to the same place, of course */
+
+        if (second->ssa_args.dest == source) {
+                /* ...but only if the components overlap */
+                int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF;
+
+                if (dest_mask & source_mask)
+                        return false;
+        }
+
+        /* ...That's it */
+        return true;
+}
+
+static bool
+midgard_has_hazard(
+                midgard_instruction **segment, unsigned segment_size,
+                midgard_instruction *ains)
+{
+        for (int s = 0; s < segment_size; ++s)
+                if (!can_run_concurrent_ssa(segment[s], ains))
+                        return true;
+
+        return false;
+}
+
+/* Schedules, but does not emit, a single basic block. After scheduling, the
+ * final tag and size of the block are known, which are necessary for branching
+ * */
+
+static midgard_bundle
+schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
+{
+        int instructions_emitted = 0, packed_idx = 0;
+        midgard_bundle bundle = { 0 };
+
+        uint8_t tag = ins->type;
+
+        /* Default to the instruction's tag */
+        bundle.tag = tag;
+
+        switch (ins->type) {
+        case TAG_ALU_4: {
+                uint32_t control = 0;
+                size_t bytes_emitted = sizeof(control);
+
+                /* TODO: Constant combining */
+                int index = 0, last_unit = 0;
+
+                /* Previous instructions, for the purpose of parallelism */
+                midgard_instruction *segment[4] = {0};
+                int segment_size = 0;
+
+                instructions_emitted = -1;
+                midgard_instruction *pins = ins;
+
+                for (;;) {
+                        midgard_instruction *ains = pins;
+
+                        /* Advance instruction pointer */
+                        if (index) {
+                                ains = mir_next_op(pins);
+                                pins = ains;
+                        }
+
+                        /* Out-of-work condition */
+                        if ((struct list_head *) ains == &block->instructions)
+                                break;
+
+                        /* Ensure that the chain can continue */
+                        if (ains->type != TAG_ALU_4) break;
+
+                        /* If there's already something in the bundle and we
+                         * have weird scheduler constraints, break now */
+                        if (ains->precede_break && index) break;
+
+                        /* According to the presentation "The ARM
+                         * Mali-T880 Mobile GPU" from HotChips 27,
+                         * there are two pipeline stages. Branching
+                         * position determined experimentally. Lines
+                         * are executed in parallel:
+                         *
+                         * [ VMUL ] [ SADD ]
+                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
+                         *
+                         * Verify that there are no ordering dependencies here.
+                         *
+                         * TODO: Allow for parallelism!!!
+                         */
+
+                        /* Pick a unit for it if it doesn't force a particular unit */
+
+                        int unit = ains->unit;
+
+                        if (!unit) {
+                                int op = ains->alu.op;
+                                int units = alu_opcode_props[op].props;
+
+                                bool vectorable = units & UNITS_ANY_VECTOR;
+                                bool scalarable = units & UNITS_SCALAR;
+                                bool could_scalar = is_single_component_mask(ains->alu.mask);
+                                bool vector = vectorable && !(could_scalar && scalarable);
+
+                                if (!vector)
+                                        assert(units & UNITS_SCALAR);
+
+                                if (vector) {
+                                        if (last_unit >= UNIT_VADD) {
+                                                if (units & UNIT_VLUT)
+                                                        unit = UNIT_VLUT;
+                                                else
+                                                        break;
+                                        } else {
+                                                if ((units & UNIT_VMUL) && !(control & UNIT_VMUL))
+                                                        unit = UNIT_VMUL;
+                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
+                                                        unit = UNIT_VADD;
+                                                else if (units & UNIT_VLUT)
+                                                        unit = UNIT_VLUT;
+                                                else
+                                                        break;
+                                        }
+                                } else {
+                                        if (last_unit >= UNIT_VADD) {
+                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
+                                                        unit = UNIT_SMUL;
+                                                else if (units & UNIT_VLUT)
+                                                        unit = UNIT_VLUT;
+                                                else
+                                                        break;
+                                        } else {
+                                                if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
+                                                        unit = UNIT_SADD;
+                                                else if (units & UNIT_SMUL)
+                                                        unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
+                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
+                                                        unit = UNIT_VADD;
+                                                else
+                                                        break;
+                                        }
+                                }
+
+                                assert(unit & units);
+                        }
+
+                        /* Late unit check, this time for encoding (not parallelism) */
+                        if (unit <= last_unit) break;
+
+                        /* Clear the segment */
+                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
+                                segment_size = 0;
+
+                        if (midgard_has_hazard(segment, segment_size, ains))
+                                break;
+
+                        /* We're good to go -- emit the instruction */
+                        ains->unit = unit;
+
+                        segment[segment_size++] = ains;
+
+                        /* Only one set of embedded constants per
+                         * bundle possible; if we have more, we must
+                         * break the chain early, unfortunately */
+
+                        if (ains->has_constants) {
+                                if (bundle.has_embedded_constants) {
+                                        /* The blend constant needs to be
+                                         * alone, since it conflicts with
+                                         * everything by definition */
+
+                                        if (ains->has_blend_constant || bundle.has_blend_constant)
+                                                break;
+
+                                        /* ...but if there are already
+                                         * constants but these are the
+                                         * *same* constants, we let it
+                                         * through */
+
+                                        if (memcmp(bundle.constants, ains->constants, sizeof(bundle.constants)))
+                                                break;
+                                } else {
+                                        bundle.has_embedded_constants = true;
+                                        memcpy(bundle.constants, ains->constants, sizeof(bundle.constants));
+
+                                        /* If this is a blend shader special constant, track it for patching */
+                                        bundle.has_blend_constant |= ains->has_blend_constant;
+                                }
+                        }
+
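+                        /* Tally how many bytes this op will occupy in
+                         * the final binary: vector and scalar ALU ops
+                         * each carry a register word plus their ALU
+                         * word, while branches carry only the (compact
+                         * or extended) branch word. The running total
+                         * decides the bundle tag below */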
+                        if (ains->unit & UNITS_ANY_VECTOR) {
+                                bytes_emitted += sizeof(midgard_reg_info);
+                                bytes_emitted += sizeof(midgard_vector_alu);
+                        } else if (ains->compact_branch) {
+                                /* All of r0 has to be written out along with
+                                 * the branch writeout */
+
+                                if (ains->writeout) {
+                                        if (index == 0) {
+                                                /* Inject a move */
+                                                midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
+                                                ins.unit = UNIT_VMUL;
+                                                control |= ins.unit;
+
+                                                /* TODO don't leak */
+                                                midgard_instruction *move =
+                                                        mem_dup(&ins, sizeof(midgard_instruction));
+                                                bytes_emitted += sizeof(midgard_reg_info);
+                                                bytes_emitted += sizeof(midgard_vector_alu);
+                                                bundle.instructions[packed_idx++] = move;
+                                        } else {
+                                                /* Analyse the group to see if r0 is written in full, on time, without hanging dependencies */
+                                                bool written_late = false;
+                                                bool components[4] = { 0 };
+                                                uint16_t register_dep_mask = 0;
+                                                uint16_t written_mask = 0;
+
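+                                                /* Walk the ops already
+                                                 * bundled: an op that
+                                                 * writes a register
+                                                 * other than r0 lands
+                                                 * in written_mask; an
+                                                 * op writing r0 feeds
+                                                 * its sources into
+                                                 * register_dep_mask
+                                                 * and its writemask
+                                                 * into components[].
+                                                 * The ALU mask packs
+                                                 * two bits per
+                                                 * component, hence the
+                                                 * 0x3 pair test */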
+                                                midgard_instruction *qins = ins;
+                                                for (int t = 0; t < index; ++t) {
+                                                        if (qins->registers.out_reg != 0) {
+                                                                /* Mark down writes */
+
+                                                                written_mask |= (1 << qins->registers.out_reg);
+                                                        } else {
+                                                                /* Mark down the register dependencies for errata check */
+
+                                                                if (qins->registers.src1_reg < 16)
+                                                                        register_dep_mask |= (1 << qins->registers.src1_reg);
+
+                                                                if (qins->registers.src2_reg < 16)
+                                                                        register_dep_mask |= (1 << qins->registers.src2_reg);
+
+                                                                int mask = qins->alu.mask;
+
+                                                                for (int c = 0; c < 4; ++c)
+                                                                        if (mask & (0x3 << (2 * c)))
+                                                                                components[c] = true;
+
+                                                                /* ...but if the writeout comes too late, we have to break the bundle up anyway... for some reason */
+
+                                                                if (qins->unit == UNIT_VLUT)
+                                                                        written_late = true;
+                                                        }
+
+                                                        /* Advance instruction pointer */
+                                                        qins = mir_next_op(qins);
+                                                }
+
+                                                /* The register dependencies of r0 must lie outside the fragment writeout bundle */
+                                                if (register_dep_mask & written_mask)
+                                                        break;
+
+                                                if (written_late)
+                                                        break;
+
+                                                /* If even a single component is not written, break it up (conservative check). */
+                                                bool breakup = false;
+
+                                                for (int c = 0; c < 4; ++c)
+                                                        if (!components[c])
+                                                                breakup = true;
+
+                                                if (breakup)
+                                                        break;
+
+                                                /* Otherwise, we're free to proceed */
+                                        }
+                                }
+
+                                if (ains->unit == ALU_ENAB_BRANCH) {
+                                        bytes_emitted += sizeof(midgard_branch_extended);
+                                } else {
+                                        bytes_emitted += sizeof(ains->br_compact);
+                                }
+                        } else {
+                                bytes_emitted += sizeof(midgard_reg_info);
+                                bytes_emitted += sizeof(midgard_scalar_alu);
+                        }
+
+                        /* Defer marking the unit as used until after emission, so the breaks above leave bundle state untouched */
+                        control |= ains->unit;
+                        last_unit = ains->unit;
+                        ++instructions_emitted;
+                        ++index;
+                }
+
+                int padding = 0;
+
+                /* Pad the ALU words out to the nearest quadword (16 bytes) */
+
+                if (bytes_emitted & 15) {
+                        padding = 16 - (bytes_emitted & 15);
+                        bytes_emitted += padding;
+                }
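+                /* e.g. 26 bytes of emitted words pad out to 32 bytes
+                 * (two quadwords), leaving padding = 6 */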
+
+                /* Constants must always be quadwords */
+                if (bundle.has_embedded_constants)
+                        bytes_emitted += 16;
+
+                /* Size the ALU bundle for its tag: TAG_ALU_4 denotes a
+                 * single quadword, and each further quadword of emitted
+                 * words advances to the next tag */
+                bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
+                bundle.padding = padding;
+                bundle.control = bundle.tag | control;
+
+                break;
+        }
+
+        case TAG_LOAD_STORE_4: {
+                /* Load/store bundles fit two instruction words at
+                 * once. If we only have one queued up, the second
+                 * slot is NOP padding. Otherwise, we pack both
+                 * instructions in succession to save space and
+                 * cycles -- letting them go in parallel -- and skip
+                 * over the next instruction. The usefulness of this
+                 * optimisation depends greatly on the quality of the
+                 * instruction scheduler.
+                 */
+
+                midgard_instruction *next_op = mir_next_op(ins);
+
+                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
+                        /* TODO: Concurrency check */
+                        instructions_emitted++;
+                }
+
+                break;
+        }
+
+        default:
+                /* Texture ops default to single-op-per-bundle scheduling */
+                break;
+        }
+
+        /* Copy the instructions into the bundle */
+        bundle.instruction_count = instructions_emitted + 1 + packed_idx;
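+        /* ...that is: the instruction we started from, any extra
+         * instructions folded in above, and any moves injected at the
+         * front of the bundle (packed_idx, e.g. the r0 writeout mov) */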
+
+        midgard_instruction *uins = ins;
+        for (; packed_idx < bundle.instruction_count; ++packed_idx) {
+                bundle.instructions[packed_idx] = uins;
+                uins = mir_next_op(uins);
+        }
+
+        *skip = instructions_emitted;
+
+        return bundle;
+}
+
+/* Schedule a single block by iterating over its instructions to create
+ * bundles. As we go, tally up the bundle sizes to compute the block's size
+ * in quadwords. */
+
+static void
+schedule_block(compiler_context *ctx, midgard_block *block)
+{
+        util_dynarray_init(&block->bundles, NULL);
+
+        block->quadword_count = 0;
+
+        mir_foreach_instr_in_block(block, ins) {
+                int skip;
+                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
+                util_dynarray_append(&block->bundles, midgard_bundle, bundle);
+
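+                /* The embedded constants occupy the bundle's final
+                 * quadword, so record its byte offset; the blend
+                 * constant is patched in there later, once its value
+                 * is actually known */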
+                if (bundle.has_blend_constant) {
+                        /* TODO: Multiblock? */
+                        int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
+                        ctx->blend_constant_offset = quadwords_within_block * 0x10;
+                }
+
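+                /* schedule_bundle may consume extra instructions
+                 * beyond `ins` itself (reported via skip); advance
+                 * past them so they are not scheduled twice */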
+                while (skip--)
+                        ins = mir_next_op(ins);
+
+                block->quadword_count += quadword_size(bundle.tag);
+        }
+
+        block->is_scheduled = true;
+}
+
+void
+schedule_program(compiler_context *ctx)
+{
+        /* We run RA prior to scheduling */
+        struct ra_graph *g = allocate_registers(ctx);
+        install_registers(ctx, g);
+
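+        /* Scheduling thus sees physical registers, which the r0
+         * writeout analysis (and presumably the hazard checks) in
+         * schedule_bundle rely on */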
+        mir_foreach_block(ctx, block) {
+                schedule_block(ctx, block);
+        }
+}