ir3: Plumb through support for a1.x
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 96897f60e9249ef3c74196d125041fbe543424da..9d0bf69d193c7baa1e02af0334cf71bc35a36f75 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
 #include "util/u_math.h"
 
 #include "ir3.h"
+#include "ir3_compiler.h"
+
+#ifdef DEBUG
+#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
+#else
+#define SCHED_DEBUG 0
+#endif
+#define d(fmt, ...) do { if (SCHED_DEBUG) { \
+       printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
+} } while (0)
+
+#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
+       printf("SCHED: "fmt": ", ##__VA_ARGS__); \
+       ir3_print_instr(instr); \
+} } while (0)
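/* Usage sketch (illustrative, not part of the patch): with IR3_DBG_SCHEDMSGS
 * set in ir3_shader_debug, the macros above emit prefixed trace lines, e.g.
 *
 *     d("delay=%u", delay);      ->  "SCHED: delay=3"
 *     di(instr, "schedule");     ->  "SCHED: schedule: " + printed instruction
 *
 * In non-DEBUG builds SCHED_DEBUG is a constant 0, so the printf calls are
 * compiled away while the format arguments still get type-checked.
 */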
 
 /*
  * Instruction Scheduling:
@@ -53,10 +68,17 @@ struct ir3_sched_ctx {
        struct ir3_block *block;           /* the current block */
        struct list_head depth_list;       /* depth sorted unscheduled instrs */
        struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
-       struct ir3_instruction *addr;      /* current a0.x user, if any */
+       struct ir3_instruction *addr0;     /* current a0.x user, if any */
+       struct ir3_instruction *addr1;     /* current a1.x user, if any */
        struct ir3_instruction *pred;      /* current p0.x user, if any */
        int live_values;                   /* estimate of current live values */
+       int half_live_values;              /* estimate of current half precision live values */
        bool error;
+
+       unsigned live_threshold_hi;
+       unsigned live_threshold_lo;
+       unsigned depth_threshold_hi;
+       unsigned depth_threshold_lo;
 };
 
 static bool is_scheduled(struct ir3_instruction *instr)
@@ -64,29 +86,29 @@ static bool is_scheduled(struct ir3_instruction *instr)
        return !!(instr->flags & IR3_INSTR_MARK);
 }
 
-static bool is_sfu_or_mem(struct ir3_instruction *instr)
-{
-       return is_sfu(instr) || is_mem(instr);
-}
-
 static void
 unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
        struct ir3_instruction *src;
 
-       foreach_ssa_src_n(src, n, instr) {
+       foreach_ssa_src_n (src, n, instr) {
                if (__is_false_dep(instr, n))
                        continue;
                if (instr->block != src->block)
                        continue;
-               if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
+               if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
                        unuse_each_src(ctx, src);
                } else {
                        debug_assert(src->use_count > 0);
 
                        if (--src->use_count == 0) {
-                               ctx->live_values -= dest_regs(src);
-                               debug_assert(ctx->live_values >= 0);
+                               if (is_half(src)) {
+                                       ctx->half_live_values -= dest_regs(src);
+                                       debug_assert(ctx->half_live_values >= 0);
+                               } else {
+                                       ctx->live_values -= dest_regs(src);
+                                       debug_assert(ctx->live_values >= 0);
+                               }
                        }
                }
        }
@@ -108,10 +130,14 @@ transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
 
        debug_assert(is_scheduled(orig_instr));
 
-       foreach_ssa_src_n(src, n, new_instr) {
+       foreach_ssa_src_n (src, n, new_instr) {
                if (__is_false_dep(new_instr, n))
                        continue;
-               ctx->live_values += dest_regs(src);
+               if (is_half(new_instr)) {
+                       ctx->half_live_values += dest_regs(src);
+               } else {
+                       ctx->live_values += dest_regs(src);
+               }
                use_instr(src);
        }
 
@@ -123,7 +149,7 @@ use_each_src(struct ir3_instruction *instr)
 {
        struct ir3_instruction *src;
 
-       foreach_ssa_src_n(src, n, instr) {
+       foreach_ssa_src_n (src, n, instr) {
                if (__is_false_dep(instr, n))
                        continue;
                use_instr(src);
@@ -133,7 +159,7 @@ use_each_src(struct ir3_instruction *instr)
 static void
 use_instr(struct ir3_instruction *instr)
 {
-       if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
+       if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
                use_each_src(instr);
        } else {
                instr->use_count++;
@@ -141,27 +167,32 @@ use_instr(struct ir3_instruction *instr)
 }
 
 static void
-update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *scheduled)
 {
-       if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+       if ((scheduled->opc == OPC_META_COLLECT) || (scheduled->opc == OPC_META_SPLIT))
                return;
 
-       ctx->live_values += dest_regs(instr);
-       unuse_each_src(ctx, instr);
+       if ((scheduled->regs_count > 0) && is_half(scheduled)) {
+               ctx->half_live_values += dest_regs(scheduled);
+       } else {
+               ctx->live_values += dest_regs(scheduled);
+       }
+
+       unuse_each_src(ctx, scheduled);
 }
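/* Illustrative note (not part of the patch): the two counters are later
 * combined in half-register units, a full register counting as two halves
 * (see find_eligible_instr() below):
 *
 *     unsigned live = (2 * ctx->live_values) + ctx->half_live_values;
 *
 * so e.g. 8 full-precision plus 4 half-precision live values weigh in as
 * 20 halves against the live_threshold_* limits.
 */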
 
 static void
 update_use_count(struct ir3 *ir)
 {
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+       foreach_block (block, &ir->block_list) {
+               foreach_instr (instr, &block->instr_list) {
                        instr->use_count = 0;
                }
        }
 
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-                       if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+       foreach_block (block, &ir->block_list) {
+               foreach_instr (instr, &block->instr_list) {
+                       if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
                                continue;
 
                        use_each_src(instr);
@@ -170,14 +201,9 @@ update_use_count(struct ir3 *ir)
 
        /* Shader outputs are also used:
         */
-       for (unsigned i = 0; i <  ir->noutputs; i++) {
-               struct ir3_instruction  *out = ir->outputs[i];
-
-               if (!out)
-                       continue;
-
+       struct ir3_instruction *out;
+       foreach_output (out, ir)
                use_instr(out);
-       }
 }
 
 #define NULL_INSTR ((void *)~0)
@@ -185,7 +211,7 @@ update_use_count(struct ir3 *ir)
 static void
 clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
-       list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
+       foreach_instr (instr2, &ctx->depth_list) {
                if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
                        instr2->data = NULL;
        }
@@ -196,20 +222,18 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
        debug_assert(ctx->block == instr->block);
 
-       /* maybe there is a better way to handle this than just stuffing
-        * a nop.. ideally we'd know about this constraint in the
-        * scheduling and depth calculation..
-        */
-       if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-               ir3_NOP(ctx->block);
-
        /* remove from depth list:
         */
        list_delinit(&instr->node);
 
-       if (writes_addr(instr)) {
-               debug_assert(ctx->addr == NULL);
-               ctx->addr = instr;
+       if (writes_addr0(instr)) {
+               debug_assert(ctx->addr0 == NULL);
+               ctx->addr0 = instr;
+       }
+
+       if (writes_addr1(instr)) {
+               debug_assert(ctx->addr1 == NULL);
+               ctx->addr1 = instr;
        }
 
        if (writes_pred(instr)) {
@@ -219,12 +243,14 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 
        instr->flags |= IR3_INSTR_MARK;
 
+       di(instr, "schedule");
+
        list_addtail(&instr->node, &instr->block->instr_list);
        ctx->scheduled = instr;
 
        update_live_values(ctx, instr);
 
-       if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
+       if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr) || is_input(instr)) {
                clear_cache(ctx, NULL);
        } else {
                /* invalidate only the necessary entries.. */
@@ -253,117 +279,6 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
        return d;
 }
 
-/**
- * @block: the block to search in, starting from end; in first pass,
- *    this will be the block the instruction would be inserted into
- *    (but has not yet, ie. it only contains already scheduled
- *    instructions).  For intra-block scheduling (second pass), this
- *    would be one of the predecessor blocks.
- * @instr: the instruction to search for
- * @maxd:  max distance, bail after searching this # of instruction
- *    slots, since it means the instruction we are looking for is
- *    far enough away
- * @pred:  if true, recursively search into predecessor blocks to
- *    find the worst case (shortest) distance (only possible after
- *    individual blocks are all scheduled
- */
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr,
-               unsigned maxd, bool pred)
-{
-       unsigned d = 0;
-
-       list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
-               if ((n == instr) || (d >= maxd))
-                       return d;
-               /* NOTE: don't count branch/jump since we don't know yet if they will
-                * be eliminated later in resolve_jumps().. really should do that
-                * earlier so we don't have this constraint.
-                */
-               if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
-                       d++;
-       }
-
-       /* if coming from a predecessor block, assume it is assigned far
-        * enough away.. we'll fix up later.
-        */
-       if (!pred)
-               return maxd;
-
-       if (pred && (block->data != block)) {
-               /* Search into predecessor blocks, finding the one with the
-                * shortest distance, since that will be the worst case
-                */
-               unsigned min = maxd - d;
-
-               /* (ab)use block->data to prevent recursion: */
-               block->data = block;
-
-               set_foreach(block->predecessors, entry) {
-                       struct ir3_block *pred = (struct ir3_block *)entry->key;
-                       unsigned n;
-
-                       n = distance(pred, instr, min, pred);
-
-                       min = MIN2(min, n);
-               }
-
-               block->data = NULL;
-               d += min;
-       }
-
-       return d;
-}
-
-/* calculate delay for specified src: */
-static unsigned
-delay_calc_srcn(struct ir3_block *block,
-               struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer,
-               unsigned srcn, bool soft, bool pred)
-{
-       unsigned delay = 0;
-
-       if (is_meta(assigner)) {
-               struct ir3_instruction *src;
-               foreach_ssa_src(src, assigner) {
-                       unsigned d;
-                       d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
-                       delay = MAX2(delay, d);
-               }
-       } else {
-               if (soft) {
-                       if (is_sfu(assigner)) {
-                               delay = 4;
-                       } else {
-                               delay = ir3_delayslots(assigner, consumer, srcn);
-                       }
-               } else {
-                       delay = ir3_delayslots(assigner, consumer, srcn);
-               }
-               delay -= distance(block, assigner, delay, pred);
-       }
-
-       return delay;
-}
-
-/* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned
-delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-               bool soft, bool pred)
-{
-       unsigned delay = 0;
-       struct ir3_instruction *src;
-
-       foreach_ssa_src_n(src, i, instr) {
-               unsigned d;
-               d = delay_calc_srcn(block, src, instr, i, soft, pred);
-               delay = MAX2(delay, d);
-       }
-
-       return delay;
-}
-
 struct ir3_sched_notes {
        /* there is at least one kill which could be scheduled, except
         * for unscheduled bary.f's:
@@ -372,7 +287,7 @@ struct ir3_sched_notes {
        /* there is at least one instruction that could be scheduled,
         * except for conflicting address/predicate register usage:
         */
-       bool addr_conflict, pred_conflict;
+       bool addr0_conflict, addr1_conflict, pred_conflict;
 };
 
 /* could an instruction be scheduled if specified ssa src was scheduled? */
@@ -380,7 +295,7 @@ static bool
 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
 {
        struct ir3_instruction *other_src;
-       foreach_ssa_src(other_src, instr) {
+       foreach_ssa_src (other_src, instr) {
                /* if dependency not scheduled, we aren't ready yet: */
                if ((src != other_src) && !is_scheduled(other_src)) {
                        return false;
@@ -405,11 +320,28 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
         * TODO if any instructions use pred register and have other
         * src args, we would need to do the same for writes_pred()..
         */
-       if (writes_addr(instr)) {
+       if (writes_addr0(instr)) {
+               struct ir3 *ir = instr->block->shader;
+               bool ready = false;
+               for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
+                       struct ir3_instruction *indirect = ir->a0_users[i];
+                       if (!indirect)
+                               continue;
+                       if (indirect->address != instr)
+                               continue;
+                       ready = could_sched(indirect, instr);
+               }
+
+               /* nothing could be scheduled, so keep looking: */
+               if (!ready)
+                       return false;
+       }
+
+       if (writes_addr1(instr)) {
                struct ir3 *ir = instr->block->shader;
                bool ready = false;
-               for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
-                       struct ir3_instruction *indirect = ir->indirects[i];
+               for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
+                       struct ir3_instruction *indirect = ir->a1_users[i];
                        if (!indirect)
                                continue;
                        if (indirect->address != instr)
@@ -426,9 +358,15 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
         * register is currently in use, we need to defer until it is
         * free:
         */
-       if (writes_addr(instr) && ctx->addr) {
-               debug_assert(ctx->addr != instr);
-               notes->addr_conflict = true;
+       if (writes_addr0(instr) && ctx->addr0) {
+               debug_assert(ctx->addr0 != instr);
+               notes->addr0_conflict = true;
+               return false;
+       }
+
+       if (writes_addr1(instr) && ctx->addr1) {
+               debug_assert(ctx->addr1 != instr);
+               notes->addr1_conflict = true;
                return false;
        }
 
@@ -494,7 +432,7 @@ find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
        }
 
        /* find unscheduled srcs: */
-       foreach_ssa_src(src, instr) {
+       foreach_ssa_src (src, instr) {
                if (!is_scheduled(src) && (src->block == instr->block)) {
                        debug_assert(nsrcs < ARRAY_SIZE(srcs));
                        srcs[nsrcs++] = src;
@@ -535,26 +473,26 @@ live_effect(struct ir3_instruction *instr)
        int new_live = dest_regs(instr);
        int old_live = 0;
 
-       foreach_ssa_src_n(src, n, instr) {
+       foreach_ssa_src_n (src, n, instr) {
                if (__is_false_dep(instr, n))
                        continue;
 
                if (instr->block != src->block)
                        continue;
 
-               /* for fanout/split, just pass things along to the real src: */
-               if (src->opc == OPC_META_FO)
+               /* for split, just pass things along to the real src: */
+               if (src->opc == OPC_META_SPLIT)
                        src = ssa(src->regs[1]);
 
-               /* for fanin/collect, if this is the last use of *each* src,
+               /* for collect, if this is the last use of *each* src,
                 * then it will decrease the live values, since RA treats
                 * them as a whole:
                 */
-               if (src->opc == OPC_META_FI) {
+               if (src->opc == OPC_META_COLLECT) {
                        struct ir3_instruction *src2;
                        bool last_use = true;
 
-                       foreach_ssa_src(src2, src) {
+                       foreach_ssa_src (src2, src) {
                                if (src2->use_count > 1) {
                                        last_use = false;
                                        break;
@@ -591,7 +529,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
         * get traversed both when they appear as ssa src to a later instruction
         * as well as where they appear in the depth_list.
         */
-       list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+       foreach_instr_rev (instr, &ctx->depth_list) {
                struct ir3_instruction *candidate;
 
                candidate = find_instr_recursive(ctx, notes, instr);
@@ -607,7 +545,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
        /* traverse the list a second time.. but since we cache the result of
         * find_instr_recursive() it isn't as bad as it looks.
         */
-       list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+       foreach_instr_rev (instr, &ctx->depth_list) {
                struct ir3_instruction *candidate;
 
                candidate = find_instr_recursive(ctx, notes, instr);
@@ -616,6 +554,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 
                /* determine net change to # of live values: */
                int le = live_effect(candidate);
+               unsigned live_values = (2 * ctx->live_values) + ctx->half_live_values;
 
                /* if there is a net increase in # of live values, then apply some
                 * threshold to avoid instructions getting scheduled *too* early
@@ -624,10 +563,10 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
                if (le >= 1) {
                        unsigned threshold;
 
-                       if (ctx->live_values > 4*4) {
-                               threshold = 4;
+                       if (live_values > ctx->live_threshold_lo) {
+                               threshold = ctx->depth_threshold_lo;
                        } else {
-                               threshold = 6;
+                               threshold = ctx->depth_threshold_hi;
                        }
 
                        /* Filter out any "shallow" instructions which would otherwise
@@ -646,14 +585,14 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
                                continue;
                }
 
-               int rank = delay_calc(ctx->block, candidate, soft, false);
+               int rank = ir3_delay_calc(ctx->block, candidate, soft, false);
 
                /* if too many live values, prioritize instructions that reduce the
                 * number of live values:
                 */
-               if (ctx->live_values > 16*4) {
+               if (live_values > ctx->live_threshold_hi) {
                        rank = le;
-               } else if (ctx->live_values > 4*4) {
+               } else if (live_values > ctx->live_threshold_lo) {
                        rank += le;
                }
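/* Worked example (illustrative): rank starts out as the candidate's delay, so
 * under low register pressure the lowest-latency candidate ranks best.  Once
 * the weighted live count passes live_threshold_lo, a candidate with le = +2
 * is penalized by rank += 2; past live_threshold_hi the delay is dropped
 * entirely (rank = le), so candidates that shrink the live set (negative le)
 * rank ahead of everything else.
 */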
 
@@ -675,23 +614,21 @@ split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
        return new_instr;
 }
 
-/* "spill" the address register by remapping any unscheduled
+/* "spill" the address registers by remapping any unscheduled
  * instructions which depend on the current address register
  * to a clone of the instruction which wrote the address reg.
  */
 static struct ir3_instruction *
-split_addr(struct ir3_sched_ctx *ctx)
+split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
+                  struct ir3_instruction **users, unsigned users_count)
 {
-       struct ir3 *ir;
        struct ir3_instruction *new_addr = NULL;
        unsigned i;
 
-       debug_assert(ctx->addr);
+       debug_assert(*addr);
 
-       ir = ctx->addr->block->shader;
-
-       for (i = 0; i < ir->indirects_count; i++) {
-               struct ir3_instruction *indirect = ir->indirects[i];
+       for (i = 0; i < users_count; i++) {
+               struct ir3_instruction *indirect = users[i];
 
                if (!indirect)
                        continue;
@@ -703,9 +640,9 @@ split_addr(struct ir3_sched_ctx *ctx)
                /* remap remaining instructions using current addr
                 * to new addr:
                 */
-               if (indirect->address == ctx->addr) {
+               if (indirect->address == *addr) {
                        if (!new_addr) {
-                               new_addr = split_instr(ctx, ctx->addr);
+                               new_addr = split_instr(ctx, *addr);
                                /* original addr is scheduled, but new one isn't: */
                                new_addr->flags &= ~IR3_INSTR_MARK;
                        }
@@ -715,7 +652,7 @@ split_addr(struct ir3_sched_ctx *ctx)
        }
 
        /* all remaining indirects remapped to new addr: */
-       ctx->addr = NULL;
+       *addr = NULL;
 
        return new_addr;
 }
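/* Usage sketch (illustrative): the generalized helper is called for whichever
 * address register is deadlocked, e.g. in sched_block() further down:
 *
 *     new_instr = split_addr(ctx, &ctx->addr0, ir->a0_users, ir->a0_users_count);
 *
 * and likewise with &ctx->addr1 / ir->a1_users for a1.x.
 */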
@@ -772,7 +709,8 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
        ctx->block = block;
 
        /* addr/pred writes are per-block: */
-       ctx->addr = NULL;
+       ctx->addr0 = NULL;
+       ctx->addr1 = NULL;
        ctx->pred = NULL;
 
        /* move all instructions to the unscheduled list, and
@@ -783,19 +721,30 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
        list_inithead(&block->instr_list);
        list_inithead(&ctx->depth_list);
 
-       /* first a pre-pass to schedule all meta:input instructions
-        * (which need to appear first so that RA knows the register is
-        * occupied), and move remaining to depth sorted list:
+       /* First schedule all meta:input instructions, followed by
+        * tex-prefetch.  We want all of the instructions that load
+        * values into registers before the shader starts to be
+        * scheduled ahead of any other instructions.  In particular,
+        * inputs should come before prefetches: a FS's bary_ij input
+        * may not actually be live in the shader, but it should not
+        * be scheduled on top of any other input (though it can be
+        * overwritten by a tex prefetch).
+        *
+        * Finally, move all the remaining instructions to the
+        * depth-sorted list.
         */
-       list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
-               if (instr->opc == OPC_META_INPUT) {
+       foreach_instr_safe (instr, &unscheduled_list)
+               if (instr->opc == OPC_META_INPUT)
+                       schedule(ctx, instr);
+
+       foreach_instr_safe (instr, &unscheduled_list)
+               if (instr->opc == OPC_META_TEX_PREFETCH)
                        schedule(ctx, instr);
-               } else {
-                       ir3_insert_by_depth(instr, &ctx->depth_list);
-               }
-       }
 
-       while (!list_empty(&ctx->depth_list)) {
+       foreach_instr_safe (instr, &unscheduled_list)
+               ir3_insert_by_depth(instr, &ctx->depth_list);
+
+       while (!list_is_empty(&ctx->depth_list)) {
                struct ir3_sched_notes notes = {0};
                struct ir3_instruction *instr;
 
@@ -804,7 +753,8 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                        instr = find_eligible_instr(ctx, &notes, false);
 
                if (instr) {
-                       unsigned delay = delay_calc(ctx->block, instr, false, false);
+                       unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
+                       d("delay=%u", delay);
 
                        /* and if we run out of instructions that can be scheduled,
                         * then it is time for nop's:
@@ -818,14 +768,19 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                        schedule(ctx, instr);
                } else {
                        struct ir3_instruction *new_instr = NULL;
+                       struct ir3 *ir = block->shader;
 
                        /* nothing available to schedule.. if we are blocked on
                         * address/predicate register conflict, then break the
                         * deadlock by cloning the instruction that wrote that
                         * reg:
                         */
-                       if (notes.addr_conflict) {
-                               new_instr = split_addr(ctx);
+                       if (notes.addr0_conflict) {
+                               new_instr = split_addr(ctx, &ctx->addr0,
+                                                                          ir->a0_users, ir->a0_users_count);
+                       } else if (notes.addr1_conflict) {
+                               new_instr = split_addr(ctx, &ctx->addr1,
+                                                                          ir->a1_users, ir->a1_users_count);
                        } else if (notes.pred_conflict) {
                                new_instr = split_pred(ctx);
                        } else {
@@ -848,94 +803,21 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                        }
                }
        }
-
-       /* And lastly, insert branch/jump instructions to take us to
-        * the next block.  Later we'll strip back out the branches
-        * that simply jump to next instruction.
-        */
-       if (block->successors[1]) {
-               /* if/else, conditional branches to "then" or "else": */
-               struct ir3_instruction *br;
-               unsigned delay = 6;
-
-               debug_assert(ctx->pred);
-               debug_assert(block->condition);
-
-               delay -= distance(ctx->block, ctx->pred, delay, false);
-
-               while (delay > 0) {
-                       ir3_NOP(block);
-                       delay--;
-               }
-
-               /* create "else" branch first (since "then" block should
-                * frequently/always end up being a fall-thru):
-                */
-               br = ir3_BR(block);
-               br->cat0.inv = true;
-               br->cat0.target = block->successors[1];
-
-               /* NOTE: we have to hard code delay of 6 above, since
-                * we want to insert the nop's before constructing the
-                * branch.  Throw in an assert so we notice if this
-                * ever breaks on future generation:
-                */
-               debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
-
-               br = ir3_BR(block);
-               br->cat0.target = block->successors[0];
-
-       } else if (block->successors[0]) {
-               /* otherwise unconditional jump to next block: */
-               struct ir3_instruction *jmp;
-
-               jmp = ir3_JUMP(block);
-               jmp->cat0.target = block->successors[0];
-       }
-
-       /* NOTE: if we kept track of the predecessors, we could do a better
-        * job w/ (jp) flags.. every node w/ > predecessor is a join point.
-        * Note that as we eliminate blocks which contain only an unconditional
-        * jump we probably need to propagate (jp) flag..
-        */
 }
 
-/* After scheduling individual blocks, we still could have cases where
- * one (or more) paths into a block, a value produced by a previous
- * has too few delay slots to be legal.  We can't deal with this in the
- * first pass, because loops (ie. we can't ensure all predecessor blocks
- * are already scheduled in the first pass).  All we can really do at
- * this point is stuff in extra nop's until things are legal.
- */
 static void
-sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+setup_thresholds(struct ir3_sched_ctx *ctx, struct ir3 *ir)
 {
-       unsigned n = 0;
-
-       ctx->block = block;
-
-       list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
-               unsigned delay = 0;
-
-               set_foreach(block->predecessors, entry) {
-                       struct ir3_block *pred = (struct ir3_block *)entry->key;
-                       unsigned d = delay_calc(pred, instr, false, true);
-                       delay = MAX2(d, delay);
-               }
-
-               while (delay > n) {
-                       struct ir3_instruction *nop = ir3_NOP(block);
-
-                       /* move to before instr: */
-                       list_delinit(&nop->node);
-                       list_addtail(&nop->node, &instr->node);
-
-                       n++;
-               }
-
-               /* we can bail once we hit worst case delay: */
-               if (++n > 6)
-                       break;
+       if (ir3_has_latency_to_hide(ir)) {
+               ctx->live_threshold_hi = 2 * 16 * 4;
+               ctx->live_threshold_lo = 2 * 4 * 4;
+               ctx->depth_threshold_hi = 6;
+               ctx->depth_threshold_lo = 4;
+       } else {
+               ctx->live_threshold_hi = 2 * 16 * 4;
+               ctx->live_threshold_lo = 2 * 12 * 4;
+               ctx->depth_threshold_hi = 16;
+               ctx->depth_threshold_lo = 16;
        }
 }
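/* Decoding the constants (illustrative): thresholds are in half-register
 * units, a full register counting as two halves.  With latency to hide the
 * values reproduce the previous behaviour: 2*16*4 = 128 and 2*4*4 = 32 halves
 * match the old 16*4 / 4*4 full-register cutoffs, and the depth thresholds
 * 6/4 match the old literals.  Without latency to hide, the low-water mark
 * rises to 2*12*4 = 96 halves and the depth cutoff to a flat 16.
 */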
 
@@ -943,18 +825,17 @@ int ir3_sched(struct ir3 *ir)
 {
        struct ir3_sched_ctx ctx = {0};
 
+       setup_thresholds(&ctx, ir);
+
        ir3_clear_mark(ir);
        update_use_count(ir);
 
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+       foreach_block (block, &ir->block_list) {
                ctx.live_values = 0;
+               ctx.half_live_values = 0;
                sched_block(&ctx, block);
        }
 
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               sched_intra_block(&ctx, block);
-       }
-
        if (ctx.error)
                return -1;
 
@@ -1066,7 +947,7 @@ add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
 static void
 calculate_deps(struct ir3_block *block)
 {
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+       foreach_instr (instr, &block->instr_list) {
                if (instr->barrier_class) {
                        add_barrier_deps(block, instr);
                }
@@ -1076,7 +957,7 @@ calculate_deps(struct ir3_block *block)
 void
 ir3_sched_add_deps(struct ir3 *ir)
 {
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+       foreach_block (block, &ir->block_list) {
                calculate_deps(block);
        }
 }