ir3: Plumb through support for a1.x
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 96897f60e9249ef3c74196d125041fbe543424da..9d0bf69d193c7baa1e02af0334cf71bc35a36f75 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
 #include "util/u_math.h"
 
 #include "ir3.h"
+#include "ir3_compiler.h"
+
+#ifdef DEBUG
+#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
+#else
+#define SCHED_DEBUG 0
+#endif
+#define d(fmt, ...) do { if (SCHED_DEBUG) { \
+       printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
+} } while (0)
+
+#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
+       printf("SCHED: "fmt": ", ##__VA_ARGS__); \
+       ir3_print_instr(instr); \
+} } while (0)
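/* Usage sketch (illustrative, not part of the patch): with IR3_DBG_SCHEDMSGS
 * set in ir3_shader_debug, the macros above emit prefixed trace lines, e.g.
 *
 *     d("delay=%u", delay);      ->  "SCHED: delay=3"
 *     di(instr, "schedule");     ->  "SCHED: schedule: " + printed instruction
 *
 * In non-DEBUG builds SCHED_DEBUG is a constant 0, so the printf calls are
 * compiled away while the format arguments still get type-checked.
 */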
 
 /*
  * Instruction Scheduling:
@@ -53,10 +68,17 @@ struct ir3_sched_ctx {
        struct ir3_block *block;           /* the current block */
        struct list_head depth_list;       /* depth sorted unscheduled instrs */
        struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
-       struct ir3_instruction *addr;      /* current a0.x user, if any */
+       struct ir3_instruction *addr0;     /* current a0.x user, if any */
+       struct ir3_instruction *addr1;     /* current a1.x user, if any */
        struct ir3_instruction *pred;      /* current p0.x user, if any */
        int live_values;                   /* estimate of current live values */
+       int half_live_values;              /* estimate of current half precision live values */
        bool error;
+
+       unsigned live_threshold_hi;
+       unsigned live_threshold_lo;
+       unsigned depth_threshold_hi;
+       unsigned depth_threshold_lo;
 };
 
 static bool is_scheduled(struct ir3_instruction *instr)
@@ -64,29 +86,29 @@ static bool is_scheduled(struct ir3_instruction *instr)
        return !!(instr->flags & IR3_INSTR_MARK);
 }
 
-static bool is_sfu_or_mem(struct ir3_instruction *instr)
-{
-       return is_sfu(instr) || is_mem(instr);
-}
-
 static void
 unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
        struct ir3_instruction *src;
 
-       foreach_ssa_src_n(src, n, instr) {
+       foreach_ssa_src_n (src, n, instr) {
                if (__is_false_dep(instr, n))
                        continue;
                if (instr->block != src->block)
                        continue;
-               if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
+               if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
                        unuse_each_src(ctx, src);
                } else {
                        debug_assert(src->use_count > 0);
 
                        if (--src->use_count == 0) {
-                               ctx->live_values -= dest_regs(src);
-                               debug_assert(ctx->live_values >= 0);
+                               if (is_half(src)) {
+                                       ctx->half_live_values -= dest_regs(src);
+                                       debug_assert(ctx->half_live_values >= 0);
+                               } else {
+                                       ctx->live_values -= dest_regs(src);
+                                       debug_assert(ctx->live_values >= 0);
+                               }
                        }
                }
        }
@@ -108,10 +130,14 @@ transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
 
        debug_assert(is_scheduled(orig_instr));
 
-       foreach_ssa_src_n(src, n, new_instr) {
+       foreach_ssa_src_n (src, n, new_instr) {
                if (__is_false_dep(new_instr, n))
                        continue;
-               ctx->live_values += dest_regs(src);
+               if (is_half(new_instr)) {
+                       ctx->half_live_values += dest_regs(src);
+               } else {
+                       ctx->live_values += dest_regs(src);
+               }
                use_instr(src);
        }
 
@@ -123,7 +149,7 @@ use_each_src(struct ir3_instruction *instr)
 {
        struct ir3_instruction *src;
 
-       foreach_ssa_src_n(src, n, instr) {
+       foreach_ssa_src_n (src, n, instr) {
                if (__is_false_dep(instr, n))
                        continue;
                use_instr(src);
@@ -133,7 +159,7 @@ use_each_src(struct ir3_instruction *instr)
 static void
 use_instr(struct ir3_instruction *instr)
 {
-       if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
+       if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
                use_each_src(instr);
        } else {
                instr->use_count++;
@@ -141,27 +167,32 @@ use_instr(struct ir3_instruction *instr)
 }
 
 static void
-update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *scheduled)
 {
-       if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+       if ((scheduled->opc == OPC_META_COLLECT) || (scheduled->opc == OPC_META_SPLIT))
                return;
 
-       ctx->live_values += dest_regs(instr);
-       unuse_each_src(ctx, instr);
+       if ((scheduled->regs_count > 0) && is_half(scheduled)) {
+               ctx->half_live_values += dest_regs(scheduled);
+       } else {
+               ctx->live_values += dest_regs(scheduled);
+       }
+
+       unuse_each_src(ctx, scheduled);
 }
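/* Illustrative note (not part of the patch): the two counters are later
 * combined in half-register units, a full register counting as two halves
 * (see find_eligible_instr() below):
 *
 *     unsigned live = (2 * ctx->live_values) + ctx->half_live_values;
 *
 * so e.g. 8 full-precision plus 4 half-precision live values weigh in as
 * 20 halves against the live_threshold_* limits.
 */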
 
 static void
 update_use_count(struct ir3 *ir)
 {
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+       foreach_block (block, &ir->block_list) {
+               foreach_instr (instr, &block->instr_list) {
                        instr->use_count = 0;
                }
        }
 
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-                       if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+       foreach_block (block, &ir->block_list) {
+               foreach_instr (instr, &block->instr_list) {
+                       if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
                                continue;
 
                        use_each_src(instr);
@@ -170,14 +201,9 @@ update_use_count(struct ir3 *ir)
 
        /* Shader outputs are also used:
         */
-       for (unsigned i = 0; i <  ir->noutputs; i++) {
-               struct ir3_instruction  *out = ir->outputs[i];
-
-               if (!out)
-                       continue;
-
+       struct ir3_instruction *out;
+       foreach_output (out, ir)
                use_instr(out);
-       }
 }
 
 #define NULL_INSTR ((void *)~0)
@@ -185,7 +211,7 @@ update_use_count(struct ir3 *ir)
 static void
 clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
-       list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
+       foreach_instr (instr2, &ctx->depth_list) {
                if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
                        instr2->data = NULL;
        }
@@ -196,20 +222,18 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
        debug_assert(ctx->block == instr->block);
 
-       /* maybe there is a better way to handle this than just stuffing
-        * a nop.. ideally we'd know about this constraint in the
-        * scheduling and depth calculation..
-        */
-       if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-               ir3_NOP(ctx->block);
-
        /* remove from depth list:
         */
        list_delinit(&instr->node);
 
-       if (writes_addr(instr)) {
-               debug_assert(ctx->addr == NULL);
-               ctx->addr = instr;
+       if (writes_addr0(instr)) {
+               debug_assert(ctx->addr0 == NULL);
+               ctx->addr0 = instr;
+       }
+
+       if (writes_addr1(instr)) {
+               debug_assert(ctx->addr1 == NULL);
+               ctx->addr1 = instr;
        }
 
        if (writes_pred(instr)) {
@@ -219,12 +243,14 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 
        instr->flags |= IR3_INSTR_MARK;
 
+       di(instr, "schedule");
+
        list_addtail(&instr->node, &instr->block->instr_list);
        ctx->scheduled = instr;
 
        update_live_values(ctx, instr);
 
-       if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
+       if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr) || is_input(instr)) {
                clear_cache(ctx, NULL);
        } else {
                /* invalidate only the necessary entries.. */
@@ -253,117 +279,6 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
        return d;
 }
 
-/**
- * @block: the block to search in, starting from end; in first pass,
- *    this will be the block the instruction would be inserted into
- *    (but has not yet, ie. it only contains already scheduled
- *    instructions).  For intra-block scheduling (second pass), this
- *    would be one of the predecessor blocks.
- * @instr: the instruction to search for
- * @maxd:  max distance, bail after searching this # of instruction
- *    slots, since it means the instruction we are looking for is
- *    far enough away
- * @pred:  if true, recursively search into predecessor blocks to
- *    find the worst case (shortest) distance (only possible after
- *    individual blocks are all scheduled
- */
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr,
-               unsigned maxd, bool pred)
-{
-       unsigned d = 0;
-
-       list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
-               if ((n == instr) || (d >= maxd))
-                       return d;
-               /* NOTE: don't count branch/jump since we don't know yet if they will
-                * be eliminated later in resolve_jumps().. really should do that
-                * earlier so we don't have this constraint.
-                */
-               if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
-                       d++;
-       }
-
-       /* if coming from a predecessor block, assume it is assigned far
-        * enough away.. we'll fix up later.
-        */
-       if (!pred)
-               return maxd;
-
-       if (pred && (block->data != block)) {
-               /* Search into predecessor blocks, finding the one with the
-                * shortest distance, since that will be the worst case
-                */
-               unsigned min = maxd - d;
-
-               /* (ab)use block->data to prevent recursion: */
-               block->data = block;
-
-               set_foreach(block->predecessors, entry) {
-                       struct ir3_block *pred = (struct ir3_block *)entry->key;
-                       unsigned n;
-
-                       n = distance(pred, instr, min, pred);
-
-                       min = MIN2(min, n);
-               }
-
-               block->data = NULL;
-               d += min;
-       }
-
-       return d;
-}
-
-/* calculate delay for specified src: */
-static unsigned
-delay_calc_srcn(struct ir3_block *block,
-               struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer,
-               unsigned srcn, bool soft, bool pred)
-{
-       unsigned delay = 0;
-
-       if (is_meta(assigner)) {
-               struct ir3_instruction *src;
-               foreach_ssa_src(src, assigner) {
-                       unsigned d;
-                       d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
-                       delay = MAX2(delay, d);
-               }
-       } else {
-               if (soft) {
-                       if (is_sfu(assigner)) {
-                               delay = 4;
-                       } else {
-                               delay = ir3_delayslots(assigner, consumer, srcn);
-                       }
-               } else {
-                       delay = ir3_delayslots(assigner, consumer, srcn);
-               }
-               delay -= distance(block, assigner, delay, pred);
-       }
-
-       return delay;
-}
-
-/* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned
-delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-               bool soft, bool pred)
-{
-       unsigned delay = 0;
-       struct ir3_instruction *src;
-
-       foreach_ssa_src_n(src, i, instr) {
-               unsigned d;
-               d = delay_calc_srcn(block, src, instr, i, soft, pred);
-               delay = MAX2(delay, d);
-       }
-
-       return delay;
-}
-
 struct ir3_sched_notes {
        /* there is at least one kill which could be scheduled, except
         * for unscheduled bary.f's:
@@ -372,7 +287,7 @@ struct ir3_sched_notes {
        /* there is at least one instruction that could be scheduled,
         * except for conflicting address/predicate register usage:
         */
-       bool addr_conflict, pred_conflict;
+       bool addr0_conflict, addr1_conflict, pred_conflict;
 };
 
 /* could an instruction be scheduled if specified ssa src was scheduled? */
@@ -380,7 +295,7 @@ static bool
 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
 {
        struct ir3_instruction *other_src;
-       foreach_ssa_src(other_src, instr) {
+       foreach_ssa_src (other_src, instr) {
                /* if dependency not scheduled, we aren't ready yet: */
                if ((src != other_src) && !is_scheduled(other_src)) {
                        return false;
@@ -405,11 +320,28 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
         * TODO if any instructions use pred register and have other
         * src args, we would need to do the same for writes_pred()..
         */
-       if (writes_addr(instr)) {
+       if (writes_addr0(instr)) {
+               struct ir3 *ir = instr->block->shader;
+               bool ready = false;
+               for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
+                       struct ir3_instruction *indirect = ir->a0_users[i];
+                       if (!indirect)
+                               continue;
+                       if (indirect->address != instr)
+                               continue;
+                       ready = could_sched(indirect, instr);
+               }
+
+               /* nothing could be scheduled, so keep looking: */
+               if (!ready)
+                       return false;
+       }
+
+       if (writes_addr1(instr)) {
                struct ir3 *ir = instr->block->shader;
                bool ready = false;
-               for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
-                       struct ir3_instruction *indirect = ir->indirects[i];
+               for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
+                       struct ir3_instruction *indirect = ir->a1_users[i];
                        if (!indirect)
                                continue;
                        if (indirect->address != instr)
@@ -426,9 +358,15 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
         * register is currently in use, we need to defer until it is
         * free:
         */
-       if (writes_addr(instr) && ctx->addr) {
-               debug_assert(ctx->addr != instr);
-               notes->addr_conflict = true;
+       if (writes_addr0(instr) && ctx->addr0) {
+               debug_assert(ctx->addr0 != instr);
+               notes->addr0_conflict = true;
+               return false;
+       }
+
+       if (writes_addr1(instr) && ctx->addr1) {
+               debug_assert(ctx->addr1 != instr);
+               notes->addr1_conflict = true;
                return false;
        }
 
@@ -494,7 +432,7 @@ find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
        }
 
        /* find unscheduled srcs: */
-       foreach_ssa_src(src, instr) {
+       foreach_ssa_src (src, instr) {
                if (!is_scheduled(src) && (src->block == instr->block)) {
                        debug_assert(nsrcs < ARRAY_SIZE(srcs));
                        srcs[nsrcs++] = src;
@@ -535,26 +473,26 @@ live_effect(struct ir3_instruction *instr)
        int new_live = dest_regs(instr);
        int old_live = 0;
 
-       foreach_ssa_src_n(src, n, instr) {
+       foreach_ssa_src_n (src, n, instr) {
                if (__is_false_dep(instr, n))
                        continue;
 
                if (instr->block != src->block)
                        continue;
 
-               /* for fanout/split, just pass things along to the real src: */
-               if (src->opc == OPC_META_FO)
+               /* for split, just pass things along to the real src: */
+               if (src->opc == OPC_META_SPLIT)
                        src = ssa(src->regs[1]);
 
-               /* for fanin/collect, if this is the last use of *each* src,
+               /* for collect, if this is the last use of *each* src,
                 * then it will decrease the live values, since RA treats
                 * them as a whole:
                 */
-               if (src->opc == OPC_META_FI) {
+               if (src->opc == OPC_META_COLLECT) {
                        struct ir3_instruction *src2;
                        bool last_use = true;
 
-                       foreach_ssa_src(src2, src) {
+                       foreach_ssa_src (src2, src) {
                                if (src2->use_count > 1) {
                                        last_use = false;
                                        break;
@@ -591,7 +529,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
         * get traversed both when they appear as ssa src to a later instruction
         * as well as where they appear in the depth_list.
         */
-       list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+       foreach_instr_rev (instr, &ctx->depth_list) {
                struct ir3_instruction *candidate;
 
                candidate = find_instr_recursive(ctx, notes, instr);
@@ -607,7 +545,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
        /* traverse the list a second time.. but since we cache the result of
         * find_instr_recursive() it isn't as bad as it looks.
         */
-       list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+       foreach_instr_rev (instr, &ctx->depth_list) {
                struct ir3_instruction *candidate;
 
                candidate = find_instr_recursive(ctx, notes, instr);
@@ -616,6 +554,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 
                /* determine net change to # of live values: */
                int le = live_effect(candidate);
+               unsigned live_values = (2 * ctx->live_values) + ctx->half_live_values;
 
                /* if there is a net increase in # of live values, then apply some
                 * threshold to avoid instructions getting scheduled *too* early
@@ -624,10 +563,10 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
                if (le >= 1) {
                        unsigned threshold;
 
-                       if (ctx->live_values > 4*4) {
-                               threshold = 4;
+                       if (live_values > ctx->live_threshold_lo) {
+                               threshold = ctx->depth_threshold_lo;
                        } else {
-                               threshold = 6;
+                               threshold = ctx->depth_threshold_hi;
                        }
 
                        /* Filter out any "shallow" instructions which would otherwise
@@ -646,14 +585,14 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
                                continue;
                }
 
-               int rank = delay_calc(ctx->block, candidate, soft, false);
+               int rank = ir3_delay_calc(ctx->block, candidate, soft, false);
 
                /* if too many live values, prioritize instructions that reduce the
                 * number of live values:
                 */
-               if (ctx->live_values > 16*4) {
+               if (live_values > ctx->live_threshold_hi) {
                        rank = le;
-               } else if (ctx->live_values > 4*4) {
+               } else if (live_values > ctx->live_threshold_lo) {
                        rank += le;
                }
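/* Worked example (illustrative): rank starts out as the candidate's delay, so
 * under low register pressure the lowest-latency candidate ranks best.  Once
 * the weighted live count passes live_threshold_lo, a candidate with le = +2
 * is penalized by rank += 2; past live_threshold_hi the delay is dropped
 * entirely (rank = le), so candidates that shrink the live set (negative le)
 * rank ahead of everything else.
 */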
 
@@ -675,23 +614,21 @@ split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
        return new_instr;
 }
 
-/* "spill" the address register by remapping any unscheduled
+/* "spill" the address registers by remapping any unscheduled
  * instructions which depend on the current address register
  * to a clone of the instruction which wrote the address reg.
  */
 static struct ir3_instruction *
-split_addr(struct ir3_sched_ctx *ctx)
+split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
+                  struct ir3_instruction **users, unsigned users_count)
 {
-       struct ir3 *ir;
        struct ir3_instruction *new_addr = NULL;
        unsigned i;
 
-       debug_assert(ctx->addr);
+       debug_assert(*addr);
 
-       ir = ctx->addr->block->shader;
-
-       for (i = 0; i < ir->indirects_count; i++) {
-               struct ir3_instruction *indirect = ir->indirects[i];
+       for (i = 0; i < users_count; i++) {
+               struct ir3_instruction *indirect = users[i];
 
                if (!indirect)
                        continue;
@@ -703,9 +640,9 @@ split_addr(struct ir3_sched_ctx *ctx)
                /* remap remaining instructions using current addr
                 * to new addr:
                 */
-               if (indirect->address == ctx->addr) {
+               if (indirect->address == *addr) {
                        if (!new_addr) {
-                               new_addr = split_instr(ctx, ctx->addr);
+                               new_addr = split_instr(ctx, *addr);
                                /* original addr is scheduled, but new one isn't: */
                                new_addr->flags &= ~IR3_INSTR_MARK;
                        }
@@ -715,7 +652,7 @@ split_addr(struct ir3_sched_ctx *ctx)
        }
 
        /* all remaining indirects remapped to new addr: */
-       ctx->addr = NULL;
+       *addr = NULL;
 
        return new_addr;
 }
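/* Usage sketch (illustrative): the generalized helper is called for whichever
 * address register is deadlocked, e.g. in sched_block() further down:
 *
 *     new_instr = split_addr(ctx, &ctx->addr0, ir->a0_users, ir->a0_users_count);
 *
 * and likewise with &ctx->addr1 / ir->a1_users for a1.x.
 */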
@@ -772,7 +709,8 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
        ctx->block = block;
 
        /* addr/pred writes are per-block: */
-       ctx->addr = NULL;
+       ctx->addr0 = NULL;
+       ctx->addr1 = NULL;
        ctx->pred = NULL;
 
        /* move all instructions to the unscheduled list, and
@@ -783,19 +721,30 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
        list_inithead(&block->instr_list);
        list_inithead(&ctx->depth_list);
 
-       /* first a pre-pass to schedule all meta:input instructions
-        * (which need to appear first so that RA knows the register is
-        * occupied), and move remaining to depth sorted list:
+       /* First schedule all meta:input instructions, followed by
+        * tex-prefetch.  We want all of the instructions that load
+        * values into registers before the shader starts to be
+        * scheduled ahead of any other instructions.  In particular,
+        * inputs should come before prefetches: a FS's bary_ij input
+        * may not actually be live in the shader, but it should not
+        * be scheduled on top of any other input (though it can be
+        * overwritten by a tex prefetch).
+        *
+        * Finally, move all the remaining instructions to the
+        * depth-sorted list.
         */
-       list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
-               if (instr->opc == OPC_META_INPUT) {
+       foreach_instr_safe (instr, &unscheduled_list)
+               if (instr->opc == OPC_META_INPUT)
+                       schedule(ctx, instr);
+
+       foreach_instr_safe (instr, &unscheduled_list)
+               if (instr->opc == OPC_META_TEX_PREFETCH)
                        schedule(ctx, instr);
-               } else {
-                       ir3_insert_by_depth(instr, &ctx->depth_list);
-               }
-       }
 
-       while (!list_empty(&ctx->depth_list)) {
+       foreach_instr_safe (instr, &unscheduled_list)
+               ir3_insert_by_depth(instr, &ctx->depth_list);
+
+       while (!list_is_empty(&ctx->depth_list)) {
                struct ir3_sched_notes notes = {0};
                struct ir3_instruction *instr;
 
@@ -804,7 +753,8 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                        instr = find_eligible_instr(ctx, &notes, false);
 
                if (instr) {
-                       unsigned delay = delay_calc(ctx->block, instr, false, false);
+                       unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
+                       d("delay=%u", delay);
 
                        /* and if we run out of instructions that can be scheduled,
                         * then it is time for nop's:
@@ -818,14 +768,19 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                        schedule(ctx, instr);
                } else {
                        struct ir3_instruction *new_instr = NULL;
+                       struct ir3 *ir = block->shader;
 
                        /* nothing available to schedule.. if we are blocked on
                         * address/predicate register conflict, then break the
                         * deadlock by cloning the instruction that wrote that
                         * reg:
                         */
-                       if (notes.addr_conflict) {
-                               new_instr = split_addr(ctx);
+                       if (notes.addr0_conflict) {
+                               new_instr = split_addr(ctx, &ctx->addr0,
+                                                                          ir->a0_users, ir->a0_users_count);
+                       } else if (notes.addr1_conflict) {
+                               new_instr = split_addr(ctx, &ctx->addr1,
+                                                                          ir->a1_users, ir->a1_users_count);
                        } else if (notes.pred_conflict) {
                                new_instr = split_pred(ctx);
                        } else {
@@ -848,94 +803,21 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                        }
                }
        }
-
-       /* And lastly, insert branch/jump instructions to take us to
-        * the next block.  Later we'll strip back out the branches
-        * that simply jump to next instruction.
-        */
-       if (block->successors[1]) {
-               /* if/else, conditional branches to "then" or "else": */
-               struct ir3_instruction *br;
-               unsigned delay = 6;
-
-               debug_assert(ctx->pred);
-               debug_assert(block->condition);
-
-               delay -= distance(ctx->block, ctx->pred, delay, false);
-
-               while (delay > 0) {
-                       ir3_NOP(block);
-                       delay--;
-               }
-
-               /* create "else" branch first (since "then" block should
-                * frequently/always end up being a fall-thru):
-                */
-               br = ir3_BR(block);
-               br->cat0.inv = true;
-               br->cat0.target = block->successors[1];
-
-               /* NOTE: we have to hard code delay of 6 above, since
-                * we want to insert the nop's before constructing the
-                * branch.  Throw in an assert so we notice if this
-                * ever breaks on future generation:
-                */
-               debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
-
-               br = ir3_BR(block);
-               br->cat0.target = block->successors[0];
-
-       } else if (block->successors[0]) {
-               /* otherwise unconditional jump to next block: */
-               struct ir3_instruction *jmp;
-
-               jmp = ir3_JUMP(block);
-               jmp->cat0.target = block->successors[0];
-       }
-
-       /* NOTE: if we kept track of the predecessors, we could do a better
-        * job w/ (jp) flags.. every node w/ > predecessor is a join point.
-        * Note that as we eliminate blocks which contain only an unconditional
-        * jump we probably need to propagate (jp) flag..
-        */
 }
 
-/* After scheduling individual blocks, we still could have cases where
- * one (or more) paths into a block, a value produced by a previous
- * has too few delay slots to be legal.  We can't deal with this in the
- * first pass, because loops (ie. we can't ensure all predecessor blocks
- * are already scheduled in the first pass).  All we can really do at
- * this point is stuff in extra nop's until things are legal.
- */
 static void
-sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+setup_thresholds(struct ir3_sched_ctx *ctx, struct ir3 *ir)
 {
-       unsigned n = 0;
-
-       ctx->block = block;
-
-       list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
-               unsigned delay = 0;
-
-               set_foreach(block->predecessors, entry) {
-                       struct ir3_block *pred = (struct ir3_block *)entry->key;
-                       unsigned d = delay_calc(pred, instr, false, true);
-                       delay = MAX2(d, delay);
-               }
-
-               while (delay > n) {
-                       struct ir3_instruction *nop = ir3_NOP(block);
-
-                       /* move to before instr: */
-                       list_delinit(&nop->node);
-                       list_addtail(&nop->node, &instr->node);
-
-                       n++;
-               }
-
-               /* we can bail once we hit worst case delay: */
-               if (++n > 6)
-                       break;
+       if (ir3_has_latency_to_hide(ir)) {
+               ctx->live_threshold_hi = 2 * 16 * 4;
+               ctx->live_threshold_lo = 2 * 4 * 4;
+               ctx->depth_threshold_hi = 6;
+               ctx->depth_threshold_lo = 4;
+       } else {
+               ctx->live_threshold_hi = 2 * 16 * 4;
+               ctx->live_threshold_lo = 2 * 12 * 4;
+               ctx->depth_threshold_hi = 16;
+               ctx->depth_threshold_lo = 16;
        }
 }
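/* Decoding the constants (illustrative): thresholds are in half-register
 * units, a full register counting as two halves.  With latency to hide the
 * values reproduce the previous behaviour: 2*16*4 = 128 and 2*4*4 = 32 halves
 * match the old 16*4 / 4*4 full-register cutoffs, and the depth thresholds
 * 6/4 match the old literals.  Without latency to hide, the low-water mark
 * rises to 2*12*4 = 96 halves and the depth cutoff to a flat 16.
 */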
 
@@ -943,18 +825,17 @@ int ir3_sched(struct ir3 *ir)
 {
        struct ir3_sched_ctx ctx = {0};
 
+       setup_thresholds(&ctx, ir);
+
        ir3_clear_mark(ir);
        update_use_count(ir);
 
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+       foreach_block (block, &ir->block_list) {
                ctx.live_values = 0;
+               ctx.half_live_values = 0;
                sched_block(&ctx, block);
        }
 
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               sched_intra_block(&ctx, block);
-       }
-
        if (ctx.error)
                return -1;
 
@@ -1066,7 +947,7 @@ add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
 static void
 calculate_deps(struct ir3_block *block)
 {
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+       foreach_instr (instr, &block->instr_list) {
                if (instr->barrier_class) {
                        add_barrier_deps(block, instr);
                }
@@ -1076,7 +957,7 @@ calculate_deps(struct ir3_block *block)
 void
 ir3_sched_add_deps(struct ir3 *ir)
 {
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+       foreach_block (block, &ir->block_list) {
                calculate_deps(block);
        }
 }