freedreno: Remove the Emacs mode lines
[mesa.git] / src / gallium / drivers / freedreno / ir3 / ir3_sched.c
index 653f679fe1ef9f2ff98a9f95c76695fb1c6af5ee..6552980d90c4317fbabfe34277208aef1255ba65 100644 (file)
@@ -1,5 +1,3 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
 /*
  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
  *
 
 #include "ir3.h"
 
-enum {
-       SCHEDULED = -1,
-       DELAYED = -2,
-};
-
 /*
  * Instruction Scheduling:
  *
- * Using the depth sorted list from depth pass, attempt to recursively
- * schedule deepest unscheduled path.  The first instruction that cannot
- * be scheduled, returns the required delay slots it needs, at which
- * point we return back up to the top and attempt to schedule by next
- * highest depth.  After a sufficient number of instructions have been
- * scheduled, return back to beginning of list and start again.  If you
- * reach the end of depth sorted list without being able to insert any
- * instruction, insert nop's.  Repeat until no more unscheduled
- * instructions.
+ * A recursive depth-based scheduling algorithm.  Recursively find an
+ * eligible instruction to schedule from the deepest instruction
+ * (recursing through its unscheduled src instructions).  Normally this
+ * would result in a lot of re-traversal of the same instructions, so we
+ * cache results in instr->data (and clear cached results that would no
+ * longer be valid after scheduling an instruction).
  *
  * There are a few special cases that need to be handled, since sched
  * is currently independent of register allocation.  Usages of address
@@ -60,13 +50,69 @@ enum {
  */
 
 struct ir3_sched_ctx {
-       struct ir3_instruction *scheduled; /* last scheduled instr */
+       struct ir3_block *block;           /* the current block */
+       struct list_head depth_list;       /* depth sorted unscheduled instrs */
+       struct ir3_instruction *scheduled; /* last scheduled instr XXX remove */
        struct ir3_instruction *addr;      /* current a0.x user, if any */
        struct ir3_instruction *pred;      /* current p0.x user, if any */
-       unsigned cnt;
        bool error;
 };
 
+static bool is_sfu_or_mem(struct ir3_instruction *instr)
+{
+       return is_sfu(instr) || is_mem(instr);
+}
+
+#define NULL_INSTR ((void *)~0)
+
+static void
+clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+       list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
+               if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
+                       instr2->data = NULL;
+       }
+}
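
/* Aside: a standalone sketch of the three-state cache that clear_cache()
 * manages (toy types, not the ir3 ones).  instr->data encodes: NULL for
 * "result not computed yet", the all-ones NULL_INSTR sentinel for
 * "computed, nothing schedulable here", and any other pointer for a
 * cached candidate:
 */

#include <stdio.h>

#define NULL_INSTR ((void *)~0)

struct node {
	void *data;
};

static const char *
cache_state(const struct node *n)
{
	if (n->data == NULL)
		return "unknown";
	if (n->data == NULL_INSTR)
		return "known: no candidate";
	return "known: cached candidate";
}

int main(void)
{
	struct node a = { .data = NULL };
	struct node b = { .data = NULL_INSTR };
	struct node c = { .data = &a };

	printf("%s / %s / %s\n",
			cache_state(&a), cache_state(&b), cache_state(&c));
	return 0;
}
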
+
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+       debug_assert(ctx->block == instr->block);
+
+       /* maybe there is a better way to handle this than just stuffing
+        * a nop.. ideally we'd know about this constraint in the
+        * scheduling and depth calculation..
+        */
+       if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
+               ir3_NOP(ctx->block);
+
+       /* remove from depth list:
+        */
+       list_delinit(&instr->node);
+
+       if (writes_addr(instr)) {
+               debug_assert(ctx->addr == NULL);
+               ctx->addr = instr;
+       }
+
+       if (writes_pred(instr)) {
+               debug_assert(ctx->pred == NULL);
+               ctx->pred = instr;
+       }
+
+       instr->flags |= IR3_INSTR_MARK;
+
+       list_addtail(&instr->node, &instr->block->instr_list);
+       ctx->scheduled = instr;
+
+       if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
+               clear_cache(ctx, NULL);
+       } else {
+               /* invalidate only the necessary entries.. */
+               clear_cache(ctx, instr);
+       }
+}
+
 static struct ir3_instruction *
 deepest(struct ir3_instruction **srcs, unsigned nsrcs)
 {
@@ -88,157 +134,192 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
        return d;
 }
 
-static unsigned distance(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr, unsigned maxd)
+/**
+ * @block: the block to search in, starting from end; in the first pass,
+ *    this will be the block the instruction would be inserted into
+ *    (but has not been yet, ie. it only contains already scheduled
+ *    instructions).  For intra-block scheduling (second pass), this
+ *    would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd:  max distance, bail after searching this # of instruction
+ *    slots, since it means the instruction we are looking for is
+ *    far enough away
+ * @pred:  if true, recursively search into predecessor blocks to
+ *    find the worst case (shortest) distance (only possible after
+ *    individual blocks are all scheduled)
+ */
+static unsigned
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+               unsigned maxd, bool pred)
 {
-       struct ir3_instruction *n = ctx->scheduled;
        unsigned d = 0;
-       while (n && (n != instr) && (d < maxd)) {
-               if (is_alu(n) || is_flow(n))
+
+       list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
+               if ((n == instr) || (d >= maxd))
+                       return d;
+               /* NOTE: don't count branch/jump since we don't know yet if they will
+                * be eliminated later in resolve_jumps().. really should do that
+                * earlier so we don't have this constraint.
+                */
+               if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
                        d++;
-               n = n->next;
        }
-       return d;
-}
-
-/* TODO maybe we want double linked list? */
-static struct ir3_instruction * prev(struct ir3_instruction *instr)
-{
-       struct ir3_instruction *p = instr->block->head;
-       while (p && (p->next != instr))
-               p = p->next;
-       return p;
-}
-
-static bool is_sfu_or_mem(struct ir3_instruction *instr)
-{
-       return is_sfu(instr) || is_mem(instr);
-}
-
-static void schedule(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr, bool remove)
-{
-       struct ir3_block *block = instr->block;
-
-       /* maybe there is a better way to handle this than just stuffing
-        * a nop.. ideally we'd know about this constraint in the
-        * scheduling and depth calculation..
-        */
-       if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-               schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
 
-       /* remove from depth list:
+       /* if we reached the start of the block without finding the
+        * instruction, it was assigned in a predecessor block; assume it
+        * is far enough away.. we'll fix up later.
         */
-       if (remove) {
-               struct ir3_instruction *p = prev(instr);
+       if (!pred)
+               return maxd;
 
-               /* NOTE: this can happen for inputs which are not
-                * read.. in that case there is no need to schedule
-                * the input, so just bail:
+       if (block->data != block) {
+               /* Search into predecessor blocks, finding the one with the
+                * shortest distance, since that will be the worst case
                 */
-               if (instr != (p ? p->next : block->head))
-                       return;
+               unsigned min = maxd - d;
 
-               if (p)
-                       p->next = instr->next;
-               else
-                       block->head = instr->next;
-       }
+               /* (ab)use block->data to prevent recursion: */
+               block->data = block;
 
-       if (writes_addr(instr)) {
-               assert(ctx->addr == NULL);
-               ctx->addr = instr;
-       }
+               for (unsigned i = 0; i < block->predecessors_count; i++) {
+                       unsigned n;
 
-       if (writes_pred(instr)) {
-               assert(ctx->pred == NULL);
-               ctx->pred = instr;
-       }
+                       n = distance(block->predecessors[i], instr, min, pred);
 
-       instr->flags |= IR3_INSTR_MARK;
+                       min = MIN2(min, n);
+               }
 
-       instr->next = ctx->scheduled;
-       ctx->scheduled = instr;
+               block->data = NULL;
+               d += min;
+       }
 
-       ctx->cnt++;
+       return d;
 }
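
/* Aside: a toy model (plain C, not ir3) of the predecessor recursion
 * above.  At a join point the binding distance is the MINIMUM over the
 * incoming paths, because the shortest path is the worst case for
 * delay-slot purposes.  This simplifies away the in-block slot counting
 * and the block->data recursion guard:
 */

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

struct toy_block {
	unsigned dist_to_def;        /* slots from block end back to the def,
	                                or ~0u if the def is not in this block */
	struct toy_block *preds[2];
	unsigned npreds;
};

static unsigned
toy_distance(const struct toy_block *b, unsigned maxd)
{
	if (b->dist_to_def < maxd)   /* found close enough in this block */
		return b->dist_to_def;

	unsigned min = maxd;
	for (unsigned i = 0; i < b->npreds; i++)
		min = MIN2(min, toy_distance(b->preds[i], maxd));
	return min;
}

int main(void)
{
	struct toy_block p0 = { .dist_to_def = 5 };
	struct toy_block p1 = { .dist_to_def = 2 };
	struct toy_block join = { .dist_to_def = ~0u,
	                          .preds = { &p0, &p1 }, .npreds = 2 };

	printf("%u\n", toy_distance(&join, 6));   /* 2: shortest path wins */
	return 0;
}
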
 
-/*
- * Delay-slot calculation.  Follows fanin/fanout.
- */
-
 /* calculate delay for specified src: */
-static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
+static unsigned
+delay_calc_srcn(struct ir3_block *block,
                struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned srcn)
+               struct ir3_instruction *consumer,
+               unsigned srcn, bool soft, bool pred)
 {
        unsigned delay = 0;
 
        if (is_meta(assigner)) {
                struct ir3_instruction *src;
                foreach_ssa_src(src, assigner) {
-                       unsigned d = delay_calc_srcn(ctx, src, consumer, srcn);
+                       unsigned d;
+                       d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
                        delay = MAX2(delay, d);
                }
        } else {
-               delay = ir3_delayslots(assigner, consumer, srcn);
-               delay -= distance(ctx, assigner, delay);
+               if (soft) {
+                       if (is_sfu(assigner)) {
+                               delay = 4;
+                       } else {
+                               delay = ir3_delayslots(assigner, consumer, srcn);
+                       }
+               } else {
+                       delay = ir3_delayslots(assigner, consumer, srcn);
+               }
+               delay -= distance(block, assigner, delay, pred);
        }
 
        return delay;
 }
 
 /* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned delay_calc(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr)
+static unsigned
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+               bool soft, bool pred)
 {
        unsigned delay = 0;
        struct ir3_instruction *src;
 
        foreach_ssa_src_n(src, i, instr) {
-               unsigned d = delay_calc_srcn(ctx, src, instr, i);
+               unsigned d;
+               d = delay_calc_srcn(block, src, instr, i, soft, pred);
                delay = MAX2(delay, d);
        }
 
        return delay;
 }
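
/* Aside: the resulting arithmetic, as a standalone sketch (the 4-slot
 * figure is the soft-mode SFU cost from delay_calc_srcn() above; the
 * helper and the values are otherwise illustrative):
 */

#include <stdio.h>

static unsigned
remaining_delay(unsigned required, unsigned dist)
{
	/* distance() bails once it reaches 'required', so no underflow: */
	return required - ((dist < required) ? dist : required);
}

int main(void)
{
	/* an SFU producer is charged 4 slots in soft mode; one eligible
	 * instruction has issued since it, so 3 slots still need filling:
	 */
	printf("%u\n", remaining_delay(4, 1));   /* 3 */
	printf("%u\n", remaining_delay(4, 5));   /* 0: already far enough */
	return 0;
}
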
 
-/* A negative return value signals that an instruction has been newly
- * SCHEDULED (or DELAYED due to address or predicate register already
- * in use), return back up to the top of the stack (to block_sched())
+struct ir3_sched_notes {
+       /* there is at least one kill which could be scheduled, except
+        * for unscheduled bary.f's:
+        */
+       bool blocked_kill;
+       /* there is at least one instruction that could be scheduled,
+        * except for conflicting address/predicate register usage:
+        */
+       bool addr_conflict, pred_conflict;
+};
+
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+       return !!(instr->flags & IR3_INSTR_MARK);
+}
+
+/* could an instruction be scheduled if specified ssa src was scheduled? */
+static bool
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+       struct ir3_instruction *other_src;
+       foreach_ssa_src(other_src, instr) {
+               /* if dependency not scheduled, we aren't ready yet: */
+               if ((src != other_src) && !is_scheduled(other_src)) {
+                       return false;
+               }
+       }
+       return true;
+}
+
+/* Check if instruction is ok to schedule.  Make sure it is not blocked
+ * by use of addr/predicate register, etc.
  */
-static int trysched(struct ir3_sched_ctx *ctx,
+static bool
+check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
                struct ir3_instruction *instr)
 {
-       struct ir3_instruction *srcs[64];
-       struct ir3_instruction *src;
-       unsigned delay, nsrcs = 0;
-
-       /* if already scheduled: */
-       if (instr->flags & IR3_INSTR_MARK)
-               return 0;
+       /* For instructions that write the address register we need to
+        * make sure there is at least one instruction that uses the
+        * addr value and is otherwise ready to schedule.
+        *
+        * TODO if any instructions use pred register and have other
+        * src args, we would need to do the same for writes_pred()..
+        */
+       if (writes_addr(instr)) {
+               struct ir3 *ir = instr->block->shader;
+               bool ready = false;
+               for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
+                       struct ir3_instruction *indirect = ir->indirects[i];
+                       if (!indirect)
+                               continue;
+                       if (indirect->address != instr)
+                               continue;
+                       ready = could_sched(indirect, instr);
+               }
 
-       /* figure out our src's, copy 'em out into an array for sorting: */
-       foreach_ssa_src(src, instr) {
-               debug_assert(nsrcs < ARRAY_SIZE(srcs));
-               srcs[nsrcs++] = src;
+               /* nothing could be scheduled, so keep looking: */
+               if (!ready)
+                       return false;
        }
 
-       /* for each src register in sorted order:
+       /* if this is a write to address/predicate register, and that
+        * register is currently in use, we need to defer until it is
+        * free:
         */
-       delay = 0;
-       while ((src = deepest(srcs, nsrcs))) {
-               delay = trysched(ctx, src);
-               if (delay)
-                       return delay;
+       if (writes_addr(instr) && ctx->addr) {
+               debug_assert(ctx->addr != instr);
+               notes->addr_conflict = true;
+               return false;
        }
 
-       /* all our dependents are scheduled, figure out if
-        * we have enough delay slots to schedule ourself:
-        */
-       delay = delay_calc(ctx, instr);
-       if (delay)
-               return delay;
+       if (writes_pred(instr) && ctx->pred) {
+               debug_assert(ctx->pred != instr);
+               notes->pred_conflict = true;
+               return false;
+       }
 
        /* if the instruction is a kill, we need to ensure *every*
         * bary.f is scheduled.  The hw seems unhappy if the thread
@@ -255,207 +336,483 @@ static int trysched(struct ir3_sched_ctx *ctx,
         */
        if (is_kill(instr)) {
                struct ir3 *ir = instr->block->shader;
-               unsigned i;
 
-               for (i = 0; i < ir->baryfs_count; i++) {
+               for (unsigned i = 0; i < ir->baryfs_count; i++) {
                        struct ir3_instruction *baryf = ir->baryfs[i];
-                       if (baryf->depth == DEPTH_UNUSED)
+                       if (baryf->flags & IR3_INSTR_UNUSED)
                                continue;
-                       delay = trysched(ctx, baryf);
-                       if (delay)
-                               return delay;
+                       if (!is_scheduled(baryf)) {
+                               notes->blocked_kill = true;
+                               return false;
+                       }
                }
        }
 
-       /* if instruction writes address register, we need to ensure
-        * that the instructions which use the address register value
-        * have all their other dependencies scheduled.
-        * TODO we may possibly need to do the same thing with predicate
-        * register usage, but for now we get by without since the
-        * predicate usage patterns are more simple
+       return true;
+}
+
+/* Find the best instruction to schedule from the specified instruction
+ * or, recursively, its ssa sources.
+ */
+static struct ir3_instruction *
+find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+               struct ir3_instruction *instr)
+{
+       struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
+       struct ir3_instruction *src;
+       unsigned nsrcs = 0;
+
+       if (is_scheduled(instr))
+               return NULL;
+
+       /* use instr->data to cache the results of recursing up the
+        * instr src's.  Otherwise the recursive algo can scale quite
+        * badly w/ shader size.  But this takes some care to clear
+        * the cache appropriately when instructions are scheduled.
         */
-       if (writes_addr(instr)) {
-               struct ir3 *ir = instr->block->shader;
-               unsigned i;
+       if (instr->data) {
+               if (instr->data == NULL_INSTR)
+                       return NULL;
+               return instr->data;
+       }
 
-               for (i = 0; i < ir->indirects_count; i++) {
-                       struct ir3_instruction *indirect = ir->indirects[i];
-                       if (indirect->depth == DEPTH_UNUSED)
-                               continue;
-                       if (indirect->address != instr)
-                               continue;
-                       /* NOTE: avoid recursively scheduling the dependency
-                        * on ourself (ie. avoid infinite recursion):
-                        */
-                       foreach_ssa_src(src, indirect) {
-                               if ((src == instr) || (src->address == instr))
-                                       continue;
-                               delay = trysched(ctx, src);
-                               if (delay)
-                                       return delay;
-                       }
+       /* find unscheduled srcs: */
+       foreach_ssa_src(src, instr) {
+               if (!is_scheduled(src)) {
+                       debug_assert(nsrcs < ARRAY_SIZE(srcs));
+                       srcs[nsrcs++] = src;
                }
        }
 
-       /* if this is a write to address/predicate register, and that
-        * register is currently in use, we need to defer until it is
-        * free:
-        */
-       if (writes_addr(instr) && ctx->addr) {
-               assert(ctx->addr != instr);
-               return DELAYED;
+       /* if all our src's are already scheduled: */
+       if (nsrcs == 0) {
+               if (check_instr(ctx, notes, instr)) {
+                       instr->data = instr;
+                       return instr;
+               }
+               return NULL;
        }
-       if (writes_pred(instr) && ctx->pred) {
-               assert(ctx->pred != instr);
-               return DELAYED;
+
+       while ((src = deepest(srcs, nsrcs))) {
+               struct ir3_instruction *candidate;
+
+               candidate = find_instr_recursive(ctx, notes, src);
+               if (!candidate)
+                       continue;
+
+               if (check_instr(ctx, notes, candidate)) {
+                       instr->data = candidate;
+                       return candidate;
+               }
        }
 
-       schedule(ctx, instr, true);
-       return SCHEDULED;
+       instr->data = NULL_INSTR;
+       return NULL;
 }
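
/* Aside: the memoization above, reduced to a standalone sketch over a
 * hypothetical two-source DAG (struct toy and NO_CANDIDATE stand in for
 * ir3_instruction and NULL_INSTR; the check_instr() legality test is
 * omitted):
 */

#include <stdbool.h>
#include <stddef.h>

#define NO_CANDIDATE ((void *)~0)

struct toy {
	struct toy *src[2];   /* dependencies, may be NULL */
	bool scheduled;
	void *memo;           /* NULL / NO_CANDIDATE / cached candidate */
};

static struct toy *
find_candidate(struct toy *n)
{
	if (!n || n->scheduled)
		return NULL;

	/* reuse the result of an earlier traversal: */
	if (n->memo)
		return (n->memo == NO_CANDIDATE) ? NULL : n->memo;

	struct toy *ret = NULL;
	bool pending = false;

	/* recurse into unscheduled sources first: */
	for (int i = 0; (i < 2) && !ret; i++) {
		if (n->src[i] && !n->src[i]->scheduled) {
			pending = true;
			ret = find_candidate(n->src[i]);
		}
	}

	/* no unscheduled sources: this node itself is the candidate */
	if (!pending)
		ret = n;

	n->memo = ret ? (void *)ret : NO_CANDIDATE;
	return ret;
}

int main(void)
{
	struct toy a = { .scheduled = true };   /* already scheduled */
	struct toy b = { 0 };                   /* ready leaf */
	struct toy c = { .src = { &a, &b } };   /* needs b scheduled first */

	return (find_candidate(&c) == &b) ? 0 : 1;
}
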
 
-static struct ir3_instruction * reverse(struct ir3_instruction *instr)
+/* find instruction to schedule: */
+static struct ir3_instruction *
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+               bool soft)
 {
-       struct ir3_instruction *reversed = NULL;
-       while (instr) {
-               struct ir3_instruction *next = instr->next;
-               instr->next = reversed;
-               reversed = instr;
-               instr = next;
+       struct ir3_instruction *best_instr = NULL;
+       unsigned min_delay = ~0;
+
+       /* TODO we'd really rather use the list/array of block outputs.  But we
+        * don't have such a thing.  Recursing into *every* instruction in the list
+        * will result in a lot of repeated traversal, since instructions will
+        * get traversed both when they appear as ssa src to a later instruction
+        * as well as where they appear in the depth_list.
+        */
+       list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+               struct ir3_instruction *candidate;
+               unsigned delay;
+
+               candidate = find_instr_recursive(ctx, notes, instr);
+               if (!candidate)
+                       continue;
+
+               delay = delay_calc(ctx->block, candidate, soft, false);
+               if (delay < min_delay) {
+                       best_instr = candidate;
+                       min_delay = delay;
+               }
+
+               if (min_delay == 0)
+                       break;
        }
-       return reversed;
-}
 
-static bool uses_current_addr(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       return instr->address && (ctx->addr == instr->address);
+       return best_instr;
 }
 
-static bool uses_current_pred(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr)
+/* "spill" the address register by remapping any unscheduled
+ * instructions which depend on the current address register
+ * to a clone of the instruction which wrote the address reg.
+ */
+static struct ir3_instruction *
+split_addr(struct ir3_sched_ctx *ctx)
 {
-       struct ir3_instruction *src;
-       foreach_ssa_src(src, instr)
-               if (ctx->pred == src)
-                       return true;
-       return false;
+       struct ir3 *ir;
+       struct ir3_instruction *new_addr = NULL;
+       unsigned i;
+
+       debug_assert(ctx->addr);
+
+       ir = ctx->addr->block->shader;
+
+       for (i = 0; i < ir->indirects_count; i++) {
+               struct ir3_instruction *indirect = ir->indirects[i];
+
+               if (!indirect)
+                       continue;
+
+               /* skip instructions already scheduled: */
+               if (is_scheduled(indirect))
+                       continue;
+
+               /* remap remaining instructions using current addr
+                * to new addr:
+                */
+               if (indirect->address == ctx->addr) {
+                       if (!new_addr) {
+                               new_addr = ir3_instr_clone(ctx->addr);
+                               /* original addr is scheduled, but new one isn't: */
+                               new_addr->flags &= ~IR3_INSTR_MARK;
+                       }
+                       ir3_instr_set_address(indirect, new_addr);
+               }
+       }
+
+       /* all remaining indirects remapped to new addr: */
+       ctx->addr = NULL;
+
+       return new_addr;
 }
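
/* Aside: the lazy clone-and-remap at the heart of split_addr(), as a
 * standalone sketch.  Toy types; ir3_instr_clone() and
 * ir3_instr_set_address() are the real helpers, here replaced by a
 * caller-provided 'clone' slot and a plain pointer store:
 */

#include <stdbool.h>
#include <stddef.h>

struct writer { bool scheduled; };

struct user {
	struct writer *address;   /* the a0.x write this user depends on */
	bool scheduled;
};

static struct writer *
toy_split(struct writer *cur, struct writer *clone,
		struct user *users, unsigned count)
{
	struct writer *new_addr = NULL;

	for (unsigned i = 0; i < count; i++) {
		if (users[i].scheduled || (users[i].address != cur))
			continue;
		if (!new_addr) {
			*clone = *cur;
			clone->scheduled = false;   /* clone still needs scheduling */
			new_addr = clone;
		}
		users[i].address = new_addr;   /* remap to the clone */
	}
	return new_addr;   /* NULL if no unscheduled users were left */
}

int main(void)
{
	struct writer cur = { .scheduled = true }, clone;
	struct user users[2] = {
		{ &cur, true },    /* already scheduled: left alone */
		{ &cur, false },   /* pending: remapped to the clone */
	};

	struct writer *n = toy_split(&cur, &clone, users, 2);
	return (n == &clone && users[1].address == &clone) ? 0 : 1;
}
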
 
-/* when we encounter an instruction that writes to the address register
- * when it is in use, we delay that instruction and try to schedule all
- * other instructions using the current address register:
+/* "spill" the predicate register by remapping any unscheduled
+ * instructions which depend on the current predicate register
+ * to a clone of the instruction which wrote the predicate reg.
  */
-static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
-               struct ir3_block *block)
+static struct ir3_instruction *
+split_pred(struct ir3_sched_ctx *ctx)
 {
-       struct ir3_instruction *instr = block->head;
-       bool addr_in_use = false;
-       bool pred_in_use = false;
-       bool all_delayed = true;
-       unsigned cnt = ~0, attempted = 0;
-
-       while (instr) {
-               struct ir3_instruction *next = instr->next;
-               bool addr = uses_current_addr(ctx, instr);
-               bool pred = uses_current_pred(ctx, instr);
-
-               if (addr || pred) {
-                       int ret = trysched(ctx, instr);
-
-                       if (ret != DELAYED)
-                               all_delayed = false;
-
-                       if (ret == SCHEDULED)
-                               cnt = 0;
-                       else if (ret > 0)
-                               cnt = MIN2(cnt, ret);
-                       if (addr)
-                               addr_in_use = true;
-                       if (pred)
-                               pred_in_use = true;
-
-                       attempted++;
-               }
+       struct ir3 *ir;
+       struct ir3_instruction *new_pred = NULL;
+       unsigned i;
 
-               instr = next;
-       }
+       debug_assert(ctx->pred);
 
-       if (!addr_in_use)
-               ctx->addr = NULL;
+       ir = ctx->pred->block->shader;
 
-       if (!pred_in_use)
-               ctx->pred = NULL;
+       for (i = 0; i < ir->predicates_count; i++) {
+               struct ir3_instruction *predicated = ir->predicates[i];
 
-       /* detect if we've gotten ourselves into an impossible situation
-        * and bail if needed
-        */
-       if (all_delayed && (attempted > 0))
-               ctx->error = true;
+               /* skip instructions already scheduled: */
+               if (is_scheduled(predicated))
+                       continue;
+
+               /* remap remaining instructions using current pred
+                * to new pred:
+                *
+                * TODO is there ever a case when pred isn't first
+                * (and only) src?
+                */
+               if (ssa(predicated->regs[1]) == ctx->pred) {
+                       if (!new_pred) {
+                               new_pred = ir3_instr_clone(ctx->pred);
+                               /* original pred is scheduled, but new one isn't: */
+                               new_pred->flags &= ~IR3_INSTR_MARK;
+                       }
+                       predicated->regs[1]->instr = new_pred;
+               }
+       }
+
+       /* all remaining predicated remapped to new pred: */
+       ctx->pred = NULL;
 
-       return cnt;
+       return new_pred;
 }
 
-static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 {
-       struct ir3_instruction *instr;
+       struct list_head unscheduled_list;
 
-       /* schedule all the shader input's (meta-instr) first so that
-        * the RA step sees that the input registers contain a value
-        * from the start of the shader:
+       ctx->block = block;
+
+       /* addr/pred writes are per-block: */
+       ctx->addr = NULL;
+       ctx->pred = NULL;
+
+       /* move all instructions to the unscheduled list, and
+        * empty the block's instruction list (to which we will
+        * be inserting).
+        */
+       list_replace(&block->instr_list, &unscheduled_list);
+       list_inithead(&block->instr_list);
+       list_inithead(&ctx->depth_list);
+
+       /* first a pre-pass to schedule all meta:input instructions
+        * (which need to appear first so that RA knows the register is
+        * occupied), and move remaining to depth sorted list:
         */
-       if (!block->parent) {
-               unsigned i;
-               for (i = 0; i < block->ninputs; i++) {
-                       struct ir3_instruction *in = block->inputs[i];
-                       if (in)
-                               schedule(ctx, in, true);
+       list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
+               if (instr->opc == OPC_META_INPUT) {
+                       schedule(ctx, instr);
+               } else {
+                       ir3_insert_by_depth(instr, &ctx->depth_list);
                }
        }
 
-       while ((instr = block->head) && !ctx->error) {
-               /* NOTE: always grab next *before* trysched(), in case the
-                * instruction is actually scheduled (and therefore moved
-                * from depth list into scheduled list)
-                */
-               struct ir3_instruction *next = instr->next;
-               int cnt = trysched(ctx, instr);
+       while (!list_empty(&ctx->depth_list)) {
+               struct ir3_sched_notes notes = {0};
+               struct ir3_instruction *instr;
 
-               if (cnt == DELAYED)
-                       cnt = block_sched_undelayed(ctx, block);
+               instr = find_eligible_instr(ctx, &notes, true);
+               if (!instr)
+                       instr = find_eligible_instr(ctx, &notes, false);
 
-               /* -1 is signal to return up stack, but to us means same as 0: */
-               cnt = MAX2(0, cnt);
-               cnt += ctx->cnt;
-               instr = next;
+               if (instr) {
+                       unsigned delay = delay_calc(ctx->block, instr, false, false);
 
-               /* if deepest remaining instruction cannot be scheduled, try
-                * the increasingly more shallow instructions until needed
-                * number of delay slots is filled:
-                */
-               while (instr && (cnt > ctx->cnt)) {
-                       next = instr->next;
-                       trysched(ctx, instr);
-                       instr = next;
+                       /* if already scheduled instructions don't cover the
+                        * remaining delay, fill the gap with nop's:
+                        */
+                       debug_assert(delay <= 6);
+                       while (delay > 0) {
+                               ir3_NOP(block);
+                               delay--;
+                       }
+
+                       schedule(ctx, instr);
+               } else {
+                       struct ir3_instruction *new_instr = NULL;
+
+                       /* nothing available to schedule.. if we are blocked on
+                        * address/predicate register conflict, then break the
+                        * deadlock by cloning the instruction that wrote that
+                        * reg:
+                        */
+                       if (notes.addr_conflict) {
+                               new_instr = split_addr(ctx);
+                       } else if (notes.pred_conflict) {
+                               new_instr = split_pred(ctx);
+                       } else {
+                               debug_assert(0);
+                               ctx->error = true;
+                               return;
+                       }
+
+                       if (new_instr) {
+                               /* clearing current addr/pred can change what is
+                                * available to schedule, so clear cache..
+                                */
+                               clear_cache(ctx, NULL);
+
+                               ir3_insert_by_depth(new_instr, &ctx->depth_list);
+                               /* the original instr that wrote addr/pred may have
+                                * originated from a different block:
+                                */
+                               new_instr->block = block;
+                       }
+               }
+       }
+
+       /* And lastly, insert branch/jump instructions to take us to
+        * the next block.  Later we'll strip back out the branches
+        * that simply jump to next instruction.
+        */
+       if (block->successors[1]) {
+               /* if/else, conditional branches to "then" or "else": */
+               struct ir3_instruction *br;
+               unsigned delay = 6;
+
+               debug_assert(ctx->pred);
+               debug_assert(block->condition);
+
+               delay -= distance(ctx->block, ctx->pred, delay, false);
+
+               while (delay > 0) {
+                       ir3_NOP(block);
+                       delay--;
                }
 
-               /* and if we run out of instructions that can be scheduled,
-                * then it is time for nop's:
+               /* create "else" branch first (since "then" block should
+                * frequently/always end up being a fall-thru):
                 */
-               while (cnt > ctx->cnt)
-                       schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+               br = ir3_BR(block);
+               br->cat0.inv = true;
+               br->cat0.target = block->successors[1];
+
+               /* NOTE: we have to hard code delay of 6 above, since
+                * we want to insert the nop's before constructing the
+                * branch.  Throw in an assert so we notice if this
+                * ever breaks on a future generation:
+                */
+               debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+               br = ir3_BR(block);
+               br->cat0.target = block->successors[0];
+
+       } else if (block->successors[0]) {
+               /* otherwise unconditional jump to next block: */
+               struct ir3_instruction *jmp;
+
+               jmp = ir3_JUMP(block);
+               jmp->cat0.target = block->successors[0];
        }
 
-       /* at this point, scheduled list is in reverse order, so fix that: */
-       block->head = reverse(ctx->scheduled);
+       /* NOTE: if we kept track of the predecessors, we could do a better
+        * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
+        * Note that as we eliminate blocks which contain only an unconditional
+        * jump we probably need to propagate the (jp) flag..
+        */
 }
 
-int ir3_block_sched(struct ir3_block *block)
+/* After scheduling individual blocks, we could still have cases where,
+ * on one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots to be legal.  We can't deal with this
+ * in the first pass because of loops (ie. we can't ensure all
+ * predecessor blocks are already scheduled in the first pass).  All we
+ * can really do at this point is stuff in extra nop's until things are
+ * legal.
+ */
+static void
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+       unsigned n = 0;
+
+       ctx->block = block;
+
+       list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+               unsigned delay = 0;
+
+               for (unsigned i = 0; i < block->predecessors_count; i++) {
+                       unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+                       delay = MAX2(d, delay);
+               }
+
+               while (delay > n) {
+                       struct ir3_instruction *nop = ir3_NOP(block);
+
+                       /* move to before instr: */
+                       list_delinit(&nop->node);
+                       list_addtail(&nop->node, &instr->node);
+
+                       n++;
+               }
+
+               /* we can bail once we hit worst case delay: */
+               if (++n > 6)
+                       break;
+       }
+}
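
/* Aside: the padding arithmetic in the loop above, as a standalone
 * sketch.  Per instruction it inserts max(0, delay - n) nop's; the
 * real loop additionally carries 'n' (including inserted nop's) across
 * instructions and bails past the 6-slot worst case:
 */

#include <stdio.h>

static unsigned
nops_needed(unsigned delay, unsigned n)
{
	return (delay > n) ? (delay - n) : 0;
}

int main(void)
{
	/* a predecessor's last instruction feeds the first instruction
	 * (n == 0) of this block with 6 delay slots outstanding:
	 */
	printf("%u\n", nops_needed(6, 0));   /* 6 nop's */
	/* two slots are already covered by preceding instructions: */
	printf("%u\n", nops_needed(6, 2));   /* 4 nop's */
	return 0;
}
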
+
+int ir3_sched(struct ir3 *ir)
 {
        struct ir3_sched_ctx ctx = {0};
-       ir3_clear_mark(block->shader);
-       block_sched(&ctx, block);
+
+       ir3_clear_mark(ir);
+
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               sched_block(&ctx, block);
+       }
+
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               sched_intra_block(&ctx, block);
+       }
+
        if (ctx.error)
                return -1;
        return 0;
 }
+
+/* does instruction 'prior' need to be scheduled before 'instr'? */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+       /* TODO for dependencies that are related to a specific object, ie
+        * a specific SSBO/image/array, we could relax this constraint to
+        * make accesses to unrelated objects not depend on each other (at
+        * least as long as not declared coherent)
+        */
+       if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
+                       ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
+               return true;
+       return !!(instr->barrier_class & prior->barrier_conflict);
+}
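
/* Aside: the class/conflict test above with made-up flag values (the
 * real IR3_BARRIER_* flags live in ir3.h):
 */

#include <stdbool.h>
#include <stdio.h>

#define TOY_IMAGE_R    0x01
#define TOY_IMAGE_W    0x02
#define TOY_EVERYTHING 0x80

struct toy_instr {
	unsigned barrier_class;      /* what kind of access this is */
	unsigned barrier_conflict;   /* which classes it must order against */
};

static bool
toy_depends_on(const struct toy_instr *instr, const struct toy_instr *prior)
{
	if (((instr->barrier_class & TOY_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & TOY_EVERYTHING) && instr->barrier_class))
		return true;
	return !!(instr->barrier_class & prior->barrier_conflict);
}

int main(void)
{
	struct toy_instr write = { TOY_IMAGE_W, TOY_IMAGE_R | TOY_IMAGE_W };
	struct toy_instr read  = { TOY_IMAGE_R, TOY_IMAGE_W };

	/* reads and writes to the same image must keep their order: */
	printf("%d %d\n", toy_depends_on(&read, &write),
			toy_depends_on(&write, &read));   /* 1 1 */
	return 0;
}
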
+
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+       struct list_head *prev = instr->node.prev;
+       struct list_head *next = instr->node.next;
+
+       /* add dependencies on previous instructions that must be scheduled
+        * prior to the current instruction
+        */
+       while (prev != &block->instr_list) {
+               struct ir3_instruction *pi =
+                       LIST_ENTRY(struct ir3_instruction, prev, node);
+
+               prev = prev->prev;
+
+               if (is_meta(pi))
+                       continue;
+
+               if (instr->barrier_class == pi->barrier_class) {
+                       ir3_instr_add_dep(instr, pi);
+                       break;
+               }
+
+               if (depends_on(instr, pi))
+                       ir3_instr_add_dep(instr, pi);
+       }
+
+       /* add dependencies on this instruction to following instructions
+        * that must be scheduled after the current instruction:
+        */
+       while (next != &block->instr_list) {
+               struct ir3_instruction *ni =
+                       LIST_ENTRY(struct ir3_instruction, next, node);
+
+               next = next->next;
+
+               if (is_meta(ni))
+                       continue;
+
+               if (instr->barrier_class == ni->barrier_class) {
+                       ir3_instr_add_dep(ni, instr);
+                       break;
+               }
+
+               if (depends_on(ni, instr))
+                       ir3_instr_add_dep(ni, instr);
+       }
+}
+
+/* before scheduling a block, we need to add any necessary false-dependencies
+ * to ensure that:
+ *
+ *  (1) barriers are scheduled in the right order wrt instructions related
+ *      to the barrier
+ *
+ *  (2) reads that come before a write actually get scheduled before the
+ *      write
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               if (instr->barrier_class) {
+                       add_barrier_deps(block, instr);
+               }
+       }
+}
+
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               calculate_deps(block);
+       }
+}