From 54434fe67068d9abdf73bfc3482337cc4315d2d3 Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Thu, 11 Jan 2018 18:35:58 -0500
Subject: [PATCH] lima/gpir: Rework the scheduler

Now, we do scheduling at the same time as value register allocation. The
ready list now acts similarly to the array of registers in
value_regalloc, keeping us from running out of slots. Before this, the
value register allocator wasn't aware of the scheduling constraints of
the actual machine, which meant that it sometimes chose the wrong false
dependencies to insert. Now, we assign value registers at the same time
as we actually schedule instructions, making its choices reflect reality
much better. It was also conservative in some cases where the new scheme
doesn't have to be. For example, in something like:

1 = ld_att
2 = ld_uni
3 = add 1, 2

It's possible that one of 1 and 2 can't be scheduled in the same
instruction as 3, meaning that a move needs to be inserted, so the value
register allocator needs to assume that this sequence requires two
registers. But when actually scheduling, we could discover that 1, 2,
and 3 can all be scheduled together, so that they only require one
register. The new scheduler speculatively inserts the instruction under
consideration, as well as all of its child load instructions, and then
counts the number of live value registers after all is said and done.
This lets us be more aggressive with scheduling when we're close to the
limit.

With the new scheduler, the kmscube vertex shader is now scheduled in 40
instructions, versus 66 before.

Acked-by: Qiang Yu <yuq825@gmail.com>
---
 src/gallium/drivers/lima/ir/gp/codegen.c      |    8 +-
 src/gallium/drivers/lima/ir/gp/gpir.h         |   50 +-
 src/gallium/drivers/lima/ir/gp/instr.c        |   80 +-
 src/gallium/drivers/lima/ir/gp/nir.c          |    5 +-
 src/gallium/drivers/lima/ir/gp/node.c         |    2 -
 .../drivers/lima/ir/gp/physical_regalloc.c    |  135 --
 .../ir/gp/{value_regalloc.c => regalloc.c}    |   85 +-
 src/gallium/drivers/lima/ir/gp/scheduler.c    | 1379 +++++++++++++----
 src/gallium/drivers/lima/meson.build          |    3 +-
 9 files changed, 1187 insertions(+), 560 deletions(-)
 delete mode 100644 src/gallium/drivers/lima/ir/gp/physical_regalloc.c
 rename src/gallium/drivers/lima/ir/gp/{value_regalloc.c => regalloc.c} (57%)

diff --git a/src/gallium/drivers/lima/ir/gp/codegen.c b/src/gallium/drivers/lima/ir/gp/codegen.c
index 1b7c8903c97..9bc279e0119 100644
--- a/src/gallium/drivers/lima/ir/gp/codegen.c
+++ b/src/gallium/drivers/lima/ir/gp/codegen.c
@@ -76,9 +76,13 @@ static gpir_codegen_src gpir_get_alu_input(gpir_node *parent, gpir_node *child)
          gpir_codegen_src_load_w, gpir_codegen_src_unused, gpir_codegen_src_unused },
    };
 
-   assert(child->sched.instr - parent->sched.instr < 3);
+   int diff = child->sched.instr->index - parent->sched.instr->index;
+   assert(diff < 3);
+   assert(diff >= 0);
 
-   return slot_to_src[child->sched.pos][child->sched.instr - parent->sched.instr];
+   int src = slot_to_src[child->sched.pos][diff];
+   assert(src != gpir_codegen_src_unused);
+   return src;
 }
 
 static void gpir_codegen_mul0_slot(gpir_codegen_instr *code, gpir_instr *instr)
diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h
index e7f199033cd..6a1d533977b 100644
--- a/src/gallium/drivers/lima/ir/gp/gpir.h
+++ b/src/gallium/drivers/lima/ir/gp/gpir.h
@@ -131,8 +131,6 @@ typedef struct {
       GPIR_DEP_OFFSET,    /* def is the offset of use (i.e. temp store) */
       GPIR_DEP_READ_AFTER_WRITE,
       GPIR_DEP_WRITE_AFTER_READ,
-      GPIR_DEP_VREG_READ_AFTER_WRITE,
-      GPIR_DEP_VREG_WRITE_AFTER_READ,
    } type;
 
    /* node execute before succ */
@@ -146,6 +144,9 @@ typedef struct {
    struct list_head succ_link;
 } gpir_dep;
 
+struct gpir_instr;
+struct gpir_store_node;
+
 typedef struct gpir_node {
    struct list_head list;
    gpir_op op;
@@ -165,12 +166,14 @@ typedef struct gpir_node {
    int value_reg;
    union {
       struct {
-         int instr;
+         struct gpir_instr *instr;
+         struct gpir_store_node *physreg_store;
          int pos;
          int dist;
          int index;
          bool ready;
          bool inserted;
+         bool max_node, next_max_node;
       } sched;
       struct {
          int parent_index;
@@ -223,7 +226,7 @@ typedef struct {
    struct list_head reg_link;
 } gpir_load_node;
 
-typedef struct {
+typedef struct gpir_store_node {
    gpir_node node;
 
    unsigned index;
@@ -266,14 +269,43 @@ enum gpir_instr_slot {
    GPIR_INSTR_SLOT_DIST_TWO_END   = GPIR_INSTR_SLOT_PASS,
 };
 
-typedef struct {
+typedef struct gpir_instr {
    int index;
    struct list_head list;
 
    gpir_node *slots[GPIR_INSTR_SLOT_NUM];
 
+   /* The number of ALU slots free for moves. */
    int alu_num_slot_free;
+
+   /* The number of ALU slots free for moves, except for the complex slot. */
+   int alu_non_cplx_slot_free;
+
+   /* We need to make sure that we can insert moves in the following cases:
+    * (1) There was a use of a value two cycles ago.
+    * (2) There were more than 5 uses of a value 1 cycle ago (or else we can't
+    *     possibly satisfy (1) for the next cycle).
+    * (3) There is a store instruction scheduled, but not its child.
+    *
+    * The complex slot cannot be used for a move in case (1), since it only
+    * has a FIFO depth of 1, but it can be used for (2) and (3). In order to
+    * ensure that we have enough space for all three, we maintain the
+    * following invariants:
+    *
+    * (1) alu_num_slot_free >= alu_num_slot_needed_by_store +
+    *       alu_num_slot_needed_by_max +
+    *       alu_num_slot_needed_by_next_max
+    * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max
+    */
    int alu_num_slot_needed_by_store;
+   int alu_num_slot_needed_by_max;
+   int alu_num_slot_needed_by_next_max;
+
+   /* Used to communicate to the scheduler how many slots need to be cleared
+    * up in order to satisfy the invariants.
+    */
+   int slot_difference;
+   int non_cplx_slot_difference;
 
    int reg0_use_count;
    bool reg0_is_attr;
@@ -387,18 +419,12 @@ bool gpir_instr_try_insert_node(gpir_instr *instr, gpir_node *node);
 void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node);
 void gpir_instr_print_prog(gpir_compiler *comp);
 
-static inline bool gpir_instr_alu_slot_is_full(gpir_instr *instr)
-{
-   return instr->alu_num_slot_free <= instr->alu_num_slot_needed_by_store;
-}
-
 bool gpir_codegen_acc_same_op(gpir_op op1, gpir_op op2);
 
 bool gpir_pre_rsched_lower_prog(gpir_compiler *comp);
 bool gpir_post_rsched_lower_prog(gpir_compiler *comp);
 bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp);
-bool gpir_value_regalloc_prog(gpir_compiler *comp);
-bool gpir_physical_regalloc_prog(gpir_compiler *comp);
+bool gpir_regalloc_prog(gpir_compiler *comp);
 bool gpir_schedule_prog(gpir_compiler *comp);
 bool gpir_codegen_prog(gpir_compiler *comp);
 
diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c
index d3190d39d4c..228d1459280 100644
--- a/src/gallium/drivers/lima/ir/gp/instr.c
+++ b/src/gallium/drivers/lima/ir/gp/instr.c
@@ -36,6 +36,7 @@ gpir_instr *gpir_instr_create(gpir_block *block)
 
    instr->index = block->sched.instr_index++;
    instr->alu_num_slot_free = 6;
+   instr->alu_non_cplx_slot_free = 5;
 
    list_add(&instr->list, &block->instr_list);
    return instr;
@@ -85,6 +86,11 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
       return false;
 
    int consume_slot = gpir_instr_get_consume_slot(instr, node);
+   int non_cplx_consume_slot =
+      node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot;
+   int store_reduce_slot = 0;
+   int max_reduce_slot = node->sched.max_node ? 1 : 0;
+   int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0;
 
    /* check if this node is child of one store node.
     * complex1 won't be any of this instr's store node's child,
@@ -93,25 +99,40 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    for (int i = GPIR_INSTR_SLOT_STORE0; i <= GPIR_INSTR_SLOT_STORE3; i++) {
       gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
       if (s && s->child == node) {
-         /* acc node may consume 2 slots, so even it's the child of a
-          * store node, it may not be inserted successfully, in which
-          * case we need a move node for it */
-         if (instr->alu_num_slot_free - consume_slot <
-             instr->alu_num_slot_needed_by_store - 1)
-            return false;
-
-         instr->alu_num_slot_needed_by_store--;
-         instr->alu_num_slot_free -= consume_slot;
-         return true;
+         store_reduce_slot = 1;
+         break;
       }
    }
 
-   /* not a child of any store node, so must reserve alu slot for store node */
-   if (instr->alu_num_slot_free - consume_slot <
-       instr->alu_num_slot_needed_by_store)
+   /* Check that the invariants will be maintained after we adjust everything
+    */
+
+   int slot_difference = 
+       instr->alu_num_slot_needed_by_store - store_reduce_slot +
+       instr->alu_num_slot_needed_by_max - max_reduce_slot +
+       MAX2(instr->alu_num_slot_needed_by_next_max - next_max_reduce_slot, 0) -
+      (instr->alu_num_slot_free - consume_slot);
+   if (slot_difference > 0) {
+      gpir_debug("failed %d because of alu slot\n", node->index);
+      instr->slot_difference = slot_difference;
+   }
+
+   int non_cplx_slot_difference =
+       instr->alu_num_slot_needed_by_max - max_reduce_slot -
+       (instr->alu_non_cplx_slot_free - non_cplx_consume_slot);
+   if (non_cplx_slot_difference > 0) {
+      gpir_debug("failed %d because of alu slot\n", node->index);
+      instr->non_cplx_slot_difference = non_cplx_slot_difference;
+   }
+
+   if (slot_difference > 0 || non_cplx_slot_difference > 0)
       return false;
 
    instr->alu_num_slot_free -= consume_slot;
+   instr->alu_non_cplx_slot_free -= non_cplx_consume_slot;
+   instr->alu_num_slot_needed_by_store -= store_reduce_slot;
+   instr->alu_num_slot_needed_by_max -= max_reduce_slot;
+   instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot;
    return true;
 }
 
@@ -123,12 +144,17 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node)
       gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
       if (s && s->child == node) {
          instr->alu_num_slot_needed_by_store++;
-         instr->alu_num_slot_free += consume_slot;
-         return;
+         break;
       }
    }
 
    instr->alu_num_slot_free += consume_slot;
+   if (node->sched.pos != GPIR_INSTR_SLOT_COMPLEX)
+      instr->alu_non_cplx_slot_free += consume_slot;
+   if (node->sched.max_node)
+      instr->alu_num_slot_needed_by_max++;
+   if (node->sched.next_max_node)
+      instr->alu_num_slot_needed_by_next_max++;
 }
 
 static bool gpir_instr_insert_reg0_check(gpir_instr *instr, gpir_node *node)
@@ -269,12 +295,18 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
          goto out;
    }
 
-   /* no store node has the same child as this node, and child is not
-    * already in this instr's alu slot, so instr must have some free
-    * alu slot to insert this node's child
+   /* Check the invariants documented in gpir.h, similar to the ALU case.
+    * Since the only thing that changes is alu_num_slot_needed_by_store, we
+    * can get away with just checking the first one.
     */
-   if (gpir_instr_alu_slot_is_full(instr))
+   int slot_difference = instr->alu_num_slot_needed_by_store + 1
+      + instr->alu_num_slot_needed_by_max +
+      MAX2(instr->alu_num_slot_needed_by_next_max, 0) -
+      instr->alu_num_slot_free;
+   if (slot_difference > 0) {
+      instr->slot_difference = slot_difference;
       return false;
+   }
 
    instr->alu_num_slot_needed_by_store++;
 
@@ -299,6 +331,9 @@ static void gpir_instr_remove_store(gpir_instr *instr, gpir_node *node)
    int other_slot = GPIR_INSTR_SLOT_STORE0 + (component ^ 1);
 
    for (int j = GPIR_INSTR_SLOT_STORE0; j <= GPIR_INSTR_SLOT_STORE3; j++) {
+      if (j == node->sched.pos)
+         continue;
+
       gpir_store_node *s = gpir_node_to_store(instr->slots[j]);
       if (s && s->child == store->child)
          goto out;
@@ -369,6 +404,9 @@ static bool gpir_instr_slot_free(gpir_instr *instr, gpir_node *node)
 
 bool gpir_instr_try_insert_node(gpir_instr *instr, gpir_node *node)
 {
+   instr->slot_difference = 0;
+   instr->non_cplx_slot_difference = 0;
+
    if (!gpir_instr_slot_free(instr, node))
       return false;
 
@@ -413,7 +451,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node)
    /* This can happen if we merge duplicate loads in the scheduler. */
    if (instr->slots[node->sched.pos] != node) {
       node->sched.pos = -1;
-      node->sched.instr = -1;
+      node->sched.instr = NULL;
       return;
    }
 
@@ -439,7 +477,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node)
       instr->slots[GPIR_INSTR_SLOT_MUL1] = NULL;
 
    node->sched.pos = -1;
-   node->sched.instr = -1;
+   node->sched.instr = NULL;
 }
 
 void gpir_instr_print_prog(gpir_compiler *comp)
diff --git a/src/gallium/drivers/lima/ir/gp/nir.c b/src/gallium/drivers/lima/ir/gp/nir.c
index 902d27a3149..d1da7ed3754 100644
--- a/src/gallium/drivers/lima/ir/gp/nir.c
+++ b/src/gallium/drivers/lima/ir/gp/nir.c
@@ -422,10 +422,7 @@ bool gpir_compile_nir(struct lima_vs_shader_state *prog, struct nir_shader *nir)
    if (!gpir_post_rsched_lower_prog(comp))
       goto err_out0;
 
-   if (!gpir_value_regalloc_prog(comp))
-      goto err_out0;
-
-   if (!gpir_physical_regalloc_prog(comp))
+   if (!gpir_regalloc_prog(comp))
       goto err_out0;
 
    if (!gpir_schedule_prog(comp))
diff --git a/src/gallium/drivers/lima/ir/gp/node.c b/src/gallium/drivers/lima/ir/gp/node.c
index eda6ae7ed6f..decda5f1246 100644
--- a/src/gallium/drivers/lima/ir/gp/node.c
+++ b/src/gallium/drivers/lima/ir/gp/node.c
@@ -436,8 +436,6 @@ static void gpir_node_print_node(gpir_node *node, int type, int space)
       [GPIR_DEP_OFFSET] = "offset",
       [GPIR_DEP_READ_AFTER_WRITE] = "RaW",
       [GPIR_DEP_WRITE_AFTER_READ] = "WaR",
-      [GPIR_DEP_VREG_READ_AFTER_WRITE] = "vRaW",
-      [GPIR_DEP_VREG_WRITE_AFTER_READ] = "vWaR",
    };
 
    for (int i = 0; i < space; i++)
diff --git a/src/gallium/drivers/lima/ir/gp/physical_regalloc.c b/src/gallium/drivers/lima/ir/gp/physical_regalloc.c
deleted file mode 100644
index 87d88a8f9b7..00000000000
--- a/src/gallium/drivers/lima/ir/gp/physical_regalloc.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2017 Lima Project
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sub license,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include <limits.h>
-
-#include "gpir.h"
-
-/* Linear scan register alloc for physical reg alloc of each
- * load/store node
- */
-
-static void regalloc_print_result(gpir_compiler *comp)
-{
-   if (!(lima_debug & LIMA_DEBUG_GP))
-      return;
-
-   int index = 0;
-   printf("======== physical regalloc ========\n");
-   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
-      list_for_each_entry(gpir_node, node, &block->node_list, list) {
-         if (node->op == gpir_op_load_reg) {
-            gpir_load_node *load = gpir_node_to_load(node);
-            printf("%03d: load %d use reg %d\n", index, node->index, load->reg->index);
-         }
-         else if (node->op == gpir_op_store_reg) {
-            gpir_store_node *store = gpir_node_to_store(node);
-            printf("%03d: store %d use reg %d\n", index, node->index, store->reg->index);
-         }
-         index++;
-      }
-      printf("----------------------------\n");
-   }
-}
-
-bool gpir_physical_regalloc_prog(gpir_compiler *comp)
-{
-   int index = 0;
-   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
-      list_for_each_entry(gpir_node, node, &block->node_list, list) {
-         node->preg.index = index++;
-      }
-   }
-
-   /* calculate each reg liveness interval */
-   list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) {
-      reg->start = INT_MAX;
-      list_for_each_entry(gpir_store_node, store, &reg->defs_list, reg_link) {
-         if (store->node.preg.index < reg->start)
-            reg->start = store->node.preg.index;
-      }
-
-      reg->end = 0;
-      list_for_each_entry(gpir_load_node, load, &reg->uses_list, reg_link) {
-         if (load->node.preg.index > reg->end)
-            reg->end = load->node.preg.index;
-      }
-   }
-
-   /* sort reg list by start value */
-   struct list_head reg_list;
-   list_replace(&comp->reg_list, &reg_list);
-   list_inithead(&comp->reg_list);
-   list_for_each_entry_safe(gpir_reg, reg, &reg_list, list) {
-      struct list_head *insert_pos = &comp->reg_list;
-      list_for_each_entry(gpir_reg, creg, &comp->reg_list, list) {
-         if (creg->start > reg->start) {
-            insert_pos = &creg->list;
-            break;
-         }
-      }
-      list_del(&reg->list);
-      list_addtail(&reg->list, insert_pos);
-   }
-
-   /* do linear scan reg alloc */
-   gpir_reg *active[GPIR_PHYSICAL_REG_NUM] = {0};
-   list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) {
-      int i;
-
-      /* if some reg is expired */
-      for (i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
-         if (active[i] && active[i]->end <= reg->start)
-            active[i] = NULL;
-      }
-
-      /* find a free reg value for this reg */
-      for (i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
-         if (!active[i]) {
-            active[i] = reg;
-            reg->index = i;
-            break;
-         }
-      }
-
-      /* TODO: support spill to temp memory */
-      assert(i < GPIR_PHYSICAL_REG_NUM);
-   }
-
-   /* update load/store node info for the real reg */
-   list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) {
-      list_for_each_entry(gpir_store_node, store, &reg->defs_list, reg_link) {
-         store->index = reg->index >> 2;
-         store->component = reg->index % 4;
-      }
-
-      list_for_each_entry(gpir_load_node, load, &reg->uses_list, reg_link) {
-         load->index = reg->index >> 2;
-         load->index = reg->index % 4;
-      }
-   }
-
-   regalloc_print_result(comp);
-   return true;
-}
diff --git a/src/gallium/drivers/lima/ir/gp/value_regalloc.c b/src/gallium/drivers/lima/ir/gp/regalloc.c
similarity index 57%
rename from src/gallium/drivers/lima/ir/gp/value_regalloc.c
rename to src/gallium/drivers/lima/ir/gp/regalloc.c
index f633b949932..c145bfbee81 100644
--- a/src/gallium/drivers/lima/ir/gp/value_regalloc.c
+++ b/src/gallium/drivers/lima/ir/gp/regalloc.c
@@ -24,60 +24,17 @@
 
 #include "gpir.h"
 
-/* Linear scan register alloc for value reg alloc of each node */
-
-static int regalloc_spill_active_node(gpir_node *active[])
-{
-   gpir_node *spill = NULL;
-   for (int i = 0; i < GPIR_VALUE_REG_NUM; i++) {
-      if (gpir_op_infos[active[i]->op].spillless)
-         continue;
-
-      /* spill farest node */
-      if (!spill ||
-          spill->vreg.last->vreg.index < active[i]->vreg.last->vreg.index) {
-         spill = active[i];
-      }
-   }
-
-   assert(spill);
-   gpir_debug("value regalloc spill node %d for value reg %d\n",
-              spill->index, spill->value_reg);
-
-   /* create store node for spilled node */
-   gpir_store_node *store = gpir_node_create(spill->block, gpir_op_store_reg);
-   store->child = spill;
-   /* no need to calculate other vreg values because store & spill won't
-    * be used in the following schedule again */
-   store->node.value_reg = spill->value_reg;
-   list_addtail(&store->node.list, &spill->list);
-
-   gpir_reg *reg = gpir_create_reg(spill->block->comp);
-   store->reg = reg;
-   list_addtail(&store->reg_link, &reg->defs_list);
-
-   gpir_node_foreach_succ_safe(spill, dep) {
-      gpir_node *succ = dep->succ;
-      gpir_load_node *load = gpir_node_create(succ->block, gpir_op_load_reg);
-      gpir_node_replace_pred(dep, &load->node);
-      gpir_node_replace_child(succ, spill, &load->node);
-      list_addtail(&load->node.list, &succ->list);
-
-      /* only valid for succ already scheduled, succ not scheduled will
-       * re-write this value */
-      load->node.value_reg = spill->value_reg;
-      load->node.vreg.index =
-         (list_first_entry(&load->node.list, gpir_node, list)->vreg.index +
-          list_last_entry(&load->node.list, gpir_node, list)->vreg.index) / 2.0f;
-      load->node.vreg.last = succ;
-
-      load->reg = reg;
-      list_addtail(&load->reg_link, &reg->uses_list);
-   }
-
-   gpir_node_add_dep(&store->node, spill, GPIR_DEP_INPUT);
-   return spill->value_reg;
-}
+/* Register allocation
+ *
+ * TODO: This needs to be rewritten when we support multiple basic blocks. We
+ * need to do proper liveness analysis, combined with either linear scan,
+ * graph coloring, or SSA-based allocation. We should also support spilling to
+ * temporaries.
+ *
+ * For now, this only assigns fake registers to values, used to build the fake
+ * dependencies that the scheduler relies on. In the future we should also be
+ * assigning actual physreg numbers to load_reg/store_reg nodes.
+ */
 
 static void regalloc_block(gpir_block *block)
 {
@@ -99,7 +56,7 @@ static void regalloc_block(gpir_block *block)
 
    /* do linear scan regalloc */
    int reg_search_start = 0;
-   gpir_node *active[GPIR_VALUE_REG_NUM] = {0};
+   gpir_node *active[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};
    list_for_each_entry(gpir_node, node, &block->node_list, list) {
       /* if some reg is expired */
       gpir_node_foreach_pred(node, dep) {
@@ -116,9 +73,9 @@ static void regalloc_block(gpir_block *block)
 
       /* find a free reg for this node */
       int i;
-      for (i = 0; i < GPIR_VALUE_REG_NUM; i++) {
+      for (i = 0; i < GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; i++) {
          /* round robin reg select to reduce false dep when schedule */
-         int reg = (reg_search_start + i) % GPIR_VALUE_REG_NUM;
+         int reg = (reg_search_start + i) % (GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM);
          if (!active[reg]) {
             active[reg] = node;
             node->value_reg = reg;
@@ -127,14 +84,8 @@ static void regalloc_block(gpir_block *block)
          }
       }
 
-      /* need spill */
-      if (i == GPIR_VALUE_REG_NUM) {
-         int spilled_reg = regalloc_spill_active_node(active);
-         active[spilled_reg] = node;
-         node->value_reg = spilled_reg;
-         gpir_debug("value regalloc node %d reuse reg %d\n",
-                    node->index, spilled_reg);
-      }
+      /* TODO: spill */
+      assert(i != GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM);
    }
 }
 
@@ -144,7 +95,7 @@ static void regalloc_print_result(gpir_compiler *comp)
       return;
 
    int index = 0;
-   printf("======== value regalloc ========\n");
+   printf("======== regalloc ========\n");
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
       list_for_each_entry(gpir_node, node, &block->node_list, list) {
          printf("%03d: %d/%d %s ", index++, node->index, node->value_reg,
@@ -159,7 +110,7 @@ static void regalloc_print_result(gpir_compiler *comp)
    }
 }
 
-bool gpir_value_regalloc_prog(gpir_compiler *comp)
+bool gpir_regalloc_prog(gpir_compiler *comp)
 {
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
       regalloc_block(block);
diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c
index 3d7e640ada2..60076daa3c4 100644
--- a/src/gallium/drivers/lima/ir/gp/scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/scheduler.c
@@ -27,58 +27,196 @@
 #include "gpir.h"
 
 /*
- * GP schedule algorithm (by Connor Abbott <cwabbott0@gmail.com>)
+ * GP scheduling algorithm (by Connor Abbott <cwabbott0@gmail.com>)
+ * 
+ * The GP pipeline has three main stages:
  *
- * Pre schedule phase:
- * 1. order all nodes in a sequence
- * 2. convert the real reg read/write to GP load/store node, now all
- *    variable is SSA
- * 3. do reg alloc for all SSA with 11 reg (value reg) and spill with
- *    load/store to real reg if needed
- * 4. add fake dependency like this:
- *    after step 3, node sequence is
- *      01: r1=r2+r3
- *      02: r4=r1+r2
- *      03: r1=r5+r6
- *    we should add a fake dependency of node 3 to node 2 like a
- *    write-after-read dep. But this is not really write-after-read
- *    dep because there's no r1 really, because it's a value register.
- *    We need this fake dep in the schedule phase to make sure in any
- *    schedule point, there're only <=11 input needed by the past
- *    scheduled nodes.
- * 5. build DAG according to all the real and fake dep
+ * --------------------------------------------------------
+ * |                                                      |
+ * |               Register/Attr/Temp Fetch               |
+ * |                                                      |
+ * --------------------------------------------------------
+ * |        |        |        |        |        |         |
+ * |  Mul0  |  Mul1  |  Add0  |  Add1  |  Cplx  |  Pass   |
+ * |        |        |        |        |        |         |
+ * --------------------------------------------------------
+ * |                 |                          |         |
+ * |    Complex1     |   Temp/Register/Varying  |  Pass   |
+ * |    Stage 2      |         Store            | Stage 2 |
+ * |                 |                          |         |
+ * --------------------------------------------------------
  *
- * Schedule phase:
- * 1. Compute the nodes ready to schedule, if no nodes, exit
- * 2. Create a new GP instruction, and call it as current instr
- * 3. For any nodes with a use 2 cycles ago with a definition ready to
- *    schedule, schedule that definition immediately if possible, or else
- *    schedule a move.
- * 4. For any nodes with a use 2 cycles ago but the definition not
- *    scheduled and not ready to schedule, schedule a move immediately
- *    to prevent the value from falling off the queue.
- * 5. Calculate the number of remaining nodes with a use 1 cycle ago but
- *    the definition not yet scheduled, and if there are more than 5,
- *    schedule moves or definitions for the rest now.
- * 6. Schedule the rest of the available nodes using your favorite heuristic
- *    to current instr.
- * 7. go to step 1
+ * Because of this setup, storing a register has a latency of three cycles.
+ * Also, the register file is organized into 4-component vectors, and the
+ * load stage can only load two vectors at a time. Aside from these highly
+ * constrained register load/store units, there is an explicit bypass
+ * network, where each unit (mul0/mul1/etc.) can access the results of the
+ * any unit from the previous two cycles directly, except for the complex
+ * unit whose result can only be accessed for one cycle (since it's expected
+ * to be used directly by the complex2 instruction in the following cycle).
  *
- * Step 5 for the current instruction guarantees that steps 3 and 4 for
- * the next instruction will always succeed, so it's only step 5 that can
- * possibly fail. Now, note that the nodes whose definitions have not yet
- * been scheduled but one or more use has been scheduled, are exactly the
- * nodes that are live in the final schedule. Therefore there will never
- * be more than 11 of them (guarenteed by the 11 value reg alloc and the
- * fake dep added before schedule). The worst case for step 5 is that all of
- * these nodes had a use 1 cycle ago, which means that none of them hit
- * case 3 or 4 already, so there are 6 slots still available so step 5
- * will always succeed. In general, even if there are exactly 11 values
- * live, if n are scheduled in steps 3 and 4, there are 11-n left in step
- * 4 so at most 11-n-5 = 6-n are scheduled in step 5 and therefore 6 are
- * scheduled total, below the limit. So the algorithm will always succeed.
+ * Because of the very restricted register file, and because only rarely are
+ * all the units in use at the same time, it can be very beneficial to use
+ * the unused units to "thread" a value from source to destination by using
+ * moves in the otherwise-unused units, without involving the register file
+ * at all. It's very difficult to fully exploit this with a traditional
+ * scheduler, so we need to do something a little un-traditional. The 512
+ * instruction limit means that for more complex shaders, we need to do as
+ * well as possible or else the app won't even work.
+ *
+ * The scheduler works by considering the bypass network as a kind of
+ * register file. It's a quite unusual register file, since registers have to
+ * be assigned "on the fly" as we schedule operations, but with some care, we
+ * can use something conceptually similar to a linear-scan allocator to
+ * successfully schedule nodes to instructions without running into
+ * conflicts.
+ *
+ * Values in the IR are separated into normal values, or "value registers",
+ * which is what normal nodes like add, mul, etc. produce, and which only
+ * live inside one basic block, and registers, which can span multiple basic
+ * blocks but have to be accessed via special load_reg/store_reg nodes. RA
+ * assigns physical registers to both value registers and normal registers,
+ * treating load_reg/store_reg as a move instruction, but these are only used
+ * directly for normal registers -- the physreg assigned to a value register
+ * is "fake," and is only used inside the scheduler. Before scheduling we
+ * insert read-after-write dependencies, even for value registers, as if
+ * we're going to use those, but then we throw them away. For example, if we
+ * had something like:
+ *
+ * (*)r2 = add (*)r1, (*)r2
+ * (*)r1 = load_reg r0
+ *
+ * we'd insert a write-after-read dependency between the add and load_reg,
+ * even though the starred registers aren't actually used by the scheduler
+ * after this step. This step is crucial since it guarantees that during any
+ * point in the schedule, the number of live registers + live value registers
+ * will never exceed the capacity of the register file and the bypass network
+ * combined. This is because each live register/value register will have a
+ * different fake number, thanks to the fake dependencies inserted before
+ * scheduling. This allows us to not have to worry about spilling to
+ * temporaries, which is only done ahead of time.
+ *
+ * The scheduler is a bottom-up scheduler. It keeps track of each live value
+ * register, and decides on-the-fly which value registers to keep in the
+ * bypass network and which to "spill" to registers. Of particular importance
+ * is the "ready list," which consists of "input nodes" (nodes that produce a
+ * value that can be consumed via the bypass network), both "partially ready"
+ * (only some of the uses have been scheduled) and "fully ready" (all uses
+ * have been scheduled), as well as other non-input nodes like register
+ * stores. Each input node on the ready list represents a live value register
+ * before the current instruction. There must be at most 11 such input nodes
+ * at all times, since there are only 11 slots in the next two instructions
+ * which can reach the current instruction.
+ *
+ * An input node is a "max node" if it has a use two cycles ago, which must be
+ * connected to a definition this cycle. Otherwise it may be a "next max node"
+ * if it will be a max node on the next instruction (i.e. it has a use at most
+ * one cycle ago), or it may be neither if all of its uses are this cycle. As
+ * we keep adding instructions to the front, input nodes graduate from
+ * neither, to next max, to max, unless we decide to insert a move to keep it
+ * alive longer, at which point any uses after the current instruction are
+ * rewritten to be uses of the move so that the original node returns to
+ * neither. The scheduler decides which nodes to try freely, but we have to
+ * reserve slots for two different reasons: (1) out of the 5 non-complex
+ * slots, we reserve a slot for each max node, so that we can connect a
+ * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
+ * slot for every next-max node above 5, so that for the next instruction
+ * there are no more than 5 max nodes. When a max or next-max node gets
+ * scheduled, the corresponding reservation is reduced by one. At the end, we
+ * insert moves for every slot that was reserved. The reservation is actually
+ * managed by nir_instr, and all we have to do is tell it how many to reserve
+ * at the beginning and then tell it which nodes are max/next-max nodes. When
+ * we start scheduling an instruction, there will be at most 5 max nodes
+ * thanks to the previous instruction's next-max reservation/move insertion.
+ * Since there are at most 11 total input nodes, if there are N max nodes,
+ * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 =
+ * 6 - N slots need to be reserved for next-max nodes, and so at most
+ * 6 - N + N = 6 slots need to be reserved in total, exactly the total number
+ * of slots. So, thanks to the total input node restriction, we will never
+ * need to reserve too many slots.
+ *
+ * It sometimes happens that scheduling a given node will violate this total
+ * input node restriction, or that a reservation will mean that we can't
+ * schedule it. We first schedule a node "speculatively" to see if this is a
+ * problem. If some of the node's sources are loads, then we can schedule
+ * the node and its dependent loads in one swoop to avoid going over the
+ * pressure limit. If that fails, we can try to spill a ready or
+ * partially-ready input node to a register by rewriting all of its uses to
+ * refer to a register load. This removes it from the list of ready and
+ * partially ready input nodes as all of its uses are now unscheduled. If
+ * successful, we can then proceed with scheduling the original node. All of
+ * this happens "speculatively," meaning that afterwards the node is removed
+ * and the entire state of the scheduler is reverted to before it was tried, to
+ * ensure that we never get into an invalid state and run out of spots for
+ * moves. In try_nodes(), we try to schedule each node speculatively on the
+ * ready list, keeping only the nodes that could be successfully scheduled, so
+ * that when we finally decide which node to actually schedule, we know it
+ * will succeed.  This is how we decide on the fly which values go in
+ * registers and which go in the bypass network. Note that "unspilling" a
+ * value is simply a matter of scheduling the store_reg instruction created
+ * when we spill.
+ *
+ * The careful accounting of live value registers, reservations for moves, and
+ * speculative scheduling guarantee that we never run into a failure case
+ * while scheduling. However, we need to make sure that this scheduler will
+ * not get stuck in an infinite loop, i.e. that we'll always make forward
+ * progress by eventually scheduling a non-move node. If we run out of value
+ * registers, then we may have to spill a node to a register. If we
+ * were to schedule one of the fully-ready nodes, then we'd have 11 + N live
+ * value registers before the current instruction. But since there are at most
+ * 64+11 live registers and register values total thanks to the fake
+ * dependencies we inserted before scheduling, there are at most 64 - N live
+ * physical registers, and therefore there are at least N registers available
+ * for spilling. Not all these registers will be available immediately, since
+ * in order to spill a node to a given register we have to ensure that there
+ * are slots available to rewrite every use to a load instruction, and that
+ * may not be the case. There may also be intervening writes which prevent
+ * some registers from being used. However, these are all temporary problems,
+ * since as we create each instruction, we create additional register load
+ * slots that can be freely used for spilling, and we create more move nodes
+ * which means that the uses of the nodes we're trying to spill keep moving
+ * forward. This means that eventually, these problems will go away, at which
+ * point we'll be able to spill a node successfully, so eventually we'll be
+ * able to schedule the first node on the ready list.
  */
 
+typedef struct {
+   /* This is the list of ready and partially-ready nodes. A partially-ready
+    * node must have at least one input dependency already scheduled.
+    */
+   struct list_head ready_list;
+
+   /* The number of ready or partially-ready nodes with at least one input
+    * dependency already scheduled. In other words, the number of live value
+    * registers. This must be at most 11.
+    */
+   int ready_list_slots;
+
+   /* The physical registers live into the current instruction. */
+   uint64_t live_physregs;
+
+   /* The current instruction. */
+   gpir_instr *instr;
+
+   /* The current basic block. */
+   gpir_block *block;
+
+   /* True if at least one node failed to schedule due to lack of available
+    * value registers.
+    */
+   bool try_spill_all;
+
+   /* The number of max nodes needed to spill to successfully schedule the
+    * instruction.
+    */
+   int max_node_spill_needed;
+
+   /* The number of max and next-max nodes needed to spill to successfully
+    * schedule the instruction.
+    */
+   int total_spill_needed;
+} sched_ctx;
+
 static int gpir_min_dist_alu(gpir_dep *dep)
 {
    switch (dep->pred->op) {
@@ -104,8 +242,13 @@ static int gpir_get_min_dist(gpir_dep *dep)
       case gpir_op_store_temp:
       case gpir_op_store_reg:
       case gpir_op_store_varying:
-         /* store must use alu node as input */
-         if (dep->pred->type == gpir_node_type_load)
+         /* Stores must use an alu node as input. Also, complex1 takes two
+          * cycles, which means that its result cannot be stored to a register
+          * as part of the normal path, and therefore it must also have a move
+          * inserted.
+          */
+         if (dep->pred->type == gpir_node_type_load ||
+             dep->pred->op == gpir_op_complex1)
             return INT_MAX >> 2;
          else
             return 0;
@@ -119,44 +262,24 @@ static int gpir_get_min_dist(gpir_dep *dep)
       return gpir_min_dist_alu(dep);
 
    case GPIR_DEP_READ_AFTER_WRITE:
-      switch (dep->succ->op) {
-      case gpir_op_load_temp:
-         assert(dep->pred->op == gpir_op_store_temp);
+      if (dep->succ->op == gpir_op_load_temp &&
+          dep->pred->op == gpir_op_store_temp) {
          return 4;
-      case gpir_op_load_reg:
-         assert(dep->pred->op == gpir_op_store_reg);
+      } else if (dep->succ->op == gpir_op_load_reg &&
+                 dep->pred->op == gpir_op_store_reg) {
          return 3;
-      case gpir_op_load_uniform:
-         assert(dep->pred->op == gpir_op_store_temp_load_off0 ||
-                dep->pred->op == gpir_op_store_temp_load_off1 ||
-                dep->pred->op == gpir_op_store_temp_load_off2);
+      } else if ((dep->pred->op == gpir_op_store_temp_load_off0 ||
+                  dep->pred->op == gpir_op_store_temp_load_off1 ||
+                  dep->pred->op == gpir_op_store_temp_load_off2) &&
+                 dep->succ->op == gpir_op_load_uniform) {
          return 4;
-      default:
-         assert(0);
+      } else {
+         /* Fake dependency */
+         return 0;
       }
 
    case GPIR_DEP_WRITE_AFTER_READ:
-      switch (dep->pred->op) {
-      case gpir_op_load_temp:
-         assert(dep->succ->op == gpir_op_store_temp);
-         return -3;
-      case gpir_op_load_reg:
-         assert(dep->succ->op == gpir_op_store_reg);
-         return -2;
-      case gpir_op_load_uniform:
-         assert(dep->succ->op == gpir_op_store_temp_load_off0 ||
-                dep->succ->op == gpir_op_store_temp_load_off1 ||
-                dep->succ->op == gpir_op_store_temp_load_off2);
-         return -3;
-      default:
-         assert(0);
-      }
-
-   case GPIR_DEP_VREG_WRITE_AFTER_READ:
       return 0;
-
-   case GPIR_DEP_VREG_READ_AFTER_WRITE:
-      assert(0); /* not possible, this is GPIR_DEP_INPUT */
    }
 
    return 0;
@@ -230,13 +353,64 @@ static void schedule_update_distance(gpir_node *node)
       if (pred->sched.dist < 0)
          schedule_update_distance(pred);
 
-      int dist = pred->sched.dist + 1;
+      int dist = pred->sched.dist + gpir_min_dist_alu(dep);
       if (node->sched.dist < dist)
          node->sched.dist = dist;
    }
 }
 
-static void schedule_insert_ready_list(struct list_head *ready_list,
+static bool gpir_is_input_node(gpir_node *node)
+{
+   gpir_node_foreach_succ(node, dep) {
+      if (dep->type == GPIR_DEP_INPUT)
+         return true;
+   }
+   return false;
+}
+
+
+/* Get the number of slots required for a node on the ready list.
+ */
+static int gpir_get_slots_required(gpir_node *node)
+{
+   if (!gpir_is_input_node(node))
+      return 0;
+
+   /* Note that we assume every node only consumes one slot, even dual-slot
+    * instructions. While dual-slot instructions may consume more than one
+    * slot, we can always safely insert a move if it turns out that there
+    * isn't enough space for them. There's the risk that we get stuck in an
+    * infinite loop if all the fully ready nodes are dual-slot nodes, but we
+    * rely on spilling to registers to save us here.
+    */
+   return 1;
+}
+
+static void verify_ready_list(sched_ctx *ctx)
+{
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (!gpir_is_input_node(node)) {
+         assert(node->sched.ready);
+      }
+
+      if (node->sched.ready) {
+         /* Every successor must have been scheduled */
+         gpir_node_foreach_succ(node, dep) {
+            assert(dep->succ->sched.instr);
+         }
+      } else {
+         /* There must be at least one successor that's not scheduled. */
+         bool unscheduled = false;
+         gpir_node_foreach_succ(node, dep) {
+            unscheduled |= !(dep->succ->sched.instr);
+         }
+
+         assert(unscheduled);
+      }
+   }
+}
+
+static void schedule_insert_ready_list(sched_ctx *ctx,
                                        gpir_node *insert_node)
 {
    /* if this node is fully ready or partially ready
@@ -250,7 +424,7 @@ static void schedule_insert_ready_list(struct list_head *ready_list,
    bool ready = true, insert = false;
    gpir_node_foreach_succ(insert_node, dep) {
       gpir_node *succ = dep->succ;
-      if (succ->sched.instr >= 0) {
+      if (succ->sched.instr) {
          if (dep->type == GPIR_DEP_INPUT)
             insert = true;
       }
@@ -265,8 +439,8 @@ static void schedule_insert_ready_list(struct list_head *ready_list,
    if (!insert || insert_node->sched.inserted)
       return;
 
-   struct list_head *insert_pos = ready_list;
-   list_for_each_entry(gpir_node, node, ready_list, list) {
+   struct list_head *insert_pos = &ctx->ready_list;
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
       if (insert_node->sched.dist > node->sched.dist) {
          insert_pos = &node->list;
          break;
@@ -275,6 +449,7 @@ static void schedule_insert_ready_list(struct list_head *ready_list,
 
    list_addtail(&insert_node->list, insert_pos);
    insert_node->sched.inserted = true;
+   ctx->ready_list_slots += gpir_get_slots_required(insert_node);
 }
 
 static int gpir_get_max_start(gpir_node *node)
@@ -284,10 +459,10 @@ static int gpir_get_max_start(gpir_node *node)
    /* find the max start instr constrainted by all successors */
    gpir_node_foreach_succ(node, dep) {
       gpir_node *succ = dep->succ;
-      if (succ->sched.instr < 0)
+      if (!succ->sched.instr)
          continue;
 
-      int start = succ->sched.instr + gpir_get_min_dist(dep);
+      int start = succ->sched.instr->index + gpir_get_min_dist(dep);
       if (start > max_start)
          max_start = start;
    }
@@ -302,10 +477,10 @@ static int gpir_get_min_end(gpir_node *node)
    /* find the min end instr constrainted by all successors */
    gpir_node_foreach_succ(node, dep) {
       gpir_node *succ = dep->succ;
-      if (succ->sched.instr < 0)
+      if (!succ->sched.instr)
          continue;
 
-      int end = succ->sched.instr + gpir_get_max_dist(dep);
+      int end = succ->sched.instr->index + gpir_get_max_dist(dep);
       if (end < min_end)
          min_end = end;
    }
@@ -330,11 +505,24 @@ static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
    return NULL;
 }
 
-static bool schedule_try_place_node(gpir_instr *instr, gpir_node *node)
+/* Simply place the node into the given instruction without trying to deal
+ * with liveness or the ready list. This will only fail if the instruction
+ * cannot be placed due to a lack of available slots. In addition to normal
+ * node placement, this is also used for placing loads when spilling to
+ * registers.
+ */
+static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
 {
    if (node->type == gpir_node_type_load) {
       gpir_node *load = gpir_sched_instr_has_load(instr, node);
       if (load) {
+         /* This node may have a store as a successor, in which case we have to
+          * fail it exactly like below in order to later create a move node in
+          * between.
+          */
+         if (instr->index < gpir_get_max_start(node))
+            return false;
+
          gpir_debug("same load %d in instr %d for node %d\n",
                     load->index, instr->index, node->index);
 
@@ -345,23 +533,100 @@ static bool schedule_try_place_node(gpir_instr *instr, gpir_node *node)
       }
    }
 
-   node->sched.instr = instr->index;
+   node->sched.instr = instr;
 
+   int max_node_spill_needed = INT_MAX;
+   int total_spill_needed = INT_MAX;
    int *slots = gpir_op_infos[node->op].slots;
    for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
       node->sched.pos = slots[i];
-      if (node->sched.instr >= gpir_get_max_start(node) &&
-          node->sched.instr <= gpir_get_min_end(node) &&
+      if (instr->index >= gpir_get_max_start(node) &&
+          instr->index <= gpir_get_min_end(node) &&
           gpir_instr_try_insert_node(instr, node))
          return true;
+      if (ctx->instr->non_cplx_slot_difference ||
+          ctx->instr->slot_difference) {
+         /* If one of these fields is non-zero, then we could insert the node
+          * here after spilling. To get an accurate count of how many nodes we
+          * need to spill, we need to choose one of the positions where there
+          * were nonzero slot differences, preferably one with the smallest
+          * difference (so we don't have to spill as much).
+          */
+         if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed ||
+             ctx->instr->slot_difference < total_spill_needed) {
+            max_node_spill_needed = ctx->instr->non_cplx_slot_difference;
+            total_spill_needed = ctx->instr->slot_difference;
+         }
+      }
    }
 
-   node->sched.instr = -1;
+   if (max_node_spill_needed != INT_MAX) {
+      /* Indicate how many spill nodes are needed. */
+      ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
+                                        max_node_spill_needed);
+      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
+                                     total_spill_needed);
+   }
+   node->sched.instr = NULL;
    node->sched.pos = -1;
    return false;
 }
 
-static gpir_node *schedule_create_move_node(gpir_node *node)
+/* Try to place just the node given, updating the ready list. If "speculative"
+ * is true, then this is part ofthe pre-commit phase. If false, then we have
+ * committed to placing this node, so update liveness and ready list
+ * information.
+ */
+
+static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
+                                    bool speculative)
+{
+   if (!_try_place_node(ctx, ctx->instr, node)) {
+      if (!speculative)
+         gpir_debug("failed to place %d\n", node->index);
+      return false;
+   }
+
+   ctx->ready_list_slots -= gpir_get_slots_required(node);
+
+   if (!speculative) {
+      gpir_debug("placed node %d\n", node->index);
+
+      /* We assume here that writes are placed before reads. If this changes,
+       * then this needs to be updated.
+       */
+      if (node->op == gpir_op_store_reg) {
+         gpir_store_node *store = gpir_node_to_store(node);
+         ctx->live_physregs &=
+            ~(1ull << (4 * store->index + store->component));
+         if (store->child->sched.physreg_store == store)
+            store->child->sched.physreg_store = NULL;
+      }
+
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         ctx->live_physregs |= 
+            (1ull << (4 * load->index + load->component));
+      }
+
+      list_del(&node->list);
+      list_add(&node->list, &ctx->block->node_list);
+      gpir_node_foreach_pred(node, dep) {
+         gpir_node *pred = dep->pred;
+         schedule_insert_ready_list(ctx, pred);
+      }
+   } else {
+      gpir_node_foreach_pred(node, dep) {
+         gpir_node *pred = dep->pred;
+         if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
+            ctx->ready_list_slots += gpir_get_slots_required(pred);
+      }
+   }
+
+   return true;
+}
+
+static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
 {
    gpir_alu_node *move = gpir_node_create(node->block, gpir_op_mov);
    if (unlikely(!move))
@@ -370,187 +635,712 @@ static gpir_node *schedule_create_move_node(gpir_node *node)
    move->children[0] = node;
    move->num_child = 1;
 
-   move->node.sched.instr = -1;
+   move->node.sched.instr = NULL;
    move->node.sched.pos = -1;
    move->node.sched.dist = node->sched.dist;
+   move->node.sched.max_node = node->sched.max_node;
+   move->node.sched.next_max_node = node->sched.next_max_node;
 
    gpir_debug("create move %d for %d\n", move->node.index, node->index);
+
+   ctx->ready_list_slots--;
+   list_del(&node->list);
+   node->sched.max_node = false;
+   node->sched.next_max_node = false;
+   node->sched.ready = false;
+   node->sched.inserted = false;
+   gpir_node_replace_succ(&move->node, node);
+   gpir_node_add_dep(&move->node, node, GPIR_DEP_INPUT);
+   schedule_insert_ready_list(ctx, &move->node);
    return &move->node;
 }
 
-static gpir_node *gpir_sched_node(gpir_instr *instr, gpir_node *node)
+
+/* Once we schedule the successor, would the predecessor be fully ready? */
+static bool pred_almost_ready(gpir_dep *dep)
 {
-   if (node->op == gpir_op_mov) {
-      gpir_node *child = gpir_node_to_alu(node)->children[0];
-      gpir_node_foreach_succ_safe(node, dep) {
-         gpir_node *succ = dep->succ;
-         if (succ->sched.instr < 0 ||
-             instr->index < succ->sched.instr + gpir_get_min_dist(dep)) {
-            gpir_node_replace_pred(dep, child);
-            if (dep->type == GPIR_DEP_INPUT)
-               gpir_node_replace_child(succ, node, child);
+   bool fully_ready = true;
+   gpir_node_foreach_succ(dep->pred, other_dep) {
+      gpir_node *succ = other_dep->succ;
+      if (!succ->sched.instr && dep->succ != other_dep->succ) {
+         fully_ready = false;
+         break;
+      }
+   }
+
+   return fully_ready;
+}
+
+/* Recursively try to schedule a node and all its dependent nodes that can fit
+ * in the same  instruction. There is a simple heuristic scoring system to try
+ * to group together nodes that load different components of the same input,
+ * to avoid bottlenecking for operations like matrix multiplies that are
+ * mostly input-bound.
+ */
+
+static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
+{
+   if (!schedule_try_place_node(ctx, node, speculative))
+      return INT_MIN;
+
+   int score = 0;
+
+   gpir_node_foreach_pred(node, dep) {
+      if (!gpir_is_input_node(dep->pred))
+         continue;
+
+      int pred_score = INT_MIN;
+      if (pred_almost_ready(dep)) {
+         if (dep->pred->type == gpir_node_type_load || 
+             node->type == gpir_node_type_store) {
+            pred_score = _schedule_try_node(ctx, dep->pred, speculative);
+         }
+      }
+      if (dep->pred->type == gpir_node_type_load || 
+          node->type == gpir_node_type_store) {
+         if (pred_score == INT_MIN) {
+            if (node->op == gpir_op_mov) {
+               /* The only moves on the ready list are for loads that we
+                * couldn't schedule immediately, as created below. If we
+                * couldn't schedule the load, there's no point scheduling
+                * the move. The normal move threading logic will ensure
+                * that another move is created if we're about to go too far
+                * from the uses of this move.
+                */
+               assert(speculative);
+               return INT_MIN;
+            } else if (!speculative && dep->pred->type == gpir_node_type_load) {
+               /* We couldn't schedule the load right away, so it will have
+                * to happen in some earlier instruction and then be moved
+                * into a value register and threaded to the use by "node".
+                * We create the move right away, so that later we'll fail
+                * to schedule it if there isn't a slot for a move
+                * available.
+                */
+               create_move(ctx, dep->pred);
+            }
+            /* Penalize nodes whose dependent ops we couldn't schedule.
+             */
+            score--;
+         } else {
+            score += pred_score;
+            continue;
          }
       }
-      MAYBE_UNUSED bool result = schedule_try_place_node(instr, node);
-      assert(result);
-      return node;
    }
-   else {
-      gpir_node *move = schedule_create_move_node(node);
+
+   return score;
+}
+
+/* If we speculatively tried a node, undo everything.
+ */
+
+static void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
+{
+   gpir_instr_remove_node(ctx->instr, node);
+
+   gpir_node_foreach_pred(node, dep) {
+      gpir_node *pred = dep->pred;
+      if (pred->sched.instr) {
+         schedule_undo_node(ctx, pred);
+      }
+   }
+}
+
+/* Try to schedule a node. We also try to schedule any predecessors that can
+ * be part of the same instruction. If "speculative" is true, then we don't
+ * actually change any state, only returning the score were the node to be
+ * scheduled, with INT_MIN meaning "cannot be scheduled at all".
+ */
+static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
+{
+   int prev_slots = ctx->ready_list_slots;
+
+   int score = _schedule_try_node(ctx, node, speculative);
+
+   if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
+      assert(speculative);
+      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
+                                     ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
+      score = INT_MIN;
+   }
+
+   if (speculative) {
+      ctx->ready_list_slots = prev_slots;
+      if (node->sched.instr)
+         schedule_undo_node(ctx, node);
+   }
+
+   return score;
+}
+
+/* This is called when we want to spill "node" by inserting loads at its uses.
+ * It returns all the possible registers we can use so that all the loads will
+ * successfully be inserted. Also return the first instruction we'll need to
+ * insert a load for.
+ */
+
+static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
+                                   int *min_index)
+{
+   uint64_t available = ~0ull;
+   gpir_node_foreach_succ(node, dep) {
+      if (dep->type != GPIR_DEP_INPUT)
+         continue;
+
+      gpir_node *use = dep->succ;
+      gpir_instr *instr = use->sched.instr;
+
+      if (!instr) {
+         /* This use isn't scheduled, so no need to spill it. */
+         continue;
+      }
+
+      if (use->type == gpir_node_type_store) {
+         /* We're trying to spill something that was recently stored... just
+          * bail out.
+          */
+         return 0;
+      }
+
+      if (use->op == gpir_op_mov && instr == ctx->instr) {
+         /* We try to spill the sources of this move, so we can free up space
+          * in the current instruction.
+          *
+          * TODO: should we go back further? It might let us schedule the
+          * write earlier in some cases, but then we might fail to spill.
+          */
+         available &= get_available_regs(ctx, use, min_index);
+      } else {
+         if (instr->index < *min_index)
+            *min_index = instr->index;
+
+         uint64_t use_available = 0;
+
+         if (instr->reg0_use_count == 0)
+            use_available = ~0ull;
+         else if (!instr->reg0_is_attr)
+            use_available = 0xf << (4 * instr->reg0_index);
+
+         if (instr->reg1_use_count == 0)
+            use_available = ~0ull;
+         else
+            use_available |= 0xf << (4 * instr->reg1_index);
+
+         available &= use_available;
+      }
+   }
+
+   return available;
+}
+
+/* Using "min_index" returned by get_available_regs(), figure out which
+ * registers are killed by a write after or during the current instruction and
+ * hence we can't use for spilling. Writes that haven't been scheduled yet
+ * should be reflected in live_physregs.
+ */
+
+static uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
+{
+   uint64_t killed = 0;
+
+   list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
+      if (instr->index <= min_index)
+         break;
+
+      for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3; 
+           slot++) {
+         if (!instr->slots[slot])
+            continue;
+
+         gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
+         if (store->node.op != gpir_op_store_reg)
+            continue;
+
+         killed |= 1ull << (4 * store->index + store->component);
+      }
+   }
+
+   return killed;
+}
+
+/* Actually spill a node so that it is no longer in the ready list. Note that
+ * this must exactly follow the logic of get_available_regs() or else the
+ * loads could fail to schedule.
+ */
+
+static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
+{
+   gpir_node_foreach_succ_safe(node, dep) {
+      if (dep->type != GPIR_DEP_INPUT)
+         continue;
+
+      gpir_node *use = dep->succ;
+      gpir_instr *instr = use->sched.instr;
+
+      if (!instr)
+         continue;
+
+      if (use->op == gpir_op_mov && instr == ctx->instr) {
+         spill_node(ctx, use, store);
+      } else {
+         gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
+         load->index = store->index;
+         load->component = store->component;
+         list_add(&load->node.list, &ctx->block->node_list);
+         gpir_node_replace_child(dep->succ, dep->pred, &load->node);
+         gpir_node_replace_pred(dep, &load->node);
+         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
+         gpir_debug("spilling use %d of node %d to load node %d\n",
+                    use->index, node->index, load->node.index);
+         MAYBE_UNUSED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
+         assert(result);
+      }
+   }
+
+   if (node->op == gpir_op_mov) {
+      /* We replaced all the uses of the move, so it's dead now. */
+      gpir_instr_remove_node(node->sched.instr, node);
+      gpir_node_delete(node);
+   } else {
+      /* We deleted all the uses of the node except the store, so it's not
+       * live anymore.
+       */
       list_del(&node->list);
-      node->sched.ready = false;
       node->sched.inserted = false;
-      gpir_node_replace_succ(move, node);
-      gpir_node_add_dep(move, node, GPIR_DEP_INPUT);
-      return move;
+      ctx->ready_list_slots--;
+      if (node->sched.max_node) {
+         node->sched.max_node = false;
+         ctx->instr->alu_num_slot_needed_by_max--;
+      }
+      if (node->sched.next_max_node) {
+         node->sched.next_max_node = false;
+         ctx->instr->alu_num_slot_needed_by_next_max--;
+      }
    }
 }
 
-static bool gpir_is_input_node(gpir_node *node)
+static bool used_by_store(gpir_node *node, gpir_instr *instr)
 {
    gpir_node_foreach_succ(node, dep) {
-      if (dep->type == GPIR_DEP_INPUT)
+      if (dep->type != GPIR_DEP_INPUT)
+         continue;
+
+      if (dep->succ->type == gpir_node_type_store &&
+          dep->succ->sched.instr == instr)
          return true;
    }
+
    return false;
 }
 
-static int gpir_get_min_scheduled_succ(gpir_node *node)
+
+
+static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
+{
+   assert(node->op != gpir_op_mov);
+
+   if (used_by_store(node, ctx->instr))
+      return false;
+
+   gpir_debug("trying to spill %d\n", node->index);
+
+   int min_instr = INT_MAX;
+   uint64_t available = get_available_regs(ctx, node, &min_instr);
+   available &= ~get_killed_regs(ctx, min_instr);
+
+   if (node->sched.physreg_store) {
+      gpir_store_node *store = node->sched.physreg_store;
+      if (!(available & (1ull << (4 * store->index + store->component))))
+         return false;
+   } else {
+      available &= ~ctx->live_physregs;
+
+      if (available == 0)
+         return false;
+
+      /* TODO: use a better heuristic for choosing an available register? */
+      int physreg = ffsll(available) - 1;
+
+      ctx->live_physregs |= (1ull << physreg);
+
+      /* TODO: when we support multiple basic blocks, there may be register
+       * loads/stores to this register other than this one that haven't been
+       * scheduled yet so we may need to insert write-after-read dependencies.
+       */
+      gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
+      store->index = physreg / 4;
+      store->component = physreg % 4;
+      store->child = node;
+      store->node.sched.max_node = false;
+      store->node.sched.next_max_node = false;
+      store->node.sched.pos = -1;
+      store->node.sched.instr = NULL;
+      store->node.sched.inserted = false;
+      store->node.sched.dist = node->sched.dist;
+      if (node->op == gpir_op_complex1) {
+         /* Complex1 cannot be directly stored, and has a latency of 2 */
+         store->node.sched.dist += 2;
+      }
+      node->sched.physreg_store = store;
+      gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
+      node->sched.ready = false;
+      schedule_insert_ready_list(ctx, &store->node);
+   }
+
+   gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
+              node->sched.physreg_store->index,
+              "xyzw"[node->sched.physreg_store->component],
+              node->sched.physreg_store->node.index);
+
+   spill_node(ctx, node, node->sched.physreg_store);
+
+   return true;
+}
+
+static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
+{
+   /* First, try to spill max nodes. */
+   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
+      if (ctx->max_node_spill_needed <= 0)
+         break;
+
+      /* orig_node is the node we're trying to schedule, so spilling it makes
+       * no sense. Also don't try to spill any nodes in front of it, since
+       * they might be scheduled instead.
+       */
+      if (node == orig_node)
+         break;
+
+      if (node->op == gpir_op_mov) {
+         /* Don't try to spill loads, since that only adds another load and
+          * store which is likely pointless.
+          */
+         continue;
+      }
+
+      if (!gpir_is_input_node(node) || !node->sched.max_node)
+         continue;
+
+      if (try_spill_node(ctx, node)) {
+         ctx->max_node_spill_needed--;
+         ctx->total_spill_needed--;
+      }
+   }
+
+   /* Now, try to spill the remaining nodes. */
+   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
+      if (ctx->total_spill_needed <= 0)
+         break;
+
+      if (node == orig_node)
+         break;
+
+      if (node->op == gpir_op_mov)
+         continue;
+
+      if (!gpir_is_input_node(node) ||
+          !(node->sched.max_node || node->sched.next_max_node))
+         continue;
+
+      if (try_spill_node(ctx, node))
+         ctx->total_spill_needed--;
+   }
+
+   return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
+}
+
+static int gpir_get_curr_ready_list_slots(sched_ctx *ctx)
+{
+   int total = 0;
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      total += gpir_get_slots_required(node);
+   }
+
+   return total;
+}
+
+/* What gpir_get_min_end() would return if node were replaced with a move
+ * instruction not in the complex slot. Normally this is 2 + min_end, except
+ * for some store instructions which must have the move node in the same
+ * instruction.
+ */
+static int gpir_get_min_end_as_move(gpir_node *node)
 {
    int min = INT_MAX;
    gpir_node_foreach_succ(node, dep) {
       gpir_node *succ = dep->succ;
-      if (succ->sched.instr >= 0 && dep->type == GPIR_DEP_INPUT) {
-         if (min > succ->sched.instr)
-            min = succ->sched.instr;
+      if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
+         switch (succ->op) {
+         case gpir_op_store_temp:
+         case gpir_op_store_reg:
+         case gpir_op_store_varying:
+            continue;
+         default:
+            break;
+         }
+         if (min > succ->sched.instr->index + 2)
+            min = succ->sched.instr->index + 2;
       }
    }
    return min;
 }
 
-static gpir_node *gpir_sched_instr_pass(gpir_instr *instr,
-                                        struct list_head *ready_list)
+/* Initialize node->sched.max_node and node->sched.next_max_node for every
+ * input node on the ready list. We should only need to do this once per
+ * instruction, at the beginning, since we never add max nodes to the ready
+ * list.
+ */
+
+static void sched_find_max_nodes(sched_ctx *ctx)
 {
-   /* fully ready node reach its max dist with any of its successor */
-   list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-      if (node->sched.ready) {
-         int end = gpir_get_min_end(node);
-         assert(end >= instr->index);
-         if (instr->index < end)
-            continue;
+   ctx->instr->alu_num_slot_needed_by_next_max = -5;
+   ctx->instr->alu_num_slot_needed_by_max = 0;
 
-         gpir_debug("fully ready max node %d\n", node->index);
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (!gpir_is_input_node(node))
+         continue;
 
-         if (schedule_try_place_node(instr, node))
-            return node;
+      int min_end_move = gpir_get_min_end_as_move(node);
+      node->sched.max_node = (min_end_move == ctx->instr->index);
+      node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
 
-         return gpir_sched_node(instr, node);
-      }
+      if (node->sched.max_node)
+         ctx->instr->alu_num_slot_needed_by_max++;
+      if (node->sched.next_max_node)
+         ctx->instr->alu_num_slot_needed_by_next_max++;
    }
+}
 
-   /* partially ready node reach its max dist with any of its successor */
-   list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-      if (!node->sched.ready) {
-         int end = gpir_get_min_end(node);
-         assert(end >= instr->index);
-         if (instr->index < end)
-            continue;
+/* Verify the invariants described in gpir.h, as well as making sure the
+ * counts are correct.
+ */
+static void verify_max_nodes(sched_ctx *ctx)
+{
+   int alu_num_slot_needed_by_max = 0;
+   int alu_num_slot_needed_by_next_max = -5;
+   int alu_num_slot_needed_by_store = 0;
 
-         gpir_debug("partially ready max node %d\n", node->index);
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (!gpir_is_input_node(node))
+         continue;
 
-         return gpir_sched_node(instr, node);
-      }
+      if (node->sched.max_node)
+         alu_num_slot_needed_by_max++;
+      if (node->sched.next_max_node)
+         alu_num_slot_needed_by_next_max++;
+      if (used_by_store(node, ctx->instr))
+         alu_num_slot_needed_by_store++;
    }
 
-   /* schedule node used by previous instr when count > 5 */
-   int count = 0;
-   gpir_node *two_slot_node = NULL;
-   list_for_each_entry(gpir_node, node, ready_list, list) {
-      if (gpir_is_input_node(node)) {
-         int min = gpir_get_min_scheduled_succ(node);
-         assert(min >= instr->index - 1);
-         if (min == instr->index - 1) {
-            if (gpir_op_infos[node->op].may_consume_two_slots) {
-               two_slot_node = node;
-               count += 2;
+   assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
+   assert(ctx->instr->alu_num_slot_needed_by_next_max == alu_num_slot_needed_by_next_max);
+   assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
+   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + alu_num_slot_needed_by_next_max);
+   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max);
+}
+
+static bool try_node(sched_ctx *ctx)
+{
+   gpir_node *best_node = NULL;
+   int best_score = INT_MIN;
+
+   /* Spilling will delete arbitrary nodes after the current one in the ready
+    * list, which means that we always need to look up the next node in the
+    * list at the end of each iteration. While list_for_each_entry() works for
+    * this purpose, its sanity checking assumes that you don't want to modify
+    * the list at all. We know better here, so we have to open-code
+    * list_for_each_entry() without the check in order to not assert.
+    */
+   for (gpir_node *node = LIST_ENTRY(gpir_node, ctx->ready_list.next, list);
+        &node->list != &ctx->ready_list;
+        node = LIST_ENTRY(gpir_node, node->list.next, list)) {
+      if (best_score != INT_MIN) {
+         if (node->sched.dist < best_node->sched.dist)
+            break;
+      }
+
+      if (node->sched.ready) {
+         ctx->total_spill_needed = 0;
+         ctx->max_node_spill_needed = 0;
+         int score = schedule_try_node(ctx, node, true);
+         if (score == INT_MIN) {
+            if (ctx->total_spill_needed > 0 &&
+                try_spill_nodes(ctx, node)) {
+               score = schedule_try_node(ctx, node, true);
+               if (score == INT_MIN)
+                  continue;
             }
-            else
-               count++;
+         }
+
+         if (score > best_score) {
+            best_score = score;
+            best_node = node;
          }
       }
    }
 
-   if (count > 5) {
-      /* When no slot avaible, must schedule a move for two slot node
-       * to reduce the count. This results from the dummy_m/f method.
-       */
-      if (gpir_instr_alu_slot_is_full(instr)) {
-         assert(two_slot_node);
-         gpir_debug("instr is full, schedule move node for two slot node %d\n",
-                    two_slot_node->index);
-         return gpir_sched_node(instr, two_slot_node);
-      }
-
-      /* schedule fully ready node first */
-      list_for_each_entry(gpir_node, node, ready_list, list) {
-         if (gpir_is_input_node(node)) {
-            int min = gpir_get_min_scheduled_succ(node);
-            if (min == instr->index - 1 && node->sched.ready) {
-               gpir_debug(">5 ready node %d\n", node->index);
-
-               if (schedule_try_place_node(instr, node))
-                  return node;
-            }
-         }
+   if (best_node) {
+      gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
+                 best_score, best_node->sched.max_node ? " (max)" : "");
+      MAYBE_UNUSED int score = schedule_try_node(ctx, best_node, false);
+      assert(score != INT_MIN);
+      return true;
+   }
+
+   return false;
+}
+
+static void place_move(sched_ctx *ctx, gpir_node *node)
+{
+   gpir_node *move = create_move(ctx, node);
+   gpir_node_foreach_succ_safe(move, dep) {
+      gpir_node *succ = dep->succ;
+      if (!succ->sched.instr ||
+          ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
+         gpir_node_replace_pred(dep, node);
+         if (dep->type == GPIR_DEP_INPUT)
+            gpir_node_replace_child(succ, move, node);
       }
+   }
+   MAYBE_UNUSED int score = schedule_try_node(ctx, move, false);
+   assert(score != INT_MIN);
+}
 
-      /* no fully ready node be scheduled, schedule partially ready node */
-      list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-         if (gpir_is_input_node(node)) {
-            int min = gpir_get_min_scheduled_succ(node);
-            if (min == instr->index - 1 && !node->sched.ready) {
-               gpir_debug(">5 partially ready node %d\n", node->index);
+static bool sched_move(sched_ctx *ctx)
+{
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (node->sched.max_node) {
+         place_move(ctx, node);
+         return true;
+      }
+   }
 
-               return gpir_sched_node(instr, node);
+   if (ctx->instr->alu_num_slot_needed_by_store > 0) {
+      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (used_by_store(node, ctx->instr)) {
+            place_move(ctx, node);
+            /* If we have a store of a load, then we need to make sure that we
+             * immediately schedule the dependent load, or create a move
+             * instruction for it, like we would with a normal instruction.
+             * The rest of the code isn't set up to handle load nodes in the
+             * ready list -- see the comments in _schedule_try_node().
+             */
+            if (node->type == gpir_node_type_load) {
+               if (!schedule_try_place_node(ctx, node, false)) {
+                  create_move(ctx, node);
+               }
             }
+            return true;
          }
       }
+   }
 
-      /* finally schedule move for fully ready node */
-      list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-         if (gpir_is_input_node(node)) {
-            int min = gpir_get_min_scheduled_succ(node);
-            if (min == instr->index - 1 && node->sched.ready) {
-               gpir_debug(">5 fully ready move node %d\n", node->index);
-
-               return gpir_sched_node(instr, node);
+   /* complex1 is a bit a special case, since it has a latency of 2 cycles.
+    * Once it is fully ready, we need to group all its uses in the same
+    * instruction, and then we need to avoid creating any moves in the next
+    * cycle in order to get it scheduled. Failing to do any of these things
+    * could result in a cycle penalty, or even worse, an infinite loop of
+    * inserting moves. If it is a next-max node and ready, then it has a use
+    * in the previous cycle. If it has a use in the current cycle as well,
+    * then we want to insert a move node to make it ready in two cycles -- if
+    * we don't, then there will be at least a one cycle penalty. Otherwise, it
+    * will be ready next cycle, and we shouldn't insert a move node, or else
+    * we'll also have a one cycle penalty. 
+    */
+   if (ctx->instr->alu_num_slot_free > 0) {
+      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
+             node->sched.ready) {
+            bool skip = true;
+            gpir_node_foreach_succ(node, dep) {
+               if (dep->type != GPIR_DEP_INPUT)
+                  continue;
+
+               gpir_node *succ = dep->succ;
+
+               if (!succ->sched.instr ||
+                   succ->sched.instr->index != ctx->instr->index - 1) {
+                  skip = false;
+                  break;
+               }
             }
+
+            if (skip)
+               continue;
+
+            place_move(ctx, node);
+            return true;
          }
       }
    }
 
-   /* schedule remain fully ready nodes */
-   list_for_each_entry(gpir_node, node, ready_list, list) {
-      if (node->sched.ready) {
-         gpir_debug("remain fully ready node %d\n", node->index);
+   /* Once we've made all the required moves, we're free to use any extra
+    * slots to schedule more moves for next max nodes. Besides sometimes being
+    * necessary, this can free up extra space in the next instruction. We walk
+    * from back to front so that we pick nodes less likely to be scheduled
+    * next first -- an extra move would be unnecessary there. But make sure
+    * not to handle the complex1 case handled above.
+    */
+   if (ctx->instr->alu_num_slot_free > 0) {
+      list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
+         if (node->sched.next_max_node &&
+             !(node->op == gpir_op_complex1 && node->sched.ready)) {
+            place_move(ctx, node);
+            return true;
+         }
+      }
+   }
 
-         if (schedule_try_place_node(instr, node))
-            return node;
+   /* We may have skipped complex1 above, but if we run out of space, we still
+    * need to insert the move.
+    */
+
+   if (ctx->instr->alu_num_slot_needed_by_next_max > 0) {
+      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (node->sched.next_max_node) {
+            place_move(ctx, node);
+            return true;
+         }
       }
    }
 
-   return NULL;
+
+   return false;
 }
 
-static void schedule_print_pre_one_instr(gpir_instr *instr,
-                                         struct list_head *ready_list)
+static bool gpir_sched_instr_pass(sched_ctx *ctx)
+{
+   if (try_node(ctx))
+      return true;
+
+   if (sched_move(ctx))
+      return true;
+
+   return false;
+}
+
+static void schedule_print_pre_one_instr(sched_ctx *ctx)
 {
    if (!(lima_debug & LIMA_DEBUG_GP))
       return;
 
-   printf("instr %d for ready list:", instr->index);
-   list_for_each_entry(gpir_node, node, ready_list, list) {
-      printf(" %d/%c", node->index, node->sched.ready ? 'r' : 'p');
+   printf("instr %d for ready list:", ctx->instr->index);
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
+             node->sched.dist, gpir_get_slots_required(node),
+             node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
+   }
+   printf("\nlive physregs: ");
+   for (unsigned i = 0; i < 16; i++) {
+      if (ctx->live_physregs & (0xfull << (4 * i))) {
+         printf("$%d.", i);
+         for (unsigned j = 0; j < 4; j++) {
+            if (ctx->live_physregs & (1ull << (4 * i + j)))
+               printf("%c", "xyzw"[j]);
+         }
+         printf(" ");
+      }
    }
    printf("\n");
 }
@@ -569,30 +1359,23 @@ static void schedule_print_post_one_instr(gpir_instr *instr)
 }
 
 
-static bool schedule_one_instr(gpir_block *block, struct list_head *ready_list)
+static bool schedule_one_instr(sched_ctx *ctx)
 {
-   gpir_instr *instr = gpir_instr_create(block);
+   gpir_instr *instr = gpir_instr_create(ctx->block);
    if (unlikely(!instr))
       return false;
 
-   schedule_print_pre_one_instr(instr, ready_list);
-
-   while (true) {
-      gpir_node *node = gpir_sched_instr_pass(instr, ready_list);
-      if (!node)
-         break;
+   ctx->instr = instr;
 
-      if (node->sched.instr < 0)
-         schedule_insert_ready_list(ready_list, node);
-      else {
-         list_del(&node->list);
-         list_add(&node->list, &block->node_list);
+   sched_find_max_nodes(ctx);
+   schedule_print_pre_one_instr(ctx);
 
-         gpir_node_foreach_pred(node, dep) {
-            gpir_node *pred = dep->pred;
-            schedule_insert_ready_list(ready_list, pred);
-         }
-      }
+   while (gpir_sched_instr_pass(ctx)) {
+      assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
+#ifndef NDEBUG
+      verify_max_nodes(ctx);
+      verify_ready_list(ctx);
+#endif
    }
 
    schedule_print_post_one_instr(instr);
@@ -607,45 +1390,33 @@ static bool schedule_block(gpir_block *block)
          schedule_update_distance(node);
    }
 
-   struct list_head ready_list;
-   list_inithead(&ready_list);
+   sched_ctx ctx;
+   list_inithead(&ctx.ready_list);
+   ctx.block = block;
+   ctx.ready_list_slots = 0;
+   /* TODO initialize with block live out once we have proper liveness
+    * tracking
+    */
+   ctx.live_physregs = 0;
 
    /* construct the ready list from root nodes */
    list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
       if (gpir_node_is_root(node))
-         schedule_insert_ready_list(&ready_list, node);
+         schedule_insert_ready_list(&ctx, node);
    }
 
    list_inithead(&block->node_list);
-   while (!list_empty(&ready_list)) {
-      if (!schedule_one_instr(block, &ready_list))
+   while (!list_empty(&ctx.ready_list)) {
+      if (!schedule_one_instr(&ctx))
          return false;
    }
 
    return true;
 }
 
-static void schedule_build_vreg_dependency(gpir_block *block)
+static void schedule_build_dependency(gpir_block *block)
 {
-   gpir_node *regs[GPIR_VALUE_REG_NUM] = {0};
-   list_for_each_entry(gpir_node, node, &block->node_list, list) {
-      /* store node has no value reg assigned */
-      if (node->value_reg < 0)
-         continue;
-
-      gpir_node *reg = regs[node->value_reg];
-      if (reg) {
-         gpir_node_foreach_succ(reg, dep) {
-            /* write after read dep should only apply to real 'read' */
-            if (dep->type != GPIR_DEP_INPUT)
-               continue;
-
-            gpir_node *succ = dep->succ;
-            gpir_node_add_dep(node, succ, GPIR_DEP_VREG_WRITE_AFTER_READ);
-         }
-      }
-      regs[node->value_reg] = node;
-   }
+   gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};
 
    /* merge dummy_f/m to the node created from */
    list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
@@ -666,76 +1437,49 @@ static void schedule_build_vreg_dependency(gpir_block *block)
          gpir_node_delete(node);
       }
    }
-}
 
-static void schedule_build_preg_dependency(gpir_compiler *comp)
-{
-   /* merge reg with the same index */
-   gpir_reg *regs[GPIR_VALUE_REG_NUM] = {0};
-   list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) {
-      if (!regs[reg->index])
-         regs[reg->index] = reg;
-      else {
-         list_splicetail(&reg->defs_list, &regs[reg->index]->defs_list);
-         list_splicetail(&reg->uses_list, &regs[reg->index]->uses_list);
+   /* Forward dependencies. We only need to add these for register loads,
+    * since value registers already have an input dependency.
+    */
+   list_for_each_entry(gpir_node, node, &block->node_list, list) {
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         unsigned index = 4 * load->index + load->component;
+         if (last_written[index]) {
+            gpir_node_add_dep(node, last_written[index], GPIR_DEP_READ_AFTER_WRITE);
+         }
       }
+
+      if (node->value_reg >= 0)
+         last_written[node->value_reg] = node;
    }
 
-   /* calculate physical reg read/write dependency for load/store nodes */
-   for (int i = 0; i < GPIR_VALUE_REG_NUM; i++) {
-      gpir_reg *reg = regs[i];
-      if (!reg)
-         continue;
+   memset(last_written, 0, sizeof(last_written));
 
-      /* sort reg write */
-      struct list_head tmp_list;
-      list_replace(&reg->defs_list, &tmp_list);
-      list_inithead(&reg->defs_list);
-      list_for_each_entry_safe(gpir_store_node, store, &tmp_list, reg_link) {
-         struct list_head *insert_pos = &reg->defs_list;
-         list_for_each_entry(gpir_store_node, st, &reg->defs_list, reg_link) {
-            if (st->node.sched.index > store->node.sched.index) {
-               insert_pos = &st->reg_link;
-               break;
-            }
+   /* False dependencies. For value registers, these exist only to make sure
+    * that the maximum pressure isn't exceeded and are hence "fake".
+    */
+   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         unsigned index = 4 * load->index + load->component;
+         if (last_written[index]) {
+            gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ);
          }
-         list_del(&store->reg_link);
-         list_addtail(&store->reg_link, insert_pos);
-      }
-
-      /* sort reg read */
-      list_replace(&reg->uses_list, &tmp_list);
-      list_inithead(&reg->uses_list);
-      list_for_each_entry_safe(gpir_load_node, load, &tmp_list, reg_link) {
-         struct list_head *insert_pos = &reg->uses_list;
-         list_for_each_entry(gpir_load_node, ld, &reg->uses_list, reg_link) {
-            if (ld->node.sched.index > load->node.sched.index) {
-               insert_pos = &ld->reg_link;
-               break;
+      } else {
+         gpir_node_foreach_pred(node, dep) {
+            if (dep->type == GPIR_DEP_INPUT) {
+               int index = dep->pred->value_reg;
+               if (index >= 0 && last_written[index]) {
+                  gpir_node_add_dep(last_written[index], node,
+                                    GPIR_DEP_WRITE_AFTER_READ);
+               }
             }
          }
-         list_del(&load->reg_link);
-         list_addtail(&load->reg_link, insert_pos);
-      }
-
-      /* insert dependency */
-      gpir_store_node *store =
-         list_first_entry(&reg->defs_list, gpir_store_node, reg_link);
-      gpir_store_node *next = store->reg_link.next != &reg->defs_list ?
-         list_first_entry(&store->reg_link, gpir_store_node, reg_link) : NULL;
-
-      list_for_each_entry(gpir_load_node, load, &reg->uses_list, reg_link) {
-         /* loop until load is between store and next */
-         while (next && next->node.sched.index < load->node.sched.index) {
-            store = next;
-            next = store->reg_link.next != &reg->defs_list ?
-               list_first_entry(&store->reg_link, gpir_store_node, reg_link) : NULL;
-         }
-
-         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
-         if (next)
-            gpir_node_add_dep(&next->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
       }
+
+      if (node->value_reg >= 0)
+         last_written[node->value_reg] = node;
    }
 }
 
@@ -792,20 +1536,25 @@ bool gpir_schedule_prog(gpir_compiler *comp)
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
       block->sched.instr_index = 0;
       list_for_each_entry(gpir_node, node, &block->node_list, list) {
-         node->sched.instr = -1;
+         node->sched.instr = NULL;
          node->sched.pos = -1;
          node->sched.index = index++;
          node->sched.dist = -1;
+         /* TODO when we support multiple basic blocks, we need a way to keep
+          * track of this for physregs allocated before the scheduler.
+          */
+         node->sched.physreg_store = NULL;
          node->sched.ready = false;
          node->sched.inserted = false;
+         node->sched.max_node = false;
+         node->sched.next_max_node = false;
       }
    }
 
-   /* build fake/virtual dependency */
+   /* build dependency */
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
-      schedule_build_vreg_dependency(block);
+      schedule_build_dependency(block);
    }
-   schedule_build_preg_dependency(comp);
 
    //gpir_debug("after scheduler build reg dependency\n");
    //gpir_node_print_prog_dep(comp);
diff --git a/src/gallium/drivers/lima/meson.build b/src/gallium/drivers/lima/meson.build
index 1db8c7ea8f9..0f13b9c7272 100644
--- a/src/gallium/drivers/lima/meson.build
+++ b/src/gallium/drivers/lima/meson.build
@@ -29,8 +29,7 @@ files_lima = files(
   'ir/gp/codegen.h',
   'ir/gp/codegen.c',
   'ir/gp/reduce_scheduler.c',
-  'ir/gp/value_regalloc.c',
-  'ir/gp/physical_regalloc.c',
+  'ir/gp/regalloc.c',
   'ir/gp/disasm.c',
 
   'ir/pp/ppir.h',
-- 
2.30.2