From: Eric Anholt
Date: Tue, 19 Feb 2019 17:30:52 +0000 (-0800)
Subject: nir: Add a scheduler pass to reduce maximum register pressure.
X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=8afab607ac37871771cd75ac9dfdaea3bea65d25;hp=5159db60fccf17b00b8da4257dac027b6e2327ef

nir: Add a scheduler pass to reduce maximum register pressure.

This is similar to a scheduler I've written for vc4 and i965, but this
time written at the NIR level so that hopefully it's reusable.  A notable
new feature is Goodman/Hsu's heuristic of "once we've started processing
the uses of a value, prioritize processing the rest of its uses", which
should keep the heuristic from otherwise making systematically bad
choices around getting texture results consumed.

Results for v3d:

total instructions in shared programs: 6497588 -> 6518242 (0.32%)
total threads in shared programs: 154000 -> 152828 (-0.76%)
total uniforms in shared programs: 2119629 -> 2068681 (-2.40%)
total spills in shared programs: 4984 -> 472 (-90.53%)
total fills in shared programs: 6418 -> 1546 (-75.91%)

Acked-by: Alyssa Rosenzweig (v1)
Reviewed-by: Alejandro Piñeiro (v2)

v2: Use the DAG datastructure, fold in the scheduling-for-parallelism
    patch, include SSA defs in live values so we can switch to bottom-up
    if we want.
v3: Squash in improvements from Alejandro Piñeiro for getting V3D to
    successfully register allocate on GLES3.1 dEQP.  Make sure that
    discards don't move after store_output.  Comment spelling fix.
---

diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 12009002f9b..340cda903e9 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -960,6 +960,11 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, NIR_PASS_V(c->s, nir_lower_bool_to_int32); NIR_PASS_V(c->s, nir_convert_from_ssa, true); + /* Schedule for about half our register space, to enable more shaders * to hit 4 threads.
+ */ + NIR_PASS_V(c->s, nir_schedule, 24); + v3d_nir_to_vir(c); v3d_set_prog_data(c, prog_data); diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources index 0ca94d41d29..e13a19c749e 100644 --- a/src/compiler/Makefile.sources +++ b/src/compiler/Makefile.sources @@ -326,6 +326,7 @@ NIR_FILES = \ nir/nir_range_analysis.h \ nir/nir_remove_dead_variables.c \ nir/nir_repair_ssa.c \ + nir/nir_schedule.c \ nir/nir_search.c \ nir/nir_search.h \ nir/nir_search_helpers.h \ diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build index ee197ea74fb..a5a3e90c8a4 100644 --- a/src/compiler/nir/meson.build +++ b/src/compiler/nir/meson.build @@ -209,6 +209,7 @@ files_libnir = files( 'nir_range_analysis.h', 'nir_remove_dead_variables.c', 'nir_repair_ssa.c', + 'nir_schedule.c', 'nir_search.c', 'nir_search.h', 'nir_search_helpers.h', diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 8b5e777246a..dca4b8f82b8 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -4201,6 +4201,10 @@ typedef bool (*nir_should_vectorize_mem_func)(unsigned align, unsigned bit_size, bool nir_opt_load_store_vectorize(nir_shader *shader, nir_variable_mode modes, nir_should_vectorize_mem_func callback); +void nir_schedule(nir_shader *shader, int threshold); + +void nir_strip(nir_shader *shader); + void nir_sweep(nir_shader *shader); void nir_remap_dual_slot_attributes(nir_shader *shader, diff --git a/src/compiler/nir/nir_schedule.c b/src/compiler/nir/nir_schedule.c new file mode 100644 index 00000000000..0ad95a5bc89 --- /dev/null +++ b/src/compiler/nir/nir_schedule.c @@ -0,0 +1,1087 @@ +/* + * Copyright © 2019 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "util/dag.h" +#include "util/u_dynarray.h" + +/** @file + * + * Implements basic-block-level prepass instruction scheduling in NIR to + * manage register pressure. + * + * This is based on the Goodman/Hsu paper (1988, cached copy at + * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We + * make up the DDG for NIR (which can be mostly done using the NIR def/use + * chains for SSA instructions, plus some edges for ordering register writes + * vs reads, and some more for ordering intrinsics). Then we pick heads off + * of the DDG using their heuristic to emit the NIR instructions back into the + * block in their new order. 
+ * + * The hard case for prepass scheduling on GPUs seems to always be consuming + * texture/ubo results. The register pressure heuristic doesn't want to pick + * an instr that starts consuming texture results because it usually won't be + * the only usage, so that instruction will increase pressure. + * + * If you try to force consumption of tex results always, then in a case where + * a single sample is used for many outputs, you'll end up picking every other + * user and expanding register pressure. The partially_evaluated_path flag + * helps tremendously, in that if you happen for whatever reason to pick a + * texture sample's output, then you'll try to finish off that sample. Future + * work may include doing some local search before locking in a choice, to try + * to more reliably find the case where just a few choices going against the + * heuristic can manage to free the whole vector. + */ + +static bool debug; + +/** + * Represents a node in the DDG for a NIR instruction. + */ +typedef struct { + struct dag_node dag; /* must be first for our u_dynarray_foreach */ + nir_instr *instr; + bool partially_evaluated_path; + + /* Approximate estimate of the delay between starting this instruction and + * its results being available. + * + * Accuracy is not too important, given that we're prepass scheduling here + * and just trying to reduce excess dependencies introduced by a register + * allocator by stretching out the live intervals of expensive + * instructions. + */ + uint32_t delay; + + /* Cost of the maximum-delay path from this node to the leaves. */ + uint32_t max_delay; + + /* scoreboard->time value when this instruction can be scheduled without + * any stalls expected. + */ + uint32_t ready_time; +} nir_schedule_node; + +typedef struct { + struct dag *dag; + + nir_shader *shader; + + /* Mapping from nir_register * or nir_ssa_def * to a struct set of + * instructions remaining to be scheduled using the register. + */ + struct hash_table *remaining_uses; + + /* Map from nir_instr to nir_schedule_node * */ + struct hash_table *instr_map; + + /* Set of nir_register * or nir_ssa_def * that have had any instruction + * scheduled on them. + */ + struct set *live_values; + + /* An abstract approximation of the number of nir_schedule_node->delay + * units since the start of the shader. + */ + uint32_t time; + + /* Number of channels currently used by the NIR instructions that have been + * scheduled. + */ + int pressure; + + /* Number of channels that may be in use before we switch to the + * pressure-prioritizing scheduling heuristic. + */ + int threshold; +} nir_schedule_scoreboard; + +/* When walking the instructions in reverse, we use this flag to swap + * before/after in add_dep(). + */ +enum direction { F, R }; + +typedef struct { + nir_shader *shader; + + /* Map from nir_instr to nir_schedule_node * */ + struct hash_table *instr_map; + /* Map from nir_register to nir_schedule_node * */ + struct hash_table *reg_map; + + /* Scheduler nodes for the last instruction involved in some class of dependency.
+ */ + nir_schedule_node *load_input; + nir_schedule_node *store_shared; + nir_schedule_node *unknown_intrinsic; + nir_schedule_node *discard; + nir_schedule_node *jump; + + enum direction dir; +} nir_deps_state; + +static void * +_mesa_hash_table_search_data(struct hash_table *ht, void *key) +{ + struct hash_entry *entry = _mesa_hash_table_search(ht, key); + if (!entry) + return NULL; + return entry->data; +} + +static nir_schedule_node * +nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr) +{ + return _mesa_hash_table_search_data(instr_map, instr); +} + +static struct set * +nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src) +{ + if (src->is_ssa) { + return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa); + } else { + return _mesa_hash_table_search_data(scoreboard->remaining_uses, + src->reg.reg); + } +} + +static int +nir_schedule_def_pressure(nir_ssa_def *def) +{ + return def->num_components; +} + +static int +nir_schedule_src_pressure(nir_src *src) +{ + if (src->is_ssa) + return nir_schedule_def_pressure(src->ssa); + else + return src->reg.reg->num_components; +} + +static int +nir_schedule_dest_pressure(nir_dest *dest) +{ + if (dest->is_ssa) + return nir_schedule_def_pressure(&dest->ssa); + else + return dest->reg.reg->num_components; +} + +/** + * Adds a dependency such that @after must appear in the final program after + * @before. + * + * We add @before as a child of @after, so that DAG heads are the outputs of + * the program and we make our scheduling decisions bottom to top. + */ +static void +add_dep(nir_deps_state *state, + nir_schedule_node *before, + nir_schedule_node *after) +{ + if (!before || !after) + return; + + assert(before != after); + + if (state->dir == F) + dag_add_edge(&before->dag, &after->dag, NULL); + else + dag_add_edge(&after->dag, &before->dag, NULL); +} + + +static void +add_read_dep(nir_deps_state *state, + nir_schedule_node *before, + nir_schedule_node *after) +{ + add_dep(state, before, after); +} + +static void +add_write_dep(nir_deps_state *state, + nir_schedule_node **before, + nir_schedule_node *after) +{ + add_dep(state, *before, after); + *before = after; +} + +static bool +nir_schedule_reg_src_deps(nir_src *src, void *in_state) +{ + nir_deps_state *state = in_state; + + if (src->is_ssa) + return true; + + struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, + src->reg.reg); + if (!entry) + return true; + nir_schedule_node *dst_n = entry->data; + + nir_schedule_node *src_n = nir_schedule_get_node(state->instr_map, + src->parent_instr); + + add_dep(state, dst_n, src_n); + + return true; +} + +static bool +nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state) +{ + nir_deps_state *state = in_state; + + if (dest->is_ssa) + return true; + + nir_schedule_node *dest_n = nir_schedule_get_node(state->instr_map, + dest->reg.parent_instr); + + struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, + dest->reg.reg); + if (!entry) { + _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n); + return true; + } + nir_schedule_node **before = (nir_schedule_node **)&entry->data; + + add_write_dep(state, before, dest_n); + + return true; +} + +static bool +nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state) +{ + nir_deps_state *state = in_state; + nir_schedule_node *def_n = nir_schedule_get_node(state->instr_map, def->parent_instr); + + nir_foreach_use(src, def) { + nir_schedule_node *use_n = nir_schedule_get_node(state->instr_map, + src->parent_instr); + 
+ add_read_dep(state, def_n, use_n); + } + + return true; +} + +static void +nir_schedule_intrinsic_deps(nir_deps_state *state, + nir_intrinsic_instr *instr) +{ + nir_schedule_node *n = nir_schedule_get_node(state->instr_map, &instr->instr); + + switch (instr->intrinsic) { + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_front_face: + break; + + case nir_intrinsic_discard: + case nir_intrinsic_discard_if: + /* We are adding two dependencies: + * + * * An individual one that we could use to add a read_dep while handling + * nir_instr_type_tex + * + * * Include it on the unknown intrinsic set, as we want discard to be + * serialized in the same order relative to intervening stores or + * atomic accesses to SSBOs and images + */ + add_write_dep(state, &state->discard, n); + add_write_dep(state, &state->unknown_intrinsic, n); + break; + + case nir_intrinsic_store_output: + /* For some non-FS shader stages, or for some hardware, output stores + * affect the same shared memory as input loads. + */ + if (state->shader->info.stage != MESA_SHADER_FRAGMENT) + add_write_dep(state, &state->load_input, n); + + /* Make sure that preceding discards stay before the store_output */ + add_read_dep(state, state->discard, n); + + break; + + case nir_intrinsic_load_input: + add_read_dep(state, state->load_input, n); + break; + + case nir_intrinsic_load_shared: + /* Don't move load_shared beyond a following store_shared, as it could + * change the loaded value + */ + add_read_dep(state, state->store_shared, n); + break; + + case nir_intrinsic_store_shared: + add_write_dep(state, &state->store_shared, n); + break; + + case nir_intrinsic_barrier: + case nir_intrinsic_memory_barrier_shared: + add_write_dep(state, &state->store_shared, n); + + /* Serialize against ssbos/atomics/etc. */ + add_write_dep(state, &state->unknown_intrinsic, n); + break; + + default: + /* Attempt to handle other intrinsics that we haven't individually + * categorized by serializing them in the same order relative to each + * other. + */ + add_write_dep(state, &state->unknown_intrinsic, n); + break; + } +} + +/** + * Common code for dependencies that need to be tracked both forward and + * backward. + * + * This is for things like "all reads of r4 have to happen between the r4 + * writes that surround them". + */ +static void +nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n) +{ + nir_instr *instr = n->instr; + + /* For NIR SSA defs, we only need to do a single pass of making the uses + * depend on the def. + */ + if (state->dir == F) + nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state); + + /* For NIR regs, track the last writer in the scheduler state so that we + * can keep the writes in order and let reads get reordered only between + * each write. + */ + nir_foreach_src(instr, nir_schedule_reg_src_deps, state); + + nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state); + + /* Make sure any other instructions keep their positions relative to + * jumps. + */ + if (instr->type != nir_instr_type_jump) + add_read_dep(state, state->jump, n); + + switch (instr->type) { + case nir_instr_type_ssa_undef: + case nir_instr_type_load_const: + case nir_instr_type_alu: + case nir_instr_type_deref: + break; + + case nir_instr_type_tex: + /* Don't move texture ops before a discard, as that could increase + * memory bandwidth for reading the discarded samples.
+ */ + add_read_dep(state, state->discard, n); + break; + + case nir_instr_type_jump: + add_write_dep(state, &state->jump, n); + break; + + case nir_instr_type_call: + unreachable("Calls should have been lowered"); + break; + + case nir_instr_type_parallel_copy: + unreachable("Parallel copies should have been lowered"); + break; + + case nir_instr_type_phi: + unreachable("nir_schedule() should be called after lowering from SSA"); + break; + + case nir_instr_type_intrinsic: + nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr)); + break; + } +} + +static void +calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) +{ + nir_deps_state state = { + .shader = scoreboard->shader, + .dir = F, + .instr_map = scoreboard->instr_map, + .reg_map = _mesa_pointer_hash_table_create(NULL), + }; + + nir_foreach_instr(instr, block) { + nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, + instr); + nir_schedule_calculate_deps(&state, node); + } + + ralloc_free(state.reg_map); +} + +static void +calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) +{ + nir_deps_state state = { + .shader = scoreboard->shader, + .dir = R, + .instr_map = scoreboard->instr_map, + .reg_map = _mesa_pointer_hash_table_create(NULL), + }; + + nir_foreach_instr_reverse(instr, block) { + nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, + instr); + nir_schedule_calculate_deps(&state, node); + } + + ralloc_free(state.reg_map); +} + +typedef struct { + nir_schedule_scoreboard *scoreboard; + int regs_freed; +} nir_schedule_regs_freed_state; + +static bool +nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state) +{ + nir_schedule_regs_freed_state *state = in_state; + nir_schedule_scoreboard *scoreboard = state->scoreboard; + struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); + + if (remaining_uses->entries == 1 && + _mesa_set_search(remaining_uses, src->parent_instr)) { + state->regs_freed += nir_schedule_src_pressure(src); + } + + return true; +} + +static bool +nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state) +{ + nir_schedule_regs_freed_state *state = in_state; + + state->regs_freed -= nir_schedule_def_pressure(def); + + return true; +} + +static bool +nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state) +{ + nir_schedule_regs_freed_state *state = in_state; + nir_schedule_scoreboard *scoreboard = state->scoreboard; + + if (dest->is_ssa) + return true; + + nir_register *reg = dest->reg.reg; + + /* Only the first def of a reg counts against register pressure. */ + if (!_mesa_set_search(scoreboard->live_values, reg)) + state->regs_freed -= nir_schedule_dest_pressure(dest); + + return true; +} + +static int +nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n) +{ + nir_schedule_regs_freed_state state = { + .scoreboard = scoreboard, + }; + + nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state); + + nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state); + + nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state); + + return state.regs_freed; +} + +/** + * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code + * Scheduling for Parallelism) heuristic. + * + * Picks an instruction on the critical path that's ready to execute without + * stalls, if possible, otherwise picks the instruction on the critical path.
+ */ +static nir_schedule_node * +nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard) +{ + nir_schedule_node *chosen = NULL; + + /* Find the leader in the ready (shouldn't-stall) set with the maximum + * cost. + */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (scoreboard->time < n->ready_time) + continue; + + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (ready): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Otherwise, choose the leader with the maximum cost. */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + if (debug) { + fprintf(stderr, "chose (leader): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; +} + +/** + * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code + * Scheduling for Register pressure) heuristic. + */ +static nir_schedule_node * +nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard) +{ + nir_schedule_node *chosen = NULL; + + /* Find a ready inst with regs freed and pick the one with max cost. */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (n->ready_time > scoreboard->time) + continue; + + int regs_freed = nir_schedule_regs_freed(scoreboard, n); + + if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { + chosen = n; + } + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (freed+ready): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Find a leader with regs freed and pick the one with max cost. */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + int regs_freed = nir_schedule_regs_freed(scoreboard, n); + + if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { + chosen = n; + } + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (regs freed): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Find a partially evaluated path and try to finish it off */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (n->partially_evaluated_path && + (!chosen || chosen->max_delay < n->max_delay)) { + chosen = n; + } + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (partial path): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Contra the paper, pick a leader with no effect on used regs. This may + * open up new opportunities, as otherwise a single-operand instr consuming + * a value will tend to block us from freeing that value. This had a + * massive effect on reducing spilling on V3D. + * + * XXX: Should this prioritize ready? + */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (nir_schedule_regs_freed(scoreboard, n) != 0) + continue; + + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (regs no-op): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Pick the max delay of the remaining ready set.
*/ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (n->ready_time > scoreboard->time) + continue; + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (ready max delay): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + return chosen; + } + + /* Pick the max delay of the remaining leaders. */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + + if (debug) { + fprintf(stderr, "chose (max delay): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; +} + +static void +dump_state(nir_schedule_scoreboard *scoreboard) +{ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + fprintf(stderr, "maxdel %5d ", n->max_delay); + nir_print_instr(n->instr, stderr); + fprintf(stderr, "\n"); + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + nir_schedule_node *child = (nir_schedule_node *)edge->child; + + fprintf(stderr, " -> (%d parents) ", child->dag.parent_count); + nir_print_instr(child->instr, stderr); + fprintf(stderr, "\n"); + } + } +} + +static void +nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard, + void *reg_or_def, + nir_instr *reg_or_def_parent, + int pressure) +{ + /* Make the value live if it's the first time it's been used. */ + if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) { + _mesa_set_add(scoreboard->live_values, reg_or_def); + scoreboard->pressure += pressure; + } + + /* Make the value dead if it's the last remaining use. Be careful when one + * instruction uses a value twice to not decrement pressure twice. + */ + struct set *remaining_uses = + _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def); + struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent); + if (entry) { + _mesa_set_remove(remaining_uses, entry); + + if (remaining_uses->entries == 0) + scoreboard->pressure -= pressure; + } +} + +static bool +nir_schedule_mark_src_scheduled(nir_src *src, void *state) +{ + nir_schedule_scoreboard *scoreboard = state; + struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); + + struct set_entry *entry = _mesa_set_search(remaining_uses, + src->parent_instr); + if (entry) { + /* Once we've used an SSA value in one instruction, bump the priority of + * the other uses so the SSA value can get fully consumed. + * + * We don't do this for registers, as it would be a hassle and it's + * unclear if that would help or not. Also, skip it for constants, as + * they're often folded as immediates into backend instructions and have + * many unrelated instructions all referencing the same value (0). + */ + if (src->is_ssa && + src->ssa->parent_instr->type != nir_instr_type_load_const) { + nir_foreach_use(other_src, src->ssa) { + if (other_src->parent_instr == src->parent_instr) + continue; + + nir_schedule_node *n = + nir_schedule_get_node(scoreboard->instr_map, + other_src->parent_instr); + + if (n && !n->partially_evaluated_path) { + if (debug) { + fprintf(stderr, " New partially evaluated path: "); + nir_print_instr(n->instr, stderr); + fprintf(stderr, "\n"); + } + + n->partially_evaluated_path = true; + } + } + } + } + + nir_schedule_mark_use(scoreboard, + src->is_ssa ?
(void *)src->ssa : (void *)src->reg.reg, + src->parent_instr, + nir_schedule_src_pressure(src)); + + return true; +} + +static bool +nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state) +{ + nir_schedule_scoreboard *scoreboard = state; + + nir_schedule_mark_use(scoreboard, def, def->parent_instr, + nir_schedule_def_pressure(def)); + + return true; +} + +static bool +nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state) +{ + nir_schedule_scoreboard *scoreboard = state; + + /* SSA defs were handled in nir_schedule_mark_def_scheduled() + */ + if (dest->is_ssa) + return true; + + /* XXX: This is not actually accurate for regs -- the last use of a reg may + * have a live interval that extends across control flow. We should + * calculate the live ranges of regs, and have scheduler nodes for the CF + * nodes that also "use" the reg. + */ + nir_schedule_mark_use(scoreboard, dest->reg.reg, + dest->reg.parent_instr, + nir_schedule_dest_pressure(dest)); + + return true; +} + +static void +nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard, + nir_schedule_node *n) +{ + nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard); + nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard); + nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard); + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + nir_schedule_node *child = (nir_schedule_node *)edge->child; + + child->ready_time = MAX2(child->ready_time, + scoreboard->time + n->delay); + + if (child->dag.parent_count == 1) { + if (debug) { + fprintf(stderr, " New DAG head: "); + nir_print_instr(child->instr, stderr); + fprintf(stderr, "\n"); + } + } + } + + dag_prune_head(scoreboard->dag, &n->dag); + + scoreboard->time = MAX2(n->ready_time, scoreboard->time); + scoreboard->time++; +} + +static void +nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block) +{ + while (!list_is_empty(&scoreboard->dag->heads)) { + if (debug) { + fprintf(stderr, "current list:\n"); + dump_state(scoreboard); + } + + nir_schedule_node *chosen; + if (scoreboard->pressure < scoreboard->threshold) + chosen = nir_schedule_choose_instruction_csp(scoreboard); + else + chosen = nir_schedule_choose_instruction_csr(scoreboard); + + /* Now that we've scheduled a new instruction, some of its children may + * be promoted to the list of instructions ready to be scheduled. + */ + nir_schedule_mark_node_scheduled(scoreboard, chosen); + + /* Move the instruction to the end (so our first chosen instructions are + * the start of the program). + */ + exec_node_remove(&chosen->instr->node); + exec_list_push_tail(&block->instr_list, &chosen->instr->node); + + if (debug) + fprintf(stderr, "\n"); + } +} + +static uint32_t +nir_schedule_get_delay(nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_ssa_undef: + case nir_instr_type_load_const: + case nir_instr_type_alu: + case nir_instr_type_deref: + case nir_instr_type_jump: + case nir_instr_type_parallel_copy: + case nir_instr_type_call: + case nir_instr_type_phi: + return 1; + + case nir_instr_type_intrinsic: + /* XXX: Pick a large number for UBO/SSBO/image/shared loads */ + return 1; + + case nir_instr_type_tex: + /* Pick some large number to try to fetch textures early and sample them + * late. 
+ */ + return 100; + } + + return 0; +} + +static void +nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state) +{ + nir_schedule_node *n = (nir_schedule_node *)node; + uint32_t max_delay = 0; + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + nir_schedule_node *child = (nir_schedule_node *)edge->child; + max_delay = MAX2(child->max_delay, max_delay); + } + + n->max_delay = MAX2(n->max_delay, max_delay + n->delay); + } + +static void +nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block) +{ + void *mem_ctx = ralloc_context(NULL); + scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx); + + scoreboard->dag = dag_create(mem_ctx); + + nir_foreach_instr(instr, block) { + nir_schedule_node *n = + rzalloc(mem_ctx, nir_schedule_node); + + n->instr = instr; + n->delay = nir_schedule_get_delay(instr); + dag_init_node(scoreboard->dag, &n->dag); + + _mesa_hash_table_insert(scoreboard->instr_map, instr, n); + } + + calculate_forward_deps(scoreboard, block); + calculate_reverse_deps(scoreboard, block); + + dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL); + + nir_schedule_instructions(scoreboard, block); + + ralloc_free(mem_ctx); + scoreboard->instr_map = NULL; +} + +static bool +nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state) +{ + nir_schedule_scoreboard *scoreboard = state; + struct set *def_uses = _mesa_pointer_set_create(scoreboard); + + _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses); + + _mesa_set_add(def_uses, def->parent_instr); + + nir_foreach_use(src, def) { + _mesa_set_add(def_uses, src->parent_instr); + } + + /* XXX: Handle if uses */ + + return true; +} + +static nir_schedule_scoreboard * +nir_schedule_get_scoreboard(nir_shader *shader, int threshold) +{ + nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard); + + scoreboard->shader = shader; + scoreboard->live_values = _mesa_pointer_set_create(scoreboard); + scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard); + scoreboard->threshold = threshold; + scoreboard->pressure = 0; + + nir_foreach_function(function, shader) { + nir_foreach_register(reg, &function->impl->registers) { + struct set *register_uses = + _mesa_pointer_set_create(scoreboard); + + _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses); + + nir_foreach_use(src, reg) { + _mesa_set_add(register_uses, src->parent_instr); + } + + /* XXX: Handle if uses */ + + nir_foreach_def(dest, reg) { + _mesa_set_add(register_uses, dest->reg.parent_instr); + } + } + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard, + scoreboard); + } + + /* XXX: We're ignoring if uses, which may prioritize scheduling other + * uses of the if src even when it doesn't help. That's not many + * values, though, so meh. + */ + } + } + + return scoreboard; +} + +static void +nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard) +{ +#ifdef NDEBUG + return; +#endif + + bool any_uses = false; + + hash_table_foreach(scoreboard->remaining_uses, entry) { + struct set *remaining_uses = entry->data; + + set_foreach(remaining_uses, instr_entry) { + if (!any_uses) { + fprintf(stderr, "Tracked uses remain after scheduling. 
" + "Affected instructions: \n"); + any_uses = true; + } + nir_print_instr(instr_entry->key, stderr); + fprintf(stderr, "\n"); + } + } + + assert(!any_uses); +} + +/** + * Schedules the NIR instructions to try to decrease stalls (for example, + * delaying texture reads) while managing register pressure. + * + * The threshold represents "number of NIR register/SSA def channels live + * before switching the scheduling heuristic to reduce register pressure", + * since most of our GPU architectures are scalar (extending to vector with a + * flag wouldn't be hard). This number should be a bit below the number of + * registers available (counting any that may be occupied by system value + * payload values, for example), since the heuristic may not always be able to + * free a register immediately. The amount below the limit is up to you to + * tune. + */ +void +nir_schedule(nir_shader *shader, int threshold) +{ + nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader, + threshold); + + if (debug) { + fprintf(stderr, "NIR shader before scheduling:\n"); + nir_print_shader(shader, stderr); + } + + nir_foreach_function(function, shader) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_schedule_block(scoreboard, block); + } + } + + nir_schedule_validate_uses(scoreboard); + + ralloc_free(scoreboard); +}