--- /dev/null
+/*
+ * Copyright © 2019 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "util/dag.h"
+#include "util/u_dynarray.h"
+
+/** @file
+ *
+ * Implements basic-block-level prepass instruction scheduling in NIR to
+ * manage register pressure.
+ *
+ * This is based on the Goodman/Hsu paper (1988, cached copy at
+ * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
+ * build the data dependency graph (DDG) for NIR (mostly from the NIR def/use
+ * chains for SSA instructions, plus some edges for ordering register writes
+ * vs. reads, and some more for ordering intrinsics). Then we pick heads off
+ * of the DDG using their heuristic and emit the NIR instructions back into
+ * the block in their new order.
+ *
+ * The hard case for prepass scheduling on GPUs seems to always be consuming
+ * texture/ubo results. The register pressure heuristic doesn't want to pick
+ * an instr that starts consuming texture results because it usually won't be
+ * the only use of those results, so scheduling it increases pressure.
+ *
+ * If you try to always force consumption of tex results, then in a case where
+ * a single sample is used for many outputs, you'll end up picking every other
+ * user and expanding register pressure. The partially_evaluated_path flag
+ * helps tremendously, in that if you happen for whatever reason to pick a
+ * texture sample's output, then you'll try to finish off that sample. Future
+ * work may include doing some local search before locking in a choice, to try
+ * to more reliably find the case where just a few choices going against the
+ * heuristic can manage to free the whole vector.
+ */
+
+static bool debug;
+
+/**
+ * Represents a node in the DDG for a NIR instruction.
+ */
+typedef struct {
+ struct dag_node dag; /* must be first so we can cast dag_node back to nir_schedule_node */
+ nir_instr *instr;
+ bool partially_evaluated_path;
+
+ /* Approximate estimate of the delay between starting this instruction and
+ * its results being available.
+ *
+ * Accuracy is not too important, given that we're prepass scheduling here
+ * and just trying to reduce excess dependencies introduced by a register
+ * allocator by stretching out the live intervals of expensive
+ * instructions.
+ */
+ uint32_t delay;
+
+ /* Cost of the maximum-delay path from this node to the leaves. */
+ uint32_t max_delay;
+
+ /* scoreboard->time value when this instruction can be scheduled without
+ * any stalls expected.
+ */
+ uint32_t ready_time;
+} nir_schedule_node;
+
+typedef struct {
+ struct dag *dag;
+
+ nir_shader *shader;
+
+ /* Mapping from nir_register * or nir_ssa_def * to a struct set of
+ * instructions remaining to be scheduled that use the value.
+ */
+ struct hash_table *remaining_uses;
+
+ /* Map from nir_instr to nir_schedule_node * */
+ struct hash_table *instr_map;
+
+ /* Set of nir_register * or nir_ssa_def * that have had any instruction
+ * scheduled on them.
+ */
+ struct set *live_values;
+
+ /* An abstract approximation of the number of nir_schedule_node->delay
+ * units since the start of the shader.
+ */
+ uint32_t time;
+
+ /* Number of channels currently used by the NIR instructions that have been
+ * scheduled.
+ */
+ int pressure;
+
+ /* Number of channels that may be in use before we switch to the
+ * pressure-prioritizing scheduling heuristic.
+ */
+ int threshold;
+} nir_schedule_scoreboard;
+
+/* When walking the instructions in reverse, we use this flag to swap
+ * before/after in add_dep().
+ */
+enum direction { F, R };
+
+typedef struct {
+ nir_shader *shader;
+
+ /* Map from nir_instr to nir_schedule_node * */
+ struct hash_table *instr_map;
+ /* Map from nir_register to nir_schedule_node * */
+ struct hash_table *reg_map;
+
+ /* Scheduler nodes for the last instruction seen in each class of dependency.
+ */
+ nir_schedule_node *load_input;
+ nir_schedule_node *store_shared;
+ nir_schedule_node *unknown_intrinsic;
+ nir_schedule_node *discard;
+ nir_schedule_node *jump;
+
+ enum direction dir;
+} nir_deps_state;
+
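+/**
+ * Convenience wrapper that returns the data stored for @key in @ht, or NULL
+ * if the key is not present.
+ */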
+static void *
+_mesa_hash_table_search_data(struct hash_table *ht, void *key)
+{
+ struct hash_entry *entry = _mesa_hash_table_search(ht, key);
+ if (!entry)
+ return NULL;
+ return entry->data;
+}
+
+static nir_schedule_node *
+nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
+{
+ return _mesa_hash_table_search_data(instr_map, instr);
+}
+
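+/**
+ * Returns the set of not-yet-scheduled instructions that still use the SSA
+ * def or register read by @src.
+ */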
+static struct set *
+nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
+{
+ if (src->is_ssa) {
+ return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
+ } else {
+ return _mesa_hash_table_search_data(scoreboard->remaining_uses,
+ src->reg.reg);
+ }
+}
+
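+/* Pressure contributions are counted in channels (num_components), matching
+ * the mostly-scalar view of register pressure described in nir_schedule().
+ */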
+static int
+nir_schedule_def_pressure(nir_ssa_def *def)
+{
+ return def->num_components;
+}
+
+static int
+nir_schedule_src_pressure(nir_src *src)
+{
+ if (src->is_ssa)
+ return nir_schedule_def_pressure(src->ssa);
+ else
+ return src->reg.reg->num_components;
+}
+
+static int
+nir_schedule_dest_pressure(nir_dest *dest)
+{
+ if (dest->is_ssa)
+ return nir_schedule_def_pressure(&dest->ssa);
+ else
+ return dest->reg.reg->num_components;
+}
+
+/**
+ * Adds a dependency such that @after must appear in the final program after
+ * @before.
+ *
+ * We add @after as a child of @before, so that the DAG heads are the
+ * instructions whose dependencies have all been satisfied, and we make our
+ * scheduling decisions from the start of the program towards the end.
+ */
+static void
+add_dep(nir_deps_state *state,
+ nir_schedule_node *before,
+ nir_schedule_node *after)
+{
+ if (!before || !after)
+ return;
+
+ assert(before != after);
+
+ if (state->dir == F)
+ dag_add_edge(&before->dag, &after->dag, NULL);
+ else
+ dag_add_edge(&after->dag, &before->dag, NULL);
+}
+
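+/**
+ * add_read_dep() only orders @after against @before, while add_write_dep()
+ * also records @after as the new last instruction of its class so that later
+ * reads and writes are serialized against it.
+ */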
+static void
+add_read_dep(nir_deps_state *state,
+ nir_schedule_node *before,
+ nir_schedule_node *after)
+{
+ add_dep(state, before, after);
+}
+
+static void
+add_write_dep(nir_deps_state *state,
+ nir_schedule_node **before,
+ nir_schedule_node *after)
+{
+ add_dep(state, *before, after);
+ *before = after;
+}
+
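+/**
+ * Orders a register read against the register's writes: in the forward pass
+ * the read depends on the previous write, and in the reverse pass it is kept
+ * ahead of the following write.
+ */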
+static bool
+nir_schedule_reg_src_deps(nir_src *src, void *in_state)
+{
+ nir_deps_state *state = in_state;
+
+ if (src->is_ssa)
+ return true;
+
+ struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
+ src->reg.reg);
+ if (!entry)
+ return true;
+ nir_schedule_node *dst_n = entry->data;
+
+ nir_schedule_node *src_n = nir_schedule_get_node(state->instr_map,
+ src->parent_instr);
+
+ add_dep(state, dst_n, src_n);
+
+ return true;
+}
+
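+/**
+ * Keeps register writes in order by making each write depend on the previous
+ * write tracked in reg_map (or recording this one as the first write seen).
+ */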
+static bool
+nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
+{
+ nir_deps_state *state = in_state;
+
+ if (dest->is_ssa)
+ return true;
+
+ nir_schedule_node *dest_n = nir_schedule_get_node(state->instr_map,
+ dest->reg.parent_instr);
+
+ struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
+ dest->reg.reg);
+ if (!entry) {
+ _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
+ return true;
+ }
+ nir_schedule_node **before = (nir_schedule_node **)&entry->data;
+
+ add_write_dep(state, before, dest_n);
+
+ return true;
+}
+
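+/**
+ * Makes every use of an SSA def depend on the defining instruction. This is
+ * only run in the forward pass, since SSA uses never precede their def.
+ */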
+static bool
+nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
+{
+ nir_deps_state *state = in_state;
+ nir_schedule_node *def_n = nir_schedule_get_node(state->instr_map, def->parent_instr);
+
+ nir_foreach_use(src, def) {
+ nir_schedule_node *use_n = nir_schedule_get_node(state->instr_map,
+ src->parent_instr);
+
+ add_read_dep(state, def_n, use_n);
+ }
+
+ return true;
+}
+
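+/**
+ * Adds the ordering dependencies for intrinsics: discards, I/O, and shared
+ * memory accesses serialize against the relevant "last instruction" trackers
+ * in nir_deps_state, and anything not individually categorized serializes
+ * against the other unknown intrinsics.
+ */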
+static void
+nir_schedule_intrinsic_deps(nir_deps_state *state,
+ nir_intrinsic_instr *instr)
+{
+ nir_schedule_node *n = nir_schedule_get_node(state->instr_map, &instr->instr);
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_front_face:
+ break;
+
+ case nir_intrinsic_discard:
+ case nir_intrinsic_discard_if:
+ /* We are adding two dependencies:
+ *
+ * * An individual one that we can use to add a read_dep while handling
+ * nir_instr_type_tex
+ *
+ * * One on the unknown intrinsic set, as we want discard to be
+ * serialized in the same order relative to intervening stores or
+ * atomic accesses to SSBOs and images
+ */
+ add_write_dep(state, &state->discard, n);
+ add_write_dep(state, &state->unknown_intrinsic, n);
+ break;
+
+ case nir_intrinsic_store_output:
+ /* For some non-FS shader stages, or for some hardware, output stores
+ * affect the same shared memory as input loads.
+ */
+ if (state->shader->info.stage != MESA_SHADER_FRAGMENT)
+ add_write_dep(state, &state->load_input, n);
+
+ /* Make sure that preceding discards stay before the store_output */
+ add_read_dep(state, state->discard, n);
+
+ break;
+
+ case nir_intrinsic_load_input:
+ add_read_dep(state, state->load_input, n);
+ break;
+
+ case nir_intrinsic_load_shared:
+ /* Don't move a load_shared beyond a following store_shared, as that could
+ * change the loaded value
+ */
+ add_read_dep(state, state->store_shared, n);
+ break;
+
+ case nir_intrinsic_store_shared:
+ add_write_dep(state, &state->store_shared, n);
+ break;
+
+ case nir_intrinsic_barrier:
+ case nir_intrinsic_memory_barrier_shared:
+ add_write_dep(state, &state->store_shared, n);
+
+ /* Serialize against ssbos/atomics/etc. */
+ add_write_dep(state, &state->unknown_intrinsic, n);
+ break;
+
+ default:
+ /* Attempt to handle other intrinsics that we haven't individually
+ * categorized by serializing them in the same order relative to each
+ * other.
+ */
+ add_write_dep(state, &state->unknown_intrinsic, n);
+ break;
+ }
+}
+
+/**
+ * Common code for dependencies that need to be tracked both forward and
+ * backward.
+ *
+ * This is for things like "all reads of r4 have to happen between the r4
+ * writes that surround them".
+ */
+static void
+nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
+{
+ nir_instr *instr = n->instr;
+
+ /* For NIR SSA defs, we only need to do a single pass of making the uses
+ * depend on the def.
+ */
+ if (state->dir == F)
+ nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
+
+ /* For NIR regs, track the last writer in the scheduler state so that we
+ * can keep the writes in order and let reads get reordered only between
+ * each write.
+ */
+ nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
+
+ nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
+
+ /* Make sure any other instructions keep their positions relative to
+ * jumps.
+ */
+ if (instr->type != nir_instr_type_jump)
+ add_read_dep(state, state->jump, n);
+
+ switch (instr->type) {
+ case nir_instr_type_ssa_undef:
+ case nir_instr_type_load_const:
+ case nir_instr_type_alu:
+ case nir_instr_type_deref:
+ break;
+
+ case nir_instr_type_tex:
+ /* Don't move texture ops before a discard, as that could increase
+ * memory bandwidth for reading the discarded samples.
+ */
+ add_read_dep(state, state->discard, n);
+ break;
+
+ case nir_instr_type_jump:
+ add_write_dep(state, &state->jump, n);
+ break;
+
+ case nir_instr_type_call:
+ unreachable("Calls should have been lowered");
+ break;
+
+ case nir_instr_type_parallel_copy:
+ unreachable("Parallel copies should have been lowered");
+ break;
+
+ case nir_instr_type_phi:
+ unreachable("nir_schedule() should be called after lowering from SSA");
+ break;
+
+ case nir_instr_type_intrinsic:
+ nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
+ break;
+ }
+}
+
+static void
+calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
+{
+ nir_deps_state state = {
+ .shader = scoreboard->shader,
+ .dir = F,
+ .instr_map = scoreboard->instr_map,
+ .reg_map = _mesa_pointer_hash_table_create(NULL),
+ };
+
+ nir_foreach_instr(instr, block) {
+ nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
+ instr);
+ nir_schedule_calculate_deps(&state, node);
+ }
+
+ ralloc_free(state.reg_map);
+}
+
+static void
+calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
+{
+ nir_deps_state state = {
+ .shader = scoreboard->shader,
+ .dir = R,
+ .instr_map = scoreboard->instr_map,
+ .reg_map = _mesa_pointer_hash_table_create(NULL),
+ };
+
+ nir_foreach_instr_reverse(instr, block) {
+ nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
+ instr);
+ nir_schedule_calculate_deps(&state, node);
+ }
+
+ ralloc_free(state.reg_map);
+}
+
+typedef struct {
+ nir_schedule_scoreboard *scoreboard;
+ int regs_freed;
+} nir_schedule_regs_freed_state;
+
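+/* If this source is the last remaining use of its value, scheduling the
+ * instruction frees the value's channels.
+ */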
+static bool
+nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
+{
+ nir_schedule_regs_freed_state *state = in_state;
+ nir_schedule_scoreboard *scoreboard = state->scoreboard;
+ struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
+
+ if (remaining_uses->entries == 1 &&
+ _mesa_set_search(remaining_uses, src->parent_instr)) {
+ state->regs_freed += nir_schedule_src_pressure(src);
+ }
+
+ return true;
+}
+
+static bool
+nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
+{
+ nir_schedule_regs_freed_state *state = in_state;
+
+ state->regs_freed -= nir_schedule_def_pressure(def);
+
+ return true;
+}
+
+static bool
+nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
+{
+ nir_schedule_regs_freed_state *state = in_state;
+ nir_schedule_scoreboard *scoreboard = state->scoreboard;
+
+ if (dest->is_ssa)
+ return true;
+
+ nir_register *reg = dest->reg.reg;
+
+ /* Only the first def of a reg counts against register pressure. */
+ if (!_mesa_set_search(scoreboard->live_values, reg))
+ state->regs_freed -= nir_schedule_dest_pressure(dest);
+
+ return true;
+}
+
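+/**
+ * Returns the net number of channels of register pressure that would be
+ * freed by scheduling @n next (negative if it would add pressure).
+ */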
+static int
+nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
+{
+ nir_schedule_regs_freed_state state = {
+ .scoreboard = scoreboard,
+ };
+
+ nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
+
+ nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
+
+ nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
+
+ return state.regs_freed;
+}
+
+/**
+ * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
+ * Scheduling for Parallelism) heuristic.
+ *
+ * Picks the instruction with the largest critical-path cost that is ready to
+ * execute without stalls, if possible; otherwise picks the instruction with
+ * the largest critical-path cost regardless of readiness.
+ */
+static nir_schedule_node *
+nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
+{
+ nir_schedule_node *chosen = NULL;
+
+ /* Find the leader in the ready (shouldn't-stall) set with the maximum
+ * cost.
+ */
+ list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
+ if (scoreboard->time < n->ready_time)
+ continue;
+
+ if (!chosen || chosen->max_delay < n->max_delay)
+ chosen = n;
+ }
+ if (chosen) {
+ if (debug) {
+ fprintf(stderr, "chose (ready): ");
+ nir_print_instr(chosen->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+
+ return chosen;
+ }
+
+ /* Otherwise, choose the leader with the maximum cost. */
+ list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
+ if (!chosen || chosen->max_delay < n->max_delay)
+ chosen = n;
+ }
+ if (debug) {
+ fprintf(stderr, "chose (leader): ");
+ nir_print_instr(chosen->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+
+ return chosen;
+}
+
+/**
+ * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
+ * Scheduling for Register pressure) heuristic.
+ */
+static nir_schedule_node *
+nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
+{
+ nir_schedule_node *chosen = NULL;
+
+ /* Find a ready inst with regs freed and pick the one with max cost. */
+ list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
+ if (n->ready_time > scoreboard->time)
+ continue;
+
+ int regs_freed = nir_schedule_regs_freed(scoreboard, n);
+
+ if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
+ chosen = n;
+ }
+ }
+ if (chosen) {
+ if (debug) {
+ fprintf(stderr, "chose (freed+ready): ");
+ nir_print_instr(chosen->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+
+ return chosen;
+ }
+
+ /* Find a leader with regs freed and pick the one with max cost. */
+ list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
+ int regs_freed = nir_schedule_regs_freed(scoreboard, n);
+
+ if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
+ chosen = n;
+ }
+ }
+ if (chosen) {
+ if (debug) {
+ fprintf(stderr, "chose (regs freed): ");
+ nir_print_instr(chosen->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+
+ return chosen;
+ }
+
+ /* Find a partially evaluated path and try to finish it off */
+ list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
+ if (n->partially_evaluated_path &&
+ (!chosen || chosen->max_delay < n->max_delay)) {
+ chosen = n;
+ }
+ }
+ if (chosen) {
+ if (debug) {
+ fprintf(stderr, "chose (partial path): ");
+ nir_print_instr(chosen->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+
+ return chosen;
+ }
+
+ /* Contra the paper, pick a leader with no effect on used regs. This may
+ * open up new opportunities, as otherwise a single-operand instr consuming
+ * a value will tend to block us from reaching the instruction that frees
+ * that value. This had a
+ * massive effect on reducing spilling on V3D.
+ *
+ * XXX: Should this prioritize ready?
+ */
+ list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
+ if (nir_schedule_regs_freed(scoreboard, n) != 0)
+ continue;
+
+ if (!chosen || chosen->max_delay < n->max_delay)
+ chosen = n;
+ }
+ if (chosen) {
+ if (debug) {
+ fprintf(stderr, "chose (regs no-op): ");
+ nir_print_instr(chosen->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+
+ return chosen;
+ }
+
+ /* Pick the max delay of the remaining ready set. */
+ list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
+ if (n->ready_time > scoreboard->time)
+ continue;
+ if (!chosen || chosen->max_delay < n->max_delay)
+ chosen = n;
+ }
+ if (chosen) {
+ if (debug) {
+ fprintf(stderr, "chose (ready max delay): ");
+ nir_print_instr(chosen->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+ return chosen;
+ }
+
+ /* Pick the max delay of the remaining leaders. */
+ list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
+ if (!chosen || chosen->max_delay < n->max_delay)
+ chosen = n;
+ }
+
+ if (debug) {
+ fprintf(stderr, "chose (max delay): ");
+ nir_print_instr(chosen->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+
+ return chosen;
+}
+
+static void
+dump_state(nir_schedule_scoreboard *scoreboard)
+{
+ list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
+ fprintf(stderr, "maxdel %5d ", n->max_delay);
+ nir_print_instr(n->instr, stderr);
+ fprintf(stderr, "\n");
+
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ nir_schedule_node *child = (nir_schedule_node *)edge->child;
+
+ fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
+ nir_print_instr(child->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+ }
+}
+
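+/**
+ * Updates liveness and pressure accounting for one value involved in a newly
+ * scheduled instruction: the value becomes live on its first scheduled
+ * def/use and dead once no uses remain.
+ */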
+static void
+nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
+ void *reg_or_def,
+ nir_instr *reg_or_def_parent,
+ int pressure)
+{
+ /* Make the value live if it's the first time it's been used. */
+ if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
+ _mesa_set_add(scoreboard->live_values, reg_or_def);
+ scoreboard->pressure += pressure;
+ }
+
+ /* Make the value dead if it's the last remaining use. Be careful when one
+ * instruction uses a value twice to not decrement pressure twice.
+ */
+ struct set *remaining_uses =
+ _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
+ struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
+ if (entry) {
+ _mesa_set_remove(remaining_uses, entry);
+
+ if (remaining_uses->entries == 0)
+ scoreboard->pressure -= pressure;
+ }
+}
+
+static bool
+nir_schedule_mark_src_scheduled(nir_src *src, void *state)
+{
+ nir_schedule_scoreboard *scoreboard = state;
+ struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
+
+ struct set_entry *entry = _mesa_set_search(remaining_uses,
+ src->parent_instr);
+ if (entry) {
+ /* Once we've used an SSA value in one instruction, bump the priority of
+ * the other uses so the SSA value can get fully consumed.
+ *
+ * We don't do this for registers, since it would be a hassle and it's
+ * unclear if it would help or not. Also, skip it for constants, as
+ * they're often folded as immediates into backend instructions and have
+ * many unrelated instructions all referencing the same value (0).
+ */
+ if (src->is_ssa &&
+ src->ssa->parent_instr->type != nir_instr_type_load_const) {
+ nir_foreach_use(other_src, src->ssa) {
+ if (other_src->parent_instr == src->parent_instr)
+ continue;
+
+ nir_schedule_node *n =
+ nir_schedule_get_node(scoreboard->instr_map,
+ other_src->parent_instr);
+
+ if (n && !n->partially_evaluated_path) {
+ if (debug) {
+ fprintf(stderr, " New partially evaluated path: ");
+ nir_print_instr(n->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+
+ n->partially_evaluated_path = true;
+ }
+ }
+ }
+ }
+
+ nir_schedule_mark_use(scoreboard,
+ src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
+ src->parent_instr,
+ nir_schedule_src_pressure(src));
+
+ return true;
+}
+
+static bool
+nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
+{
+ nir_schedule_scoreboard *scoreboard = state;
+
+ nir_schedule_mark_use(scoreboard, def, def->parent_instr,
+ nir_schedule_def_pressure(def));
+
+ return true;
+}
+
+static bool
+nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
+{
+ nir_schedule_scoreboard *scoreboard = state;
+
+ /* SSA defs were handled in nir_schedule_mark_def_scheduled()
+ */
+ if (dest->is_ssa)
+ return true;
+
+ /* XXX: This is not actually accurate for regs -- the last use of a reg may
+ * have a live interval that extends across control flow. We should
+ * calculate the live ranges of regs, and have scheduler nodes for the CF
+ * nodes that also "use" the reg.
+ */
+ nir_schedule_mark_use(scoreboard, dest->reg.reg,
+ dest->reg.parent_instr,
+ nir_schedule_dest_pressure(dest));
+
+ return true;
+}
+
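+/**
+ * Updates the scoreboard for a chosen instruction: accounts for its sources
+ * and defs, propagates ready_time to its consumers, prunes it from the DAG
+ * (possibly promoting new heads), and advances the scoreboard clock.
+ */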
+static void
+nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
+ nir_schedule_node *n)
+{
+ nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
+ nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
+ nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
+
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ nir_schedule_node *child = (nir_schedule_node *)edge->child;
+
+ child->ready_time = MAX2(child->ready_time,
+ scoreboard->time + n->delay);
+
+ if (child->dag.parent_count == 1) {
+ if (debug) {
+ fprintf(stderr, " New DAG head: ");
+ nir_print_instr(child->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+ }
+ }
+
+ dag_prune_head(scoreboard->dag, &n->dag);
+
+ scoreboard->time = MAX2(n->ready_time, scoreboard->time);
+ scoreboard->time++;
+}
+
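+/**
+ * Main scheduling loop: while DAG heads remain, choose one with the CSP
+ * heuristic while pressure is below the threshold (CSR once above it) and
+ * move it to the end of the block, so the block ends up in the chosen order.
+ */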
+static void
+nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
+{
+ while (!list_is_empty(&scoreboard->dag->heads)) {
+ if (debug) {
+ fprintf(stderr, "current list:\n");
+ dump_state(scoreboard);
+ }
+
+ nir_schedule_node *chosen;
+ if (scoreboard->pressure < scoreboard->threshold)
+ chosen = nir_schedule_choose_instruction_csp(scoreboard);
+ else
+ chosen = nir_schedule_choose_instruction_csr(scoreboard);
+
+ /* Now that we've scheduled a new instruction, some of its children may
+ * be promoted to the list of instructions ready to be scheduled.
+ */
+ nir_schedule_mark_node_scheduled(scoreboard, chosen);
+
+ /* Move the instruction to the end (so our first chosen instructions are
+ * the start of the program).
+ */
+ exec_node_remove(&chosen->instr->node);
+ exec_list_push_tail(&block->instr_list, &chosen->instr->node);
+
+ if (debug)
+ fprintf(stderr, "\n");
+ }
+}
+
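+/**
+ * Rough per-instruction latency estimate used to compute ready_time and the
+ * critical-path cost (max_delay).
+ */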
+static uint32_t
+nir_schedule_get_delay(nir_instr *instr)
+{
+ switch (instr->type) {
+ case nir_instr_type_ssa_undef:
+ case nir_instr_type_load_const:
+ case nir_instr_type_alu:
+ case nir_instr_type_deref:
+ case nir_instr_type_jump:
+ case nir_instr_type_parallel_copy:
+ case nir_instr_type_call:
+ case nir_instr_type_phi:
+ return 1;
+
+ case nir_instr_type_intrinsic:
+ /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
+ return 1;
+
+ case nir_instr_type_tex:
+ /* Pick some large number so that texture fetches get issued early and
+ * their results get consumed as late as possible.
+ */
+ return 100;
+ }
+
+ return 0;
+}
+
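+/**
+ * Bottom-up DAG traversal callback that computes each node's cost of the
+ * maximum-delay path to the leaves.
+ */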
+static void
+nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
+{
+ nir_schedule_node *n = (nir_schedule_node *)node;
+ uint32_t max_delay = 0;
+
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ nir_schedule_node *child = (nir_schedule_node *)edge->child;
+ max_delay = MAX2(child->max_delay, max_delay);
+ }
+
+ n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
+}
+
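+/**
+ * Schedules one basic block: builds a DAG node per instruction, computes the
+ * forward and reverse dependencies, propagates max_delay, and then re-emits
+ * the instructions in the chosen order.
+ */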
+static void
+nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
+{
+ void *mem_ctx = ralloc_context(NULL);
+ scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
+
+ scoreboard->dag = dag_create(mem_ctx);
+
+ nir_foreach_instr(instr, block) {
+ nir_schedule_node *n =
+ rzalloc(mem_ctx, nir_schedule_node);
+
+ n->instr = instr;
+ n->delay = nir_schedule_get_delay(instr);
+ dag_init_node(scoreboard->dag, &n->dag);
+
+ _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
+ }
+
+ calculate_forward_deps(scoreboard, block);
+ calculate_reverse_deps(scoreboard, block);
+
+ dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
+
+ nir_schedule_instructions(scoreboard, block);
+
+ ralloc_free(mem_ctx);
+ scoreboard->instr_map = NULL;
+}
+
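+/**
+ * Initializes the remaining-uses set for an SSA def, containing the defining
+ * instruction itself plus every instruction that reads the def.
+ */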
+static bool
+nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
+{
+ nir_schedule_scoreboard *scoreboard = state;
+ struct set *def_uses = _mesa_pointer_set_create(scoreboard);
+
+ _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
+
+ _mesa_set_add(def_uses, def->parent_instr);
+
+ nir_foreach_use(src, def) {
+ _mesa_set_add(def_uses, src->parent_instr);
+ }
+
+ /* XXX: Handle if uses */
+
+ return true;
+}
+
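+/**
+ * Allocates the scoreboard and populates the remaining-uses sets for every
+ * register and SSA def in the shader.
+ */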
+static nir_schedule_scoreboard *
+nir_schedule_get_scoreboard(nir_shader *shader, int threshold)
+{
+ nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
+
+ scoreboard->shader = shader;
+ scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
+ scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
+ scoreboard->threshold = threshold;
+ scoreboard->pressure = 0;
+
+ nir_foreach_function(function, shader) {
+ nir_foreach_register(reg, &function->impl->registers) {
+ struct set *register_uses =
+ _mesa_pointer_set_create(scoreboard);
+
+ _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
+
+ nir_foreach_use(src, reg) {
+ _mesa_set_add(register_uses, src->parent_instr);
+ }
+
+ /* XXX: Handle if uses */
+
+ nir_foreach_def(dest, reg) {
+ _mesa_set_add(register_uses, dest->reg.parent_instr);
+ }
+ }
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr(instr, block) {
+ nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
+ scoreboard);
+ }
+
+ /* XXX: We're ignoring if uses, which may prioritize scheduling other
+ * uses of the if src even when it doesn't help. That's not many
+ * values, though, so meh.
+ */
+ }
+ }
+
+ return scoreboard;
+}
+
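+/**
+ * Debug-build check that every tracked use was consumed during scheduling;
+ * anything left over indicates broken bookkeeping.
+ */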
+static void
+nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
+{
+#ifdef NDEBUG
+ return;
+#endif
+
+ bool any_uses = false;
+
+ hash_table_foreach(scoreboard->remaining_uses, entry) {
+ struct set *remaining_uses = entry->data;
+
+ set_foreach(remaining_uses, instr_entry) {
+ if (!any_uses) {
+ fprintf(stderr, "Tracked uses remain after scheduling. "
+ "Affected instructions: \n");
+ any_uses = true;
+ }
+ nir_print_instr(instr_entry->key, stderr);
+ fprintf(stderr, "\n");
+ }
+ }
+
+ assert(!any_uses);
+}
+
+/**
+ * Schedules the NIR instructions to try to decrease stalls (for example,
+ * issuing texture fetches early and delaying the reads of their results)
+ * while managing register pressure.
+ *
+ * The threshold represents "number of NIR register/SSA def channels live
+ * before switching the scheduling heuristic to reduce register pressure",
+ * since most of our GPU architectures are scalar (extending to vector with a
+ * flag wouldn't be hard). This number should be a bit below the number of
+ * registers available (counting any that may be occupied by system value
+ * payloads, for example), since the heuristic may not always be able to
+ * free a register immediately. The amount below the limit is up to you to
+ * tune.
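+ *
+ * For example (purely illustrative, since the right value is backend
+ * specific), a scalar backend with 64 registers might pass a threshold
+ * somewhere around 48-56 channels to leave that headroom.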
+ */
+void
+nir_schedule(nir_shader *shader, int threshold)
+{
+ nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
+ threshold);
+
+ if (debug) {
+ fprintf(stderr, "NIR shader before scheduling:\n");
+ nir_print_shader(shader, stderr);
+ }
+
+ nir_foreach_function(function, shader) {
+ if (!function->impl)
+ continue;
+
+ nir_foreach_block(block, function->impl) {
+ nir_schedule_block(scoreboard, block);
+ }
+ }
+
+ nir_schedule_validate_uses(scoreboard);
+
+ ralloc_free(scoreboard);
+}