freedreno/ir3: post-RA sched pass
authorRob Clark <robdclark@chromium.org>
Fri, 13 Dec 2019 22:09:39 +0000 (14:09 -0800)
committerMarge Bot <eric+marge@anholt.net>
Sat, 1 Feb 2020 02:40:22 +0000 (02:40 +0000)
After RA, we can schedule to increase parallelism (reduce nop's) without
worrying about increasing register pressure.  This pass lets us cut the
instruction count by ~10%, and prioritize bary.f, kill, etc, which would
tend to increase register pressure if we tried to do that before RA.

It would be even more useful if RA round-robin'd register choices, since
reusing the same registers creates false dependencies that limit
reordering.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3569>

src/freedreno/Makefile.sources
src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_compiler_nir.c
src/freedreno/ir3/ir3_postsched.c [new file with mode: 0644]
src/freedreno/ir3/ir3_ra.c
src/freedreno/ir3/meson.build

index fb0b9f2159f720be4fc43296f4865a079483cf51..c67fa375e03716e0e714d965bae801c857e03e61 100644 (file)
@@ -44,6 +44,7 @@ ir3_SOURCES := \
        ir3/ir3_nir_lower_tex_prefetch.c \
        ir3/ir3_nir_lower_tg4_to_tex.c \
        ir3/ir3_nir_move_varying_inputs.c \
+       ir3/ir3_postsched.c \
        ir3/ir3_print.c \
        ir3/ir3_ra.c \
        ir3/ir3_sched.c \
index b10f2f0da79c4e9ac4c5096482691e6e1102e001..f78a7d6f3646243ea360a5ade5d556c0491cd25e 100644 (file)
@@ -1138,6 +1138,9 @@ void ir3_sun(struct ir3 *ir);
 void ir3_sched_add_deps(struct ir3 *ir);
 int ir3_sched(struct ir3 *ir);
 
+struct ir3_context;
+int ir3_postsched(struct ir3_context *ctx);
+
 bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
 
 /* register assignment: */
index 13052648814fd4ce91e192ff9581bf43ff246bdc..52a049a4123b02953abc9a0a3cbd9618c81ce917 100644 (file)
@@ -3481,6 +3481,9 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
        ir3_debug_print(ir, "AFTER RA");
 
+       ir3_postsched(ctx);
+       ir3_debug_print(ir, "AFTER POSTSCHED");
+
        if (compiler->gpu_id >= 600) {
                if (ir3_a6xx_fixup_atomic_dests(ir, so)) {
                        ir3_debug_print(ir, "AFTER ATOMIC FIXUP");
diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c
new file mode 100644 (file)
index 0000000..4290e88
--- /dev/null
@@ -0,0 +1,669 @@
+/*
+ * Copyright (C) 2019 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/dag.h"
+#include "util/u_math.h"
+
+#include "ir3.h"
+#include "ir3_compiler.h"
+#include "ir3_context.h"
+
+#ifdef DEBUG
+#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
+#else
+#define SCHED_DEBUG 0
+#endif
+#define d(fmt, ...) do { if (SCHED_DEBUG) { \
+       printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
+} } while (0)
+
+#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
+       printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
+       ir3_print_instr(instr); \
+} } while (0)
+
+/*
+ * Post RA Instruction Scheduling
+ */
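+
+/*
+ * In outline: for each block, the existing nop's (and the trailing
+ * branch/jump) are stripped, all instructions are moved onto an
+ * unscheduled list, and a DAG of register dependencies (plus false
+ * deps) is built.  choose_instr() then repeatedly picks a DAG head
+ * using a simple priority cascade:
+ *
+ *   1. meta instructions
+ *   2. inputs (the last bary.f frees varying storage for new waves)
+ *   3. kills that are ready without nop's
+ *   4. sfu/tex that are ready without nop's
+ *   5. anything ready counting soft (sy)/(ss) delays
+ *   6. anything ready without nop's
+ *   7. otherwise, the head with the largest max_delay
+ *
+ * and nop's are re-inserted where the chosen instruction still needs
+ * delay slots.
+ */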
+
+struct ir3_postsched_ctx {
+       struct ir3_context *ctx;
+
+       void *mem_ctx;
+       struct ir3_block *block;           /* the current block */
+       struct dag *dag;
+
+       struct list_head unscheduled_list; /* unscheduled instructions */
+       struct ir3_instruction *scheduled; /* last scheduled instr */
+       struct ir3_instruction *pred;      /* current p0.x user, if any */
+
+       bool error;
+};
+
+struct ir3_postsched_node {
+       struct dag_node dag;     /* must be first for util_dynarray_foreach */
+       struct ir3_instruction *instr;
+       bool partially_evaluated_path;
+
+       unsigned delay;
+       unsigned max_delay;
+};
+
+#define foreach_sched_node(__n, __list) \
+       list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)
+
+#define foreach_bit(b, mask) \
+       for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)
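+/* e.g. a .xz writemask (wrmask = 0x5) yields b = 0 and then b = 2 */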
+
+// TODO deduplicate
+static bool is_sfu_or_mem(struct ir3_instruction *instr)
+{
+       return is_sfu(instr) || is_mem(instr);
+}
+
+static void
+schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
+{
+       debug_assert(ctx->block == instr->block);
+
+       /* maybe there is a better way to handle this than just stuffing
+        * a nop.. ideally we'd know about this constraint in the
+        * scheduling and depth calculation..
+        */
+       if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
+               ir3_NOP(ctx->block);
+
+       /* remove from unscheduled_list:
+        */
+       list_delinit(&instr->node);
+
+       if (writes_pred(instr)) {
+               ctx->pred = instr;
+       }
+
+       di(instr, "schedule");
+
+       list_addtail(&instr->node, &instr->block->instr_list);
+       ctx->scheduled = instr;
+
+       struct ir3_postsched_node *n = instr->data;
+       dag_prune_head(ctx->dag, &n->dag);
+}
+
+static void
+dump_state(struct ir3_postsched_ctx *ctx)
+{
+       if (!SCHED_DEBUG)
+               return;
+
+       foreach_sched_node (n, &ctx->dag->heads) {
+               di(n->instr, "maxdel=%3d    ", n->max_delay);
+
+               util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+                       struct ir3_postsched_node *child =
+                               (struct ir3_postsched_node *)edge->child;
+
+                       di(child->instr, " -> (%d parents) ", child->dag.parent_count);
+               }
+       }
+}
+
+/* find instruction to schedule: */
+static struct ir3_instruction *
+choose_instr(struct ir3_postsched_ctx *ctx)
+{
+       struct ir3_postsched_node *chosen = NULL;
+
+       dump_state(ctx);
+
+       foreach_sched_node (n, &ctx->dag->heads) {
+               if (!is_meta(n->instr))
+                       continue;
+
+               if (!chosen || (chosen->max_delay < n->max_delay))
+                       chosen = n;
+       }
+
+       if (chosen) {
+               di(chosen->instr, "prio: chose (meta)");
+               return chosen->instr;
+       }
+
+       /* Try to schedule inputs with a higher priority, if possible, as
+        * the last bary.f unlocks varying storage to unblock more VS
+        * warps.
+        */
+       foreach_sched_node (n, &ctx->dag->heads) {
+               if (!is_input(n->instr))
+                       continue;
+
+               if (!chosen || (chosen->max_delay < n->max_delay))
+                       chosen = n;
+       }
+
+       if (chosen) {
+               di(chosen->instr, "prio: chose (input)");
+               return chosen->instr;
+       }
+
+       /* Next prioritize discards: */
+       foreach_sched_node (n, &ctx->dag->heads) {
+               unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+
+               if (d > 0)
+                       continue;
+
+               if (!is_kill(n->instr))
+                       continue;
+
+               if (!chosen || (chosen->max_delay < n->max_delay))
+                       chosen = n;
+       }
+
+       if (chosen) {
+               di(chosen->instr, "csp: chose (kill, hard ready)");
+               return chosen->instr;
+       }
+
+       /* Next prioritize expensive instructions: */
+       foreach_sched_node (n, &ctx->dag->heads) {
+               unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+
+               if (d > 0)
+                       continue;
+
+               if (!(is_sfu(n->instr) || is_tex(n->instr)))
+                       continue;
+
+               if (!chosen || (chosen->max_delay < n->max_delay))
+                       chosen = n;
+       }
+
+       if (chosen) {
+               di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
+               return chosen->instr;
+       }
+
+       /* Then try to find a ready leader w/ soft delay (ie. including extra
+        * delay for things like tex fetch which can be synchronized w/ the
+        * sync bit, but where we probably do want to schedule some other
+        * instructions while we wait).
+        */
+       foreach_sched_node (n, &ctx->dag->heads) {
+               unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
+
+               if (d > 0)
+                       continue;
+
+               if (!chosen || (chosen->max_delay < n->max_delay))
+                       chosen = n;
+       }
+
+       if (chosen) {
+               di(chosen->instr, "csp: chose (soft ready)");
+               return chosen->instr;
+       }
+
+       /* Next try to find a ready leader that can be scheduled without nop's,
+        * which in the case of things that need (sy)/(ss) could result in
+        * stalls.. but we've already decided there is not a better option.
+        */
+       foreach_sched_node (n, &ctx->dag->heads) {
+               unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+
+               if (d > 0)
+                       continue;
+
+               if (!chosen || (chosen->max_delay < n->max_delay))
+                       chosen = n;
+       }
+
+       if (chosen) {
+               di(chosen->instr, "csp: chose (hard ready)");
+               return chosen->instr;
+       }
+
+       /* Otherwise choose the leader with the maximum max_delay:
+        *
+        * TODO should we try to balance cost and delays?  I guess it is
+        * a balance between now-nop's and future-nop's?
+        */
+       foreach_sched_node (n, &ctx->dag->heads) {
+               if (!chosen || chosen->max_delay < n->max_delay)
+                       chosen = n;
+       }
+
+       if (chosen) {
+               di(chosen->instr, "csp: chose (leader)");
+               return chosen->instr;
+       }
+
+       return NULL;
+}
+
+struct ir3_postsched_deps_state {
+       struct ir3_context *ctx;
+
+       enum { F, R } direction;
+
+       bool merged;
+
+       /* Track, for each register, the sched node (instruction) that last
+        * wrote it (in whichever direction we are iterating the block).
+        *
+        * Note, this table is twice as big as the # of regs, to deal with
+        * half-precision regs.  The approach differs depending on whether
+        * the half and full precision register files are "merged" (ie.
+        * conflicting, a6xx+), in which case we treat each full precision
+        * dep as two half-precision dependencies, vs. older GPUs with
+        * separate (non-conflicting) files, in which case the first half
+        * of the table is used for full precision and the 2nd half for
+        * half-precision.
+        */
+       struct ir3_postsched_node *regs[2 * 256];
+};
+
+/* bounds checking read/write accessors, since OoB access to stuff on
+ * the stack is gonna cause a bad day.
+ */
+#define dep_reg(state, idx) *({ \
+               assert((idx) < ARRAY_SIZE((state)->regs)); \
+               &(state)->regs[(idx)]; \
+       })
+
+static void
+add_dep(struct ir3_postsched_deps_state *state,
+               struct ir3_postsched_node *before,
+               struct ir3_postsched_node *after)
+{
+       if (!before || !after)
+               return;
+
+       assert(before != after);
+
+       if (state->direction == F) {
+               dag_add_edge(&before->dag, &after->dag, NULL);
+       } else {
+               dag_add_edge(&after->dag, &before->dag, NULL);
+       }
+}
+
+static void
+add_single_reg_dep(struct ir3_postsched_deps_state *state,
+               struct ir3_postsched_node *node, unsigned num, bool write)
+{
+       add_dep(state, dep_reg(state, num), node);
+       if (write) {
+               dep_reg(state, num) = node;
+       }
+}
+
+/* This is where we handle full vs half-precision, and potential conflicts
+ * between half and full precision that result in additional dependencies.
+ * The 'reg' arg is really just used to know half vs full precision.
+ */
+static void
+add_reg_dep(struct ir3_postsched_deps_state *state,
+               struct ir3_postsched_node *node, const struct ir3_register *reg,
+               unsigned num, bool write)
+{
+       if (state->merged) {
+               if (reg->flags & IR3_REG_HALF) {
+                       /* single conflict in half-reg space: */
+                       add_single_reg_dep(state, node, num, write);
+               } else {
+                       /* two conflicts in half-reg space: */
+                       add_single_reg_dep(state, node, 2 * num + 0, write);
+                       add_single_reg_dep(state, node, 2 * num + 1, write);
+               }
+       } else {
+               if (reg->flags & IR3_REG_HALF)
+                       num += ARRAY_SIZE(state->regs) / 2;
+               add_single_reg_dep(state, node, num, write);
+       }
+}
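+
+/* A worked example of the indexing above (illustrative values, taking
+ * r1.y, ie. num = regid(1, 1) = 5): with a merged register file a full
+ * precision write marks half-reg slots 10 and 11, the same slots that
+ * hr2.z/hr2.w map to directly, so the conflict is captured.  With
+ * separate register files the full precision write only marks slot 5,
+ * while hr1.y would land at slot 5 + 256 in the second half of the table.
+ */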
+
+static void
+calculate_deps(struct ir3_postsched_deps_state *state,
+               struct ir3_postsched_node *node)
+{
+       static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
+       struct ir3_register *reg;
+       int b;
+
+       /* Add dependencies on instructions that previously (or next,
+        * in the reverse direction) wrote any of our src registers:
+        */
+       foreach_src_n (reg, i, node->instr) {
+               /* NOTE: relative access for a src can be either const or gpr: */
+               if (reg->flags & IR3_REG_RELATIV) {
+                       /* also reads a0.x: */
+                       add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
+               }
+
+               if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+                       continue;
+
+               if (reg->flags & IR3_REG_RELATIV) {
+                       /* mark entire array as read: */
+                       struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
+                       for (unsigned i = 0; i < arr->length; i++) {
+                               add_reg_dep(state, node, reg, arr->reg + i, false);
+                       }
+               } else {
+                       foreach_bit (b, reg->wrmask) {
+                               add_reg_dep(state, node, reg, reg->num + b, false);
+
+                               struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
+                               if (dep && (state->direction == F)) {
+                                       unsigned d = ir3_delayslots(dep->instr, node->instr, i);
+                                       node->delay = MAX2(node->delay, d);
+                               }
+                       }
+               }
+       }
+
+       if (dest_regs(node->instr) == 0)
+               return;
+
+       /* And then update the state for what this instruction
+        * wrote:
+        */
+       reg = node->instr->regs[0];
+       if (reg->flags & IR3_REG_RELATIV) {
+               /* mark the entire array as written: */
+               struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
+               for (unsigned i = 0; i < arr->length; i++) {
+                       add_reg_dep(state, node, reg, arr->reg + i, true);
+               }
+
+               /* also reads a0.x: */
+               add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
+       } else {
+               foreach_bit (b, reg->wrmask) {
+                       add_reg_dep(state, node, reg, reg->num + b, true);
+               }
+       }
+}
+
+static void
+calculate_forward_deps(struct ir3_postsched_ctx *ctx)
+{
+       struct ir3_postsched_deps_state state = {
+                       .ctx = ctx->ctx,
+                       .direction = F,
+                       .merged = ctx->ctx->compiler->gpu_id >= 600,
+       };
+
+       foreach_instr (instr, &ctx->unscheduled_list) {
+               calculate_deps(&state, instr->data);
+       }
+}
+
+static void
+calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
+{
+       struct ir3_postsched_deps_state state = {
+                       .ctx = ctx->ctx,
+                       .direction = R,
+                       .merged = ctx->ctx->compiler->gpu_id >= 600,
+       };
+
+       foreach_instr_rev (instr, &ctx->unscheduled_list) {
+               calculate_deps(&state, instr->data);
+       }
+}
+
+static void
+sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
+{
+       struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);
+
+       dag_init_node(ctx->dag, &n->dag);
+
+       n->instr = instr;
+       instr->data = n;
+}
+
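+/* Accumulate, bottom-up, the delay along the longest path from each node
+ * to a DAG leaf; choose_instr() uses the result (max_delay) as its
+ * critical-path priority.
+ */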
+static void
+sched_dag_max_delay_cb(struct dag_node *node, void *state)
+{
+       struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
+       uint32_t max_delay = 0;
+
+       util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+               struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
+               max_delay = MAX2(child->max_delay, max_delay);
+       }
+
+       n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
+}
+
+static void
+sched_dag_init(struct ir3_postsched_ctx *ctx)
+{
+       ctx->mem_ctx = ralloc_context(NULL);
+
+       ctx->dag = dag_create(ctx->mem_ctx);
+
+       foreach_instr (instr, &ctx->unscheduled_list)
+               sched_node_init(ctx, instr);
+
+       calculate_forward_deps(ctx);
+       calculate_reverse_deps(ctx);
+
+       /*
+        * Normal srcs won't be in SSA at this point; those are dealt with in
+        * calculate_forward_deps() and calculate_reverse_deps().  But we still
+        * have the false-dep information in SSA form, so go ahead and add
+        * dependencies for that here:
+        */
+       foreach_instr (instr, &ctx->unscheduled_list) {
+               struct ir3_postsched_node *n = instr->data;
+               struct ir3_instruction *src;
+
+               foreach_ssa_src_n (src, i, instr) {
+                       /* don't consider dependencies in other blocks: */
+                       if (src->block != instr->block)
+                               continue;
+
+                       /* we can end up with unused false-deps.. just skip them: */
+                       if (src->flags & IR3_INSTR_UNUSED)
+                               continue;
+
+                       struct ir3_postsched_node *sn = src->data;
+
+                       dag_add_edge(&sn->dag, &n->dag, NULL);
+               }
+       }
+
+       // TODO do we want to do this after reverse-dependencies?
+       dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
+}
+
+static void
+sched_dag_destroy(struct ir3_postsched_ctx *ctx)
+{
+       ralloc_free(ctx->mem_ctx);
+       ctx->mem_ctx = NULL;
+       ctx->dag = NULL;
+}
+
+static void
+sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
+{
+       ctx->block = block;
+       ctx->scheduled = NULL;
+       ctx->pred = NULL;
+
+       /* move all instructions to the unscheduled list, and
+        * empty the block's instruction list (to which we will
+        * be inserting).
+        */
+       list_replace(&block->instr_list, &ctx->unscheduled_list);
+       list_inithead(&block->instr_list);
+
+       // TODO once we are using post-sched for everything we can
+       // just not stick in NOP's prior to post-sched, and drop this.
+       // for now keep this, since it makes post-sched optional:
+       foreach_instr_safe (instr, &ctx->unscheduled_list) {
+               switch (instr->opc) {
+               case OPC_NOP:
+               case OPC_BR:
+               case OPC_JUMP:
+                       list_delinit(&instr->node);
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       sched_dag_init(ctx);
+
+       /* First schedule all meta:input instructions, followed by
+        * tex-prefetch.  We want all of the instructions that load
+        * values into registers before the shader starts executing
+        * to come before any other instructions.  But in particular
+        * we want inputs to come before prefetches.  This is because
+        * an FS's bary_ij input may not actually be live in the
+        * shader, but it should not be scheduled on top of any
+        * other input (though it can be overwritten by a tex prefetch).
+        */
+       foreach_instr_safe (instr, &ctx->unscheduled_list)
+               if (instr->opc == OPC_META_INPUT)
+                       schedule(ctx, instr);
+
+       foreach_instr_safe (instr, &ctx->unscheduled_list)
+               if (instr->opc == OPC_META_TEX_PREFETCH)
+                       schedule(ctx, instr);
+
+       while (!list_is_empty(&ctx->unscheduled_list)) {
+               struct ir3_instruction *instr;
+
+               instr = choose_instr(ctx);
+
+               /* this shouldn't happen: */
+               if (!instr) {
+                       ctx->error = true;
+                       break;
+               }
+
+               unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
+               d("delay=%u", delay);
+
+               /* and if we run out of instructions that can be scheduled,
+                * then it is time for nop's:
+                */
+               debug_assert(delay <= 6);
+               while (delay > 0) {
+                       ir3_NOP(block);
+                       delay--;
+               }
+
+               schedule(ctx, instr);
+       }
+
+       sched_dag_destroy(ctx);
+}
+
+
+static bool
+is_self_mov(struct ir3_instruction *instr)
+{
+       if (!is_same_type_mov(instr))
+               return false;
+
+       if (instr->regs[0]->num != instr->regs[1]->num)
+               return false;
+
+       if (instr->regs[0]->flags & IR3_REG_RELATIV)
+               return false;
+
+       if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
+                       IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
+                       IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
+                       IR3_REG_EVEN | IR3_REG_POS_INF))
+               return false;
+
+       return true;
+}
+
+/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
+ * as a result of cases where before RA we are not sure that it is
+ * safe to eliminate them.  We could eliminate these earlier, but
+ * sometimes they are tangled up in false-dep's, etc, so it is easier
+ * just to let them exist until after RA.
+ */
+static void
+cleanup_self_movs(struct ir3 *ir)
+{
+       foreach_block (block, &ir->block_list) {
+               foreach_instr_safe (instr, &block->instr_list) {
+                       struct ir3_register *reg;
+
+                       foreach_src (reg, instr) {
+                               if (!reg->instr)
+                                       continue;
+
+                               if (is_self_mov(reg->instr)) {
+                                       list_delinit(&reg->instr->node);
+                                       reg->instr = reg->instr->regs[1]->instr;
+                               }
+                       }
+
+                       for (unsigned i = 0; i < instr->deps_count; i++) {
+                               if (is_self_mov(instr->deps[i])) {
+                                       list_delinit(&instr->deps[i]->node);
+                                       instr->deps[i] = instr->deps[i]->regs[1]->instr;
+                               }
+                       }
+               }
+       }
+}
+
+int
+ir3_postsched(struct ir3_context *cctx)
+{
+       struct ir3_postsched_ctx ctx = {
+                       .ctx = cctx,
+       };
+
+       ir3_remove_nops(cctx->ir);
+       cleanup_self_movs(cctx->ir);
+
+       foreach_block (block, &cctx->ir->block_list) {
+               sched_block(&ctx, block);
+       }
+
+       if (ctx.error)
+               return -1;
+
+       return 0;
+}
index 823968a5cec7d5df6b5b77df706f6bc7e2d61809..0d1028a76d76b2cfa63c57cb1b2fb783516e438e 100644 (file)
@@ -385,14 +385,11 @@ size_to_class(unsigned sz, bool half, bool high)
 static bool
 writes_gpr(struct ir3_instruction *instr)
 {
-       if (is_store(instr))
-               return false;
-       if (instr->regs_count == 0)
+       if (dest_regs(instr) == 0)
                return false;
        /* is dest a normal temp register: */
        struct ir3_register *reg = instr->regs[0];
-       if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-               return false;
+       debug_assert(!(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)));
        if ((reg->num == regid(REG_A0, 0)) ||
                        (reg->num == regid(REG_P0, 0)))
                return false;
index 0c2cb21930ad970f9d27441866cd6257e35f0276..89ec273034e75809cab66f127d0ed1049f646d3c 100644 (file)
@@ -71,6 +71,7 @@ libfreedreno_ir3_files = files(
   'ir3_nir_lower_tex_prefetch.c',
   'ir3_nir_lower_tg4_to_tex.c',
   'ir3_nir_move_varying_inputs.c',
+  'ir3_postsched.c',
   'ir3_print.c',
   'ir3_ra.c',
   'ir3_sched.c',