From 0f78c32492ed096649b015a4967d6d56c18dd14a Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 13 Dec 2019 14:09:39 -0800 Subject: [PATCH] freedreno/ir3: post-RA sched pass After RA, we can schedule to increase parallelism (reduce nop's) without worrying about increasing register pressure. This pass lets us cut down the instruction count ~10%, and prioritize bary.f, kill, etc, which would tend to increase register pressure if we tried to do that before RA. It should be more useful if RA round-robin'd register choices. Signed-off-by: Rob Clark Part-of: --- src/freedreno/Makefile.sources | 1 + src/freedreno/ir3/ir3.h | 3 + src/freedreno/ir3/ir3_compiler_nir.c | 3 + src/freedreno/ir3/ir3_postsched.c | 669 +++++++++++++++++++++++++++ src/freedreno/ir3/ir3_ra.c | 7 +- src/freedreno/ir3/meson.build | 1 + 6 files changed, 679 insertions(+), 5 deletions(-) create mode 100644 src/freedreno/ir3/ir3_postsched.c diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources index fb0b9f2159f..c67fa375e03 100644 --- a/src/freedreno/Makefile.sources +++ b/src/freedreno/Makefile.sources @@ -44,6 +44,7 @@ ir3_SOURCES := \ ir3/ir3_nir_lower_tex_prefetch.c \ ir3/ir3_nir_lower_tg4_to_tex.c \ ir3/ir3_nir_move_varying_inputs.c \ + ir3/ir3_postsched.c \ ir3/ir3_print.c \ ir3/ir3_ra.c \ ir3/ir3_sched.c \ diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index b10f2f0da79..f78a7d6f364 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1138,6 +1138,9 @@ void ir3_sun(struct ir3 *ir); void ir3_sched_add_deps(struct ir3 *ir); int ir3_sched(struct ir3 *ir); +struct ir3_context; +int ir3_postsched(struct ir3_context *ctx); + bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so); /* register assignment: */ diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 13052648814..52a049a4123 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3481,6 +3481,9 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_debug_print(ir, "AFTER RA"); + ir3_postsched(ctx); + ir3_debug_print(ir, "AFTER POSTSCHED"); + if (compiler->gpu_id >= 600) { if (ir3_a6xx_fixup_atomic_dests(ir, so)) { ir3_debug_print(ir, "AFTER ATOMIC FIXUP"); diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c new file mode 100644 index 00000000000..4290e882249 --- /dev/null +++ b/src/freedreno/ir3/ir3_postsched.c @@ -0,0 +1,669 @@ +/* + * Copyright (C) 2019 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + + +#include "util/dag.h" +#include "util/u_math.h" + +#include "ir3.h" +#include "ir3_compiler.h" +#include "ir3_context.h" + +#ifdef DEBUG +#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS) +#else +#define SCHED_DEBUG 0 +#endif +#define d(fmt, ...) do { if (SCHED_DEBUG) { \ + printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \ +} } while (0) + +#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \ + printf("PSCHED: "fmt": ", ##__VA_ARGS__); \ + ir3_print_instr(instr); \ +} } while (0) + +/* + * Post RA Instruction Scheduling + */ + +struct ir3_postsched_ctx { + struct ir3_context *ctx; + + void *mem_ctx; + struct ir3_block *block; /* the current block */ + struct dag *dag; + + struct list_head unscheduled_list; /* unscheduled instructions */ + struct ir3_instruction *scheduled; /* last scheduled instr */ + struct ir3_instruction *pred; /* current p0.x user, if any */ + + bool error; +}; + +struct ir3_postsched_node { + struct dag_node dag; /* must be first for util_dynarray_foreach */ + struct ir3_instruction *instr; + bool partially_evaluated_path; + + unsigned delay; + unsigned max_delay; +}; + +#define foreach_sched_node(__n, __list) \ + list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link) + +#define foreach_bit(b, mask) \ + for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});) + +// TODO deduplicate +static bool is_sfu_or_mem(struct ir3_instruction *instr) +{ + return is_sfu(instr) || is_mem(instr); +} + +static void +schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr) +{ + debug_assert(ctx->block == instr->block); + + /* maybe there is a better way to handle this than just stuffing + * a nop.. ideally we'd know about this constraint in the + * scheduling and depth calculation.. + */ + if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr)) + ir3_NOP(ctx->block); + + /* remove from unscheduled_list: + */ + list_delinit(&instr->node); + + if (writes_pred(instr)) { + ctx->pred = instr; + } + + di(instr, "schedule"); + + list_addtail(&instr->node, &instr->block->instr_list); + ctx->scheduled = instr; + + struct ir3_postsched_node *n = instr->data; + dag_prune_head(ctx->dag, &n->dag); +} + +static void +dump_state(struct ir3_postsched_ctx *ctx) +{ + if (!SCHED_DEBUG) + return; + + foreach_sched_node (n, &ctx->dag->heads) { + di(n->instr, "maxdel=%3d ", n->max_delay); + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + struct ir3_postsched_node *child = + (struct ir3_postsched_node *)edge->child; + + di(child->instr, " -> (%d parents) ", child->dag.parent_count); + } + } +} + +/* find instruction to schedule: */ +static struct ir3_instruction * +choose_instr(struct ir3_postsched_ctx *ctx) +{ + struct ir3_postsched_node *chosen = NULL; + + dump_state(ctx); + + foreach_sched_node (n, &ctx->dag->heads) { + if (!is_meta(n->instr)) + continue; + + if (!chosen || (chosen->max_delay < n->max_delay)) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "prio: chose (meta)"); + return chosen->instr; + } + + /* Try to schedule inputs with a higher priority, if possible, as + * the last bary.f unlocks varying storage to unblock more VS + * warps. + */ + foreach_sched_node (n, &ctx->dag->heads) { + if (!is_input(n->instr)) + continue; + + if (!chosen || (chosen->max_delay < n->max_delay)) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "prio: chose (input)"); + return chosen->instr; + } + + /* Next prioritize discards: */ + foreach_sched_node (n, &ctx->dag->heads) { + unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false); + + if (d > 0) + continue; + + if (!is_kill(n->instr)) + continue; + + if (!chosen || (chosen->max_delay < n->max_delay)) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "csp: chose (kill, hard ready)"); + return chosen->instr; + } + + /* Next prioritize expensive instructions: */ + foreach_sched_node (n, &ctx->dag->heads) { + unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false); + + if (d > 0) + continue; + + if (!(is_sfu(n->instr) || is_tex(n->instr))) + continue; + + if (!chosen || (chosen->max_delay < n->max_delay)) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "csp: chose (sfu/tex, hard ready)"); + return chosen->instr; + } + + /* First try to find a ready leader w/ soft delay (ie. including extra + * delay for things like tex fetch which can be synchronized w/ sync + * bit (but we probably do want to schedule some other instructions + * while we wait) + */ + foreach_sched_node (n, &ctx->dag->heads) { + unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false); + + if (d > 0) + continue; + + if (!chosen || (chosen->max_delay < n->max_delay)) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "csp: chose (soft ready)"); + return chosen->instr; + } + + /* Next try to find a ready leader that can be scheduled without nop's, + * which in the case of things that need (sy)/(ss) could result in + * stalls.. but we've already decided there is not a better option. + */ + foreach_sched_node (n, &ctx->dag->heads) { + unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false); + + if (d > 0) + continue; + + if (!chosen || (chosen->max_delay < n->max_delay)) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "csp: chose (hard ready)"); + return chosen->instr; + } + + /* Otherwise choose leader with maximum cost: + * + * TODO should we try to balance cost and delays? I guess it is + * a balance between now-nop's and future-nop's? + */ + foreach_sched_node (n, &ctx->dag->heads) { + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + + if (chosen) { + di(chosen->instr, "csp: chose (leader)"); + return chosen->instr; + } + + return NULL; +} + +struct ir3_postsched_deps_state { + struct ir3_context *ctx; + + enum { F, R } direction; + + bool merged; + + /* Track the mapping between sched node (instruction) that last + * wrote a given register (in whichever direction we are iterating + * the block) + * + * Note, this table is twice as big as the # of regs, to deal with + * half-precision regs. The approach differs depending on whether + * the half and full precision register files are "merged" (conflict, + * ie. a6xx+) in which case we consider each full precision dep + * as two half-precision dependencies, vs older separate (non- + * conflicting) in which case the first half of the table is used + * for full precision and 2nd half for half-precision. + */ + struct ir3_postsched_node *regs[2 * 256]; +}; + +/* bounds checking read/write accessors, since OoB access to stuff on + * the stack is gonna cause a bad day. + */ +#define dep_reg(state, idx) *({ \ + assert((idx) < ARRAY_SIZE((state)->regs)); \ + &(state)->regs[(idx)]; \ + }) + +static void +add_dep(struct ir3_postsched_deps_state *state, + struct ir3_postsched_node *before, + struct ir3_postsched_node *after) +{ + if (!before || !after) + return; + + assert(before != after); + + if (state->direction == F) { + dag_add_edge(&before->dag, &after->dag, NULL); + } else { + dag_add_edge(&after->dag, &before->dag, NULL); + } +} + +static void +add_single_reg_dep(struct ir3_postsched_deps_state *state, + struct ir3_postsched_node *node, unsigned num, bool write) +{ + add_dep(state, dep_reg(state, num), node); + if (write) { + dep_reg(state, num) = node; + } +} + +/* This is where we handled full vs half-precision, and potential conflicts + * between half and full precision that result in additional dependencies. + * The 'reg' arg is really just to know half vs full precision. + */ +static void +add_reg_dep(struct ir3_postsched_deps_state *state, + struct ir3_postsched_node *node, const struct ir3_register *reg, + unsigned num, bool write) +{ + if (state->merged) { + if (reg->flags & IR3_REG_HALF) { + /* single conflict in half-reg space: */ + add_single_reg_dep(state, node, num, write); + } else { + /* two conflicts in half-reg space: */ + add_single_reg_dep(state, node, 2 * num + 0, write); + add_single_reg_dep(state, node, 2 * num + 1, write); + } + } else { + if (reg->flags & IR3_REG_HALF) + num += ARRAY_SIZE(state->regs) / 2; + add_single_reg_dep(state, node, num, write); + } +} + +static void +calculate_deps(struct ir3_postsched_deps_state *state, + struct ir3_postsched_node *node) +{ + static const struct ir3_register half_reg = { .flags = IR3_REG_HALF }; + struct ir3_register *reg; + int b; + + /* Add dependencies on instructions that previously (or next, + * in the reverse direction) wrote any of our src registers: + */ + foreach_src_n (reg, i, node->instr) { + /* NOTE: relative access for a src can be either const or gpr: */ + if (reg->flags & IR3_REG_RELATIV) { + /* also reads a0.x: */ + add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false); + } + + if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + continue; + + if (reg->flags & IR3_REG_RELATIV) { + /* mark entire array as read: */ + struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id); + for (unsigned i = 0; i < arr->length; i++) { + add_reg_dep(state, node, reg, arr->reg + i, false); + } + } else { + foreach_bit (b, reg->wrmask) { + add_reg_dep(state, node, reg, reg->num + b, false); + + struct ir3_postsched_node *dep = dep_reg(state, reg->num + b); + if (dep && (state->direction == F)) { + unsigned d = ir3_delayslots(dep->instr, node->instr, i); + node->delay = MAX2(node->delay, d); + } + } + } + } + + if (dest_regs(node->instr) == 0) + return; + + /* And then after we update the state for what this instruction + * wrote: + */ + reg = node->instr->regs[0]; + if (reg->flags & IR3_REG_RELATIV) { + /* mark the entire array as written: */ + struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id); + for (unsigned i = 0; i < arr->length; i++) { + add_reg_dep(state, node, reg, arr->reg + i, true); + } + + /* also reads a0.x: */ + add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false); + } else { + foreach_bit (b, reg->wrmask) { + add_reg_dep(state, node, reg, reg->num + b, true); + } + } +} + +static void +calculate_forward_deps(struct ir3_postsched_ctx *ctx) +{ + struct ir3_postsched_deps_state state = { + .ctx = ctx->ctx, + .direction = F, + .merged = ctx->ctx->compiler->gpu_id >= 600, + }; + + foreach_instr (instr, &ctx->unscheduled_list) { + calculate_deps(&state, instr->data); + } +} + +static void +calculate_reverse_deps(struct ir3_postsched_ctx *ctx) +{ + struct ir3_postsched_deps_state state = { + .ctx = ctx->ctx, + .direction = R, + .merged = ctx->ctx->compiler->gpu_id >= 600, + }; + + foreach_instr_rev (instr, &ctx->unscheduled_list) { + calculate_deps(&state, instr->data); + } +} + +static void +sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr) +{ + struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node); + + dag_init_node(ctx->dag, &n->dag); + + n->instr = instr; + instr->data = n; +} + +static void +sched_dag_max_delay_cb(struct dag_node *node, void *state) +{ + struct ir3_postsched_node *n = (struct ir3_postsched_node *)node; + uint32_t max_delay = 0; + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child; + max_delay = MAX2(child->max_delay, max_delay); + } + + n->max_delay = MAX2(n->max_delay, max_delay + n->delay); +} + +static void +sched_dag_init(struct ir3_postsched_ctx *ctx) +{ + ctx->mem_ctx = ralloc_context(NULL); + + ctx->dag = dag_create(ctx->mem_ctx); + + foreach_instr (instr, &ctx->unscheduled_list) + sched_node_init(ctx, instr); + + calculate_forward_deps(ctx); + calculate_reverse_deps(ctx); + + /* + * Normal srcs won't be in SSA at this point, those are dealt with in + * calculate_forward_deps() and calculate_reverse_deps(). But we still + * have the false-dep information in SSA form, so go ahead and add + * dependencies for that here: + */ + foreach_instr (instr, &ctx->unscheduled_list) { + struct ir3_postsched_node *n = instr->data; + struct ir3_instruction *src; + + foreach_ssa_src_n (src, i, instr) { + if (src->block != instr->block) + continue; + + /* we can end up with unused false-deps.. just skip them: */ + if (src->flags & IR3_INSTR_UNUSED) + continue; + + struct ir3_postsched_node *sn = src->data; + + /* don't consider dependencies in other blocks: */ + if (src->block != instr->block) + continue; + + dag_add_edge(&sn->dag, &n->dag, NULL); + } + } + + // TODO do we want to do this after reverse-dependencies? + dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL); +} + +static void +sched_dag_destroy(struct ir3_postsched_ctx *ctx) +{ + ralloc_free(ctx->mem_ctx); + ctx->mem_ctx = NULL; + ctx->dag = NULL; +} + +static void +sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block) +{ + ctx->block = block; + ctx->scheduled = NULL; + ctx->pred = NULL; + + /* move all instructions to the unscheduled list, and + * empty the block's instruction list (to which we will + * be inserting). + */ + list_replace(&block->instr_list, &ctx->unscheduled_list); + list_inithead(&block->instr_list); + + // TODO once we are using post-sched for everything we can + // just not stick in NOP's prior to post-sched, and drop this. + // for now keep this, since it makes post-sched optional: + foreach_instr_safe (instr, &ctx->unscheduled_list) { + switch (instr->opc) { + case OPC_NOP: + case OPC_BR: + case OPC_JUMP: + list_delinit(&instr->node); + break; + default: + break; + } + } + + sched_dag_init(ctx); + + /* First schedule all meta:input instructions, followed by + * tex-prefetch. We want all of the instructions that load + * values into registers before the shader starts to go + * before any other instructions. But in particular we + * want inputs to come before prefetches. This is because + * a FS's bary_ij input may not actually be live in the + * shader, but it should not be scheduled on top of any + * other input (but can be overwritten by a tex prefetch) + */ + foreach_instr_safe (instr, &ctx->unscheduled_list) + if (instr->opc == OPC_META_INPUT) + schedule(ctx, instr); + + foreach_instr_safe (instr, &ctx->unscheduled_list) + if (instr->opc == OPC_META_TEX_PREFETCH) + schedule(ctx, instr); + + while (!list_is_empty(&ctx->unscheduled_list)) { + struct ir3_instruction *instr; + + instr = choose_instr(ctx); + + /* this shouldn't happen: */ + if (!instr) { + ctx->error = true; + break; + } + + unsigned delay = ir3_delay_calc(ctx->block, instr, false, false); + d("delay=%u", delay); + + /* and if we run out of instructions that can be scheduled, + * then it is time for nop's: + */ + debug_assert(delay <= 6); + while (delay > 0) { + ir3_NOP(block); + delay--; + } + + schedule(ctx, instr); + } + + sched_dag_destroy(ctx); +} + + +static bool +is_self_mov(struct ir3_instruction *instr) +{ + if (!is_same_type_mov(instr)) + return false; + + if (instr->regs[0]->num != instr->regs[1]->num) + return false; + + if (instr->regs[0]->flags & IR3_REG_RELATIV) + return false; + + if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED | + IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS | + IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | + IR3_REG_EVEN | IR3_REG_POS_INF)) + return false; + + return true; +} + +/* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y + * as a result of places were before RA we are not sure that it is + * safe to eliminate. We could eliminate these earlier, but sometimes + * they are tangled up in false-dep's, etc, so it is easier just to + * let them exist until after RA + */ +static void +cleanup_self_movs(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + struct ir3_register *reg; + + foreach_src (reg, instr) { + if (!reg->instr) + continue; + + if (is_self_mov(reg->instr)) { + list_delinit(®->instr->node); + reg->instr = reg->instr->regs[1]->instr; + } + } + + for (unsigned i = 0; i < instr->deps_count; i++) { + if (is_self_mov(instr->deps[i])) { + list_delinit(&instr->deps[i]->node); + instr->deps[i] = instr->deps[i]->regs[1]->instr; + } + } + } + } +} + +int +ir3_postsched(struct ir3_context *cctx) +{ + struct ir3_postsched_ctx ctx = { + .ctx = cctx, + }; + + ir3_remove_nops(cctx->ir); + cleanup_self_movs(cctx->ir); + + foreach_block (block, &cctx->ir->block_list) { + sched_block(&ctx, block); + } + + if (ctx.error) + return -1; + + return 0; +} diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 823968a5cec..0d1028a76d7 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -385,14 +385,11 @@ size_to_class(unsigned sz, bool half, bool high) static bool writes_gpr(struct ir3_instruction *instr) { - if (is_store(instr)) - return false; - if (instr->regs_count == 0) + if (dest_regs(instr) == 0) return false; /* is dest a normal temp register: */ struct ir3_register *reg = instr->regs[0]; - if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) - return false; + debug_assert(!(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))); if ((reg->num == regid(REG_A0, 0)) || (reg->num == regid(REG_P0, 0))) return false; diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index 0c2cb21930a..89ec273034e 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -71,6 +71,7 @@ libfreedreno_ir3_files = files( 'ir3_nir_lower_tex_prefetch.c', 'ir3_nir_lower_tg4_to_tex.c', 'ir3_nir_move_varying_inputs.c', + 'ir3_postsched.c', 'ir3_print.c', 'ir3_ra.c', 'ir3_sched.c', -- 2.30.2