-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
/*
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
*
return d;
}
+/**
+ * @block: the block to search in, starting from the end; in the first
+ * pass, this will be the block the instruction would be inserted
+ * into (but has not been yet, ie. it only contains already
+ * scheduled instructions). For intra-block scheduling (second
+ * pass), this would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd: max distance, bail after searching this # of instruction
+ * slots, since it means the instruction we are looking for is
+ * far enough away
+ * @pred: if true, recursively search into predecessor blocks to
+ * find the worst case (shortest) distance (only possible after
+ * individual blocks are all scheduled)
+ */
static unsigned
-distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
- unsigned maxd)
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+ unsigned maxd, bool pred)
{
- struct list_head *instr_list = &ctx->block->instr_list;
unsigned d = 0;
- list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+ list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
if ((n == instr) || (d >= maxd))
- break;
- if (is_alu(n) || is_flow(n))
+ return d;
+ /* NOTE: don't count branch/jump since we don't know yet if they will
+ * be eliminated later in resolve_jumps().. really should do that
+ * earlier so we don't have this constraint.
+ */
+ if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
d++;
}
+ /* we didn't find instr in this block, so it must come from a
+ * predecessor block.. in the first pass just assume it is scheduled
+ * far enough away, sched_intra_block() will fix things up later:
+ */
+ if (!pred)
+ return maxd;
+
+ if (block->data != block) {
+ /* Search into predecessor blocks, finding the one with the
+ * shortest distance, since that will be the worst case
+ */
+ unsigned min = maxd - d;
+
+ /* (ab)use block->data to prevent recursion: */
+ block->data = block;
+
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ unsigned n;
+
+ n = distance(block->predecessors[i], instr, min, pred);
+
+ min = MIN2(min, n);
+ }
+
+ block->data = NULL;
+ d += min;
+ }
+
return d;
}
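+
+/* For example (hypothetical instruction sequence), if the block being
+ * searched ends in:
+ *
+ *    mul.f r0.x, r1.x, r2.x
+ *    add.f r1.x, r2.x, r3.x
+ *    nop
+ *
+ * then distance(block, <the mul.f>, 6, false) counts the nop and the
+ * add.f (neither is a jump/branch) and returns 2 when the backwards
+ * walk reaches the mul.f itself.
+ */
+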
/* calculate delay for specified src: */
static unsigned
-delay_calc_srcn(struct ir3_sched_ctx *ctx,
+delay_calc_srcn(struct ir3_block *block,
struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned srcn)
+ struct ir3_instruction *consumer,
+ unsigned srcn, bool soft, bool pred)
{
unsigned delay = 0;
struct ir3_instruction *src;
foreach_ssa_src(src, assigner) {
unsigned d;
- if (src->block != assigner->block)
- break;
- d = delay_calc_srcn(ctx, src, consumer, srcn);
+ d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
delay = MAX2(delay, d);
}
} else {
- delay = ir3_delayslots(assigner, consumer, srcn);
- delay -= distance(ctx, assigner, delay);
+ if (soft && is_sfu(assigner)) {
+ /* sfu results are synchronized with the (ss) bit rather than
+ * delay slots, but the stall is not free.. charge a nominal
+ * cost so we prefer to fill the gap with other work:
+ */
+ delay = 4;
+ } else {
+ delay = ir3_delayslots(assigner, consumer, srcn);
+ }
+ delay -= distance(block, assigner, delay, pred);
}
return delay;
}
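+
+/* e.g. with soft=true, an rsq (sfu) result feeding an add.f would be
+ * charged 4 slots, even though correctness is handled by the (ss)
+ * sync bit rather than delay slots; with soft=false only the real
+ * delay-slot requirement from ir3_delayslots() counts. (Hypothetical
+ * pairing, just to illustrate the two cost models.)
+ */
+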
/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
-delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+ bool soft, bool pred)
{
unsigned delay = 0;
struct ir3_instruction *src;
foreach_ssa_src_n(src, i, instr) {
unsigned d;
- /* for array writes, no need to delay on previous write: */
- if (i == 0)
- continue;
- if (src->block != instr->block)
- continue;
- d = delay_calc_srcn(ctx, src, instr, i);
+ d = delay_calc_srcn(block, src, instr, i, soft, pred);
delay = MAX2(delay, d);
}
return delay;
}
/* find instruction to schedule: */
static struct ir3_instruction *
-find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+ bool soft)
{
struct ir3_instruction *best_instr = NULL;
unsigned min_delay = ~0;
if (!candidate)
continue;
- delay = delay_calc(ctx, candidate);
+ delay = delay_calc(ctx->block, candidate, soft, false);
if (delay < min_delay) {
best_instr = candidate;
min_delay = delay;
list_inithead(&block->instr_list);
list_inithead(&ctx->depth_list);
- /* first a pre-pass to schedule all meta:input/phi instructions
+ /* first a pre-pass to schedule all meta:input instructions
* (which need to appear first so that RA knows the register is
* occupied), and move remaining to depth sorted list:
*/
list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
- if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
- (instr->opc == OPC_META_PHI))) {
+ if (instr->opc == OPC_META_INPUT) {
schedule(ctx, instr);
} else {
ir3_insert_by_depth(instr, &ctx->depth_list);
struct ir3_sched_notes notes = {0};
struct ir3_instruction *instr;
- instr = find_eligible_instr(ctx, &notes);
+ /* first try to find an instruction that is ready under the "soft"
+ * cost model (ie. one that wouldn't stall waiting on a sync),
+ * falling back to anything that is legal to schedule:
+ */
+ instr = find_eligible_instr(ctx, &notes, true);
+ if (!instr)
+ instr = find_eligible_instr(ctx, &notes, false);
if (instr) {
- unsigned delay = delay_calc(ctx, instr);
+ unsigned delay = delay_calc(ctx->block, instr, false, false);
/* and if we run out of instructions that can be scheduled,
 * then it is time for nop's:
 */
debug_assert(ctx->pred);
debug_assert(block->condition);
- delay -= distance(ctx, ctx->pred, delay);
+ delay -= distance(ctx->block, ctx->pred, delay, false);
while (delay > 0) {
ir3_NOP(block);
*/
}
-/* this is needed to ensure later RA stage succeeds: */
+/* After scheduling individual blocks, we could still have cases where,
+ * on one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots before its use to be legal. We can't
+ * deal with this in the first pass, because of loops (ie. we can't
+ * ensure all predecessor blocks are already scheduled in the first
+ * pass). All we can really do at this point is stuff in extra nop's
+ * until things are legal.
+ */
static void
-sched_insert_parallel_copies(struct ir3_block *block)
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
- list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
- if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
- struct ir3_register *reg;
- foreach_src(reg, instr) {
- struct ir3_instruction *src = reg->instr;
- struct ir3_instruction *mov =
- ir3_MOV(src->block, src, TYPE_U32);
- mov->regs[0]->flags |= IR3_REG_PHI_SRC;
- mov->regs[0]->instr = instr;
- reg->instr = mov;
- }
+ /* n counts the instruction slots preceding the current instruction
+ * (including any nop's we insert):
+ */
+ unsigned n = 0;
+
+ ctx->block = block;
+
+ list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+ unsigned delay = 0;
+
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+ delay = MAX2(d, delay);
}
+
+ while (delay > n) {
+ struct ir3_instruction *nop = ir3_NOP(block);
+
+ /* move to before instr: */
+ list_delinit(&nop->node);
+ list_addtail(&nop->node, &instr->node);
+
+ n++;
+ }
+
+ /* we can bail once we hit worst case delay: */
+ if (++n > 6)
+ break;
}
}
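+
+/* e.g. (hypothetical) if a predecessor block ends with
+ *
+ *    add.f r0.x, r1.x, r2.x
+ *
+ * and this block starts with an instruction consuming r0.x, then with
+ * the usual 3 delay slots for an alu producer, delay_calc() returns 3
+ * on the first iteration (n == 0) and three nop's get stuffed in
+ * ahead of the consumer.
+ */
+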
int ir3_sched(struct ir3 *ir)
{
struct ir3_sched_ctx ctx = {0};
- list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
- sched_insert_parallel_copies(block);
- }
+
ir3_clear_mark(ir);
+
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
sched_block(&ctx, block);
}
+
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_intra_block(&ctx, block);
+ }
+
if (ctx.error)
return -1;
return 0;
}
+
+/* does instruction 'prior' need to be scheduled before 'instr'? */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+ /* TODO for dependencies that are related to a specific object, ie
+ * a specific SSBO/image/array, we could relax this constraint to
+ * make accesses to unrelated objects not depend on each other (at
+ * least as long as not declared coherent)
+ */
+ if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
+ ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
+ return true;
+ return !!(instr->barrier_class & prior->barrier_conflict);
+}
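+
+/* e.g. (assuming buffer accesses use the IR3_BARRIER_BUFFER_R/_W
+ * classes) a load with barrier_class=IR3_BARRIER_BUFFER_R and
+ * barrier_conflict=IR3_BARRIER_BUFFER_W is ordered against a store
+ * with barrier_class=IR3_BARRIER_BUFFER_W and barrier_conflict=
+ * IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W, in both directions;
+ * two loads, on the other hand, are free to be reordered.
+ */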
+
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+ struct list_head *prev = instr->node.prev;
+ struct list_head *next = instr->node.next;
+
+ /* add dependencies on previous instructions that must be scheduled
+ * prior to the current instruction
+ */
+ while (prev != &block->instr_list) {
+ struct ir3_instruction *pi =
+ LIST_ENTRY(struct ir3_instruction, prev, node);
+
+ prev = prev->prev;
+
+ if (is_meta(pi))
+ continue;
+
+ /* a prior instruction with the same barrier class conflicts with
+ * exactly the same earlier instructions that we would, so a dep
+ * on it covers everything before it transitively:
+ */
+ if (instr->barrier_class == pi->barrier_class) {
+ ir3_instr_add_dep(instr, pi);
+ break;
+ }
+
+ if (depends_on(instr, pi))
+ ir3_instr_add_dep(instr, pi);
+ }
+
+ /* add dependencies on this instruction to following instructions
+ * that must be scheduled after the current instruction:
+ */
+ while (next != &block->instr_list) {
+ struct ir3_instruction *ni =
+ LIST_ENTRY(struct ir3_instruction, next, node);
+
+ next = next->next;
+
+ if (is_meta(ni))
+ continue;
+
+ /* likewise, a later same-class instruction covers the rest: */
+ if (instr->barrier_class == ni->barrier_class) {
+ ir3_instr_add_dep(ni, instr);
+ break;
+ }
+
+ if (depends_on(ni, instr))
+ ir3_instr_add_dep(ni, instr);
+ }
+}
+
+/* before scheduling a block, we need to add any necessary false-dependencies
+ * to ensure that:
+ *
+ * (1) barriers are scheduled in the right order wrt instructions related
+ * to the barrier
+ *
+ * (2) reads that come before a write actually get scheduled before the
+ * write
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (instr->barrier_class) {
+ add_barrier_deps(block, instr);
+ }
+ }
+}
+
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ calculate_deps(block);
+ }
+}
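+
+/* expected usage from the compiler (a sketch, the exact call-site may
+ * differ):
+ *
+ *    ir3_sched_add_deps(ir);
+ *    if (ir3_sched(ir) < 0)
+ *        // handle scheduling failure
+ */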