-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
/*
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 */
#include "ir3.h"
-enum {
- SCHEDULED = -1,
- DELAYED = -2,
-};
-
/*
* Instruction Scheduling:
*
- * Using the depth sorted list from depth pass, attempt to recursively
- * schedule deepest unscheduled path. The first instruction that cannot
- * be scheduled, returns the required delay slots it needs, at which
- * point we return back up to the top and attempt to schedule by next
- * highest depth. After a sufficient number of instructions have been
- * scheduled, return back to beginning of list and start again. If you
- * reach the end of depth sorted list without being able to insert any
- * instruction, insert nop's. Repeat until no more unscheduled
- * instructions.
+ * A recursive depth based scheduling algorithm. Recursively find an
+ * eligible instruction to schedule from the deepest instruction
+ * (recursing through its unscheduled src instructions). Normally this
+ * would result in a lot of re-traversal of the same instructions, so
+ * we cache results in instr->data (and clear cached results that
+ * would no longer be valid after scheduling an instruction).
*
* There are a few special cases that need to be handled, since sched
* is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized,
 * since only one value can be live in each at a time. If scheduling
 * would deadlock on a conflicting a0.x/p0.x write, the original
 * writer is cloned (see split_addr()/split_pred() below).
 */
struct ir3_sched_ctx {
- struct ir3_instruction *scheduled; /* last scheduled instr */
+ struct ir3_block *block; /* the current block */
+ struct list_head depth_list; /* depth sorted unscheduled instrs */
+	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove */
struct ir3_instruction *addr; /* current a0.x user, if any */
struct ir3_instruction *pred; /* current p0.x user, if any */
- unsigned cnt;
bool error;
};
+static bool is_sfu_or_mem(struct ir3_instruction *instr)
+{
+ return is_sfu(instr) || is_mem(instr);
+}
+
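+/* instr->data is used to cache the result of recursing up an instr's
+ * srcs: NULL means not yet traversed, NULL_INSTR means traversed but
+ * nothing schedulable was found, anything else is the cached
+ * candidate:
+ */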
+#define NULL_INSTR ((void *)~0)
+
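+/* flush cached scheduling results which refer to 'instr' (or all
+ * cached results, if instr is NULL):
+ */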
+static void
+clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+ list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
+ if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
+ instr2->data = NULL;
+ }
+}
+
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+ debug_assert(ctx->block == instr->block);
+
+ /* maybe there is a better way to handle this than just stuffing
+ * a nop.. ideally we'd know about this constraint in the
+ * scheduling and depth calculation..
+ */
+ if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
+ ir3_NOP(ctx->block);
+
+ /* remove from depth list:
+ */
+ list_delinit(&instr->node);
+
+ if (writes_addr(instr)) {
+ debug_assert(ctx->addr == NULL);
+ ctx->addr = instr;
+ }
+
+ if (writes_pred(instr)) {
+ debug_assert(ctx->pred == NULL);
+ ctx->pred = instr;
+ }
+
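+	/* the MARK flag is what is_scheduled() checks: */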
+ instr->flags |= IR3_INSTR_MARK;
+
+ list_addtail(&instr->node, &instr->block->instr_list);
+ ctx->scheduled = instr;
+
+ if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
+ clear_cache(ctx, NULL);
+ } else {
+ /* invalidate only the necessary entries.. */
+ clear_cache(ctx, instr);
+ }
+}
+
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned id = 0;

	/* find the deepest not-yet-scheduled src: */
	for (unsigned i = 0; i < nsrcs; i++)
		if (srcs[i] && (!d || (srcs[i]->depth > d->depth)))
			d = srcs[id = i];

	/* compact the returned entry out of the array, so that repeated
	 * calls eventually drain it and return NULL:
	 */
	if (d) {
		srcs[id] = srcs[--nsrcs];
		srcs[nsrcs] = NULL;
	}

	return d;
}
-static unsigned distance(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr, unsigned maxd)
+/**
+ * @block: the block to search in, starting from end; in first pass,
+ * this will be the block the instruction would be inserted into
+ * (but has not yet been, ie. it only contains already scheduled
+ * instructions). For intra-block scheduling (second pass), this
+ * would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd: max distance, bail after searching this # of instruction
+ * slots, since it means the instruction we are looking for is
+ * far enough away
+ * @pred: if true, recursively search into predecessor blocks to
+ * find the worst case (shortest) distance (only possible after
+ * individual blocks are all scheduled)
+ */
+static unsigned
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+ unsigned maxd, bool pred)
{
- struct ir3_instruction *n = ctx->scheduled;
unsigned d = 0;
- while (n && (n != instr) && (d < maxd)) {
- if (is_alu(n) || is_flow(n))
+
+ list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
+ if ((n == instr) || (d >= maxd))
+ return d;
+ /* NOTE: don't count branch/jump since we don't know yet if they will
+ * be eliminated later in resolve_jumps().. really should do that
+ * earlier so we don't have this constraint.
+ */
+ if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
d++;
- n = n->next;
}
- return d;
-}
-
-/* TODO maybe we want double linked list? */
-static struct ir3_instruction * prev(struct ir3_instruction *instr)
-{
- struct ir3_instruction *p = instr->block->head;
- while (p && (p->next != instr))
- p = p->next;
- return p;
-}
-
-static bool is_sfu_or_mem(struct ir3_instruction *instr)
-{
- return is_sfu(instr) || is_mem(instr);
-}
-
-static void schedule(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr, bool remove)
-{
- struct ir3_block *block = instr->block;
-
- /* maybe there is a better way to handle this than just stuffing
- * a nop.. ideally we'd know about this constraint in the
- * scheduling and depth calculation..
- */
- if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
- schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
- /* remove from depth list:
+ /* if coming from a predecessor block, assume it is assigned far
+ * enough away.. we'll fix up later.
*/
- if (remove) {
- struct ir3_instruction *p = prev(instr);
+ if (!pred)
+ return maxd;
- /* NOTE: this can happen for inputs which are not
- * read.. in that case there is no need to schedule
- * the input, so just bail:
+ if (pred && (block->data != block)) {
+ /* Search into predecessor blocks, finding the one with the
+ * shortest distance, since that will be the worst case
*/
- if (instr != (p ? p->next : block->head))
- return;
+ unsigned min = maxd - d;
- if (p)
- p->next = instr->next;
- else
- block->head = instr->next;
- }
+ /* (ab)use block->data to prevent recursion: */
+ block->data = block;
- if (writes_addr(instr)) {
- assert(ctx->addr == NULL);
- ctx->addr = instr;
- }
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ unsigned n;
- if (writes_pred(instr)) {
- assert(ctx->pred == NULL);
- ctx->pred = instr;
- }
+ n = distance(block->predecessors[i], instr, min, pred);
- instr->flags |= IR3_INSTR_MARK;
+ min = MIN2(min, n);
+ }
- instr->next = ctx->scheduled;
- ctx->scheduled = instr;
+ block->data = NULL;
+ d += min;
+ }
- ctx->cnt++;
+ return d;
}
-/*
- * Delay-slot calculation. Follows fanin/fanout.
- */
-
/* calculate delay for specified src: */
-static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
+static unsigned
+delay_calc_srcn(struct ir3_block *block,
struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned srcn)
+ struct ir3_instruction *consumer,
+ unsigned srcn, bool soft, bool pred)
{
unsigned delay = 0;
if (is_meta(assigner)) {
struct ir3_instruction *src;
foreach_ssa_src(src, assigner) {
- unsigned d = delay_calc_srcn(ctx, src, consumer, srcn);
+ unsigned d;
+ d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
delay = MAX2(delay, d);
}
} else {
- delay = ir3_delayslots(assigner, consumer, srcn);
- delay -= distance(ctx, assigner, delay);
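+		/* "soft" delays are a scheduling preference rather than a
+		 * hw requirement; assume a nominal latency for sfu results
+		 * so we try to keep their consumers further away:
+		 */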
+ if (soft) {
+ if (is_sfu(assigner)) {
+ delay = 4;
+ } else {
+ delay = ir3_delayslots(assigner, consumer, srcn);
+ }
+ } else {
+ delay = ir3_delayslots(assigner, consumer, srcn);
+ }
+ delay -= distance(block, assigner, delay, pred);
}
return delay;
}
/* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned delay_calc(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr)
+static unsigned
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+ bool soft, bool pred)
{
unsigned delay = 0;
struct ir3_instruction *src;
foreach_ssa_src_n(src, i, instr) {
- unsigned d = delay_calc_srcn(ctx, src, instr, i);
+ unsigned d;
+ d = delay_calc_srcn(block, src, instr, i, soft, pred);
delay = MAX2(delay, d);
}
return delay;
}
-/* A negative return value signals that an instruction has been newly
- * SCHEDULED (or DELAYED due to address or predicate register already
- * in use), return back up to the top of the stack (to block_sched())
+struct ir3_sched_notes {
+ /* there is at least one kill which could be scheduled, except
+ * for unscheduled bary.f's:
+ */
+ bool blocked_kill;
+ /* there is at least one instruction that could be scheduled,
+ * except for conflicting address/predicate register usage:
+ */
+ bool addr_conflict, pred_conflict;
+};
+
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+ return !!(instr->flags & IR3_INSTR_MARK);
+}
+
+/* could an instruction be scheduled if specified ssa src was scheduled? */
+static bool
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+ struct ir3_instruction *other_src;
+ foreach_ssa_src(other_src, instr) {
+ /* if dependency not scheduled, we aren't ready yet: */
+ if ((src != other_src) && !is_scheduled(other_src)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Check if instruction is ok to schedule. Make sure it is not blocked
+ * by use of addr/predicate register, etc.
*/
-static int trysched(struct ir3_sched_ctx *ctx,
+static bool
+check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
struct ir3_instruction *instr)
{
- struct ir3_instruction *srcs[64];
- struct ir3_instruction *src;
- unsigned delay, nsrcs = 0;
-
- /* if already scheduled: */
- if (instr->flags & IR3_INSTR_MARK)
- return 0;
+ /* For instructions that write address register we need to
+ * make sure there is at least one instruction that uses the
+ * addr value which is otherwise ready.
+ *
+ * TODO if any instructions use pred register and have other
+ * src args, we would need to do the same for writes_pred()..
+ */
+ if (writes_addr(instr)) {
+ struct ir3 *ir = instr->block->shader;
+ bool ready = false;
+ for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
+ struct ir3_instruction *indirect = ir->indirects[i];
+ if (!indirect)
+ continue;
+ if (indirect->address != instr)
+ continue;
+ ready = could_sched(indirect, instr);
+ }
- /* figure out our src's, copy 'em out into an array for sorting: */
- foreach_ssa_src(src, instr) {
- debug_assert(nsrcs < ARRAY_SIZE(srcs));
- srcs[nsrcs++] = src;
+ /* nothing could be scheduled, so keep looking: */
+ if (!ready)
+ return false;
}
- /* for each src register in sorted order:
+ /* if this is a write to address/predicate register, and that
+ * register is currently in use, we need to defer until it is
+ * free:
*/
- delay = 0;
- while ((src = deepest(srcs, nsrcs))) {
- delay = trysched(ctx, src);
- if (delay)
- return delay;
+ if (writes_addr(instr) && ctx->addr) {
+ debug_assert(ctx->addr != instr);
+ notes->addr_conflict = true;
+ return false;
}
- /* all our dependents are scheduled, figure out if
- * we have enough delay slots to schedule ourself:
- */
- delay = delay_calc(ctx, instr);
- if (delay)
- return delay;
+ if (writes_pred(instr) && ctx->pred) {
+ debug_assert(ctx->pred != instr);
+ notes->pred_conflict = true;
+ return false;
+ }
/* if the instruction is a kill, we need to ensure *every*
* bary.f is scheduled. The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is set.
	 */
if (is_kill(instr)) {
struct ir3 *ir = instr->block->shader;
- unsigned i;
- for (i = 0; i < ir->baryfs_count; i++) {
+ for (unsigned i = 0; i < ir->baryfs_count; i++) {
struct ir3_instruction *baryf = ir->baryfs[i];
- if (baryf->depth == DEPTH_UNUSED)
+ if (baryf->flags & IR3_INSTR_UNUSED)
continue;
- delay = trysched(ctx, baryf);
- if (delay)
- return delay;
+ if (!is_scheduled(baryf)) {
+ notes->blocked_kill = true;
+ return false;
+ }
}
}
- /* if instruction writes address register, we need to ensure
- * that the instructions which use the address register value
- * have all their other dependencies scheduled.
- * TODO we may possibly need to do the same thing with predicate
- * register usage, but for now we get by without since the
- * predicate usage patterns are more simple
+ return true;
+}
+
+/* Find the best instruction to schedule from specified instruction or
+ * recursively its ssa sources.
+ */
+static struct ir3_instruction *
+find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+ struct ir3_instruction *instr)
+{
+ struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
+ struct ir3_instruction *src;
+ unsigned nsrcs = 0;
+
+ if (is_scheduled(instr))
+ return NULL;
+
+ /* use instr->data to cache the results of recursing up the
+ * instr src's. Otherwise the recursive algo can scale quite
+ * badly w/ shader size. But this takes some care to clear
+ * the cache appropriately when instructions are scheduled.
*/
- if (writes_addr(instr)) {
- struct ir3 *ir = instr->block->shader;
- unsigned i;
+ if (instr->data) {
+ if (instr->data == NULL_INSTR)
+ return NULL;
+ return instr->data;
+ }
- for (i = 0; i < ir->indirects_count; i++) {
- struct ir3_instruction *indirect = ir->indirects[i];
- if (indirect->depth == DEPTH_UNUSED)
- continue;
- if (indirect->address != instr)
- continue;
- /* NOTE: avoid recursively scheduling the dependency
- * on ourself (ie. avoid infinite recursion):
- */
- foreach_ssa_src(src, indirect) {
- if ((src == instr) || (src->address == instr))
- continue;
- delay = trysched(ctx, src);
- if (delay)
- return delay;
- }
+ /* find unscheduled srcs: */
+ foreach_ssa_src(src, instr) {
+ if (!is_scheduled(src)) {
+ debug_assert(nsrcs < ARRAY_SIZE(srcs));
+ srcs[nsrcs++] = src;
}
}
- /* if this is a write to address/predicate register, and that
- * register is currently in use, we need to defer until it is
- * free:
- */
- if (writes_addr(instr) && ctx->addr) {
- assert(ctx->addr != instr);
- return DELAYED;
+ /* if all our src's are already scheduled: */
+ if (nsrcs == 0) {
+ if (check_instr(ctx, notes, instr)) {
+ instr->data = instr;
+ return instr;
+ }
+ return NULL;
}
- if (writes_pred(instr) && ctx->pred) {
- assert(ctx->pred != instr);
- return DELAYED;
+
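+	/* recurse into the deepest unscheduled src first, since
+	 * shortening the critical path is the priority:
+	 */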
+ while ((src = deepest(srcs, nsrcs))) {
+ struct ir3_instruction *candidate;
+
+ candidate = find_instr_recursive(ctx, notes, src);
+ if (!candidate)
+ continue;
+
+ if (check_instr(ctx, notes, candidate)) {
+ instr->data = candidate;
+ return candidate;
+ }
}
- schedule(ctx, instr, true);
- return SCHEDULED;
+ instr->data = NULL_INSTR;
+ return NULL;
}
-static struct ir3_instruction * reverse(struct ir3_instruction *instr)
+/* find instruction to schedule: */
+static struct ir3_instruction *
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+ bool soft)
{
- struct ir3_instruction *reversed = NULL;
- while (instr) {
- struct ir3_instruction *next = instr->next;
- instr->next = reversed;
- reversed = instr;
- instr = next;
+ struct ir3_instruction *best_instr = NULL;
+ unsigned min_delay = ~0;
+
+ /* TODO we'd really rather use the list/array of block outputs. But we
+ * don't have such a thing. Recursing *every* instruction in the list
+ * will result in a lot of repeated traversal, since instructions will
+ * get traversed both when they appear as ssa src to a later instruction
+ * as well as where they appear in the depth_list.
+ */
+ list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+ struct ir3_instruction *candidate;
+ unsigned delay;
+
+ candidate = find_instr_recursive(ctx, notes, instr);
+ if (!candidate)
+ continue;
+
+ delay = delay_calc(ctx->block, candidate, soft, false);
+ if (delay < min_delay) {
+ best_instr = candidate;
+ min_delay = delay;
+ }
+
+ if (min_delay == 0)
+ break;
}
- return reversed;
-}
-static bool uses_current_addr(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr)
-{
- return instr->address && (ctx->addr == instr->address);
+ return best_instr;
}
-static bool uses_current_pred(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr)
+/* "spill" the address register by remapping any unscheduled
+ * instructions which depend on the current address register
+ * to a clone of the instruction which wrote the address reg.
+ */
+static struct ir3_instruction *
+split_addr(struct ir3_sched_ctx *ctx)
{
- struct ir3_instruction *src;
- foreach_ssa_src(src, instr)
- if (ctx->pred == src)
- return true;
- return false;
+ struct ir3 *ir;
+ struct ir3_instruction *new_addr = NULL;
+ unsigned i;
+
+ debug_assert(ctx->addr);
+
+ ir = ctx->addr->block->shader;
+
+ for (i = 0; i < ir->indirects_count; i++) {
+ struct ir3_instruction *indirect = ir->indirects[i];
+
+ if (!indirect)
+ continue;
+
+ /* skip instructions already scheduled: */
+ if (is_scheduled(indirect))
+ continue;
+
+ /* remap remaining instructions using current addr
+ * to new addr:
+ */
+ if (indirect->address == ctx->addr) {
+ if (!new_addr) {
+ new_addr = ir3_instr_clone(ctx->addr);
+ /* original addr is scheduled, but new one isn't: */
+ new_addr->flags &= ~IR3_INSTR_MARK;
+ }
+ ir3_instr_set_address(indirect, new_addr);
+ }
+ }
+
+ /* all remaining indirects remapped to new addr: */
+ ctx->addr = NULL;
+
+ return new_addr;
}
-/* when we encounter an instruction that writes to the address register
- * when it is in use, we delay that instruction and try to schedule all
- * other instructions using the current address register:
+/* "spill" the predicate register by remapping any unscheduled
+ * instructions which depend on the current predicate register
+ * to a clone of the instruction which wrote the predicate reg.
*/
-static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
- struct ir3_block *block)
+static struct ir3_instruction *
+split_pred(struct ir3_sched_ctx *ctx)
{
- struct ir3_instruction *instr = block->head;
- bool addr_in_use = false;
- bool pred_in_use = false;
- bool all_delayed = true;
- unsigned cnt = ~0, attempted = 0;
-
- while (instr) {
- struct ir3_instruction *next = instr->next;
- bool addr = uses_current_addr(ctx, instr);
- bool pred = uses_current_pred(ctx, instr);
-
- if (addr || pred) {
- int ret = trysched(ctx, instr);
-
- if (ret != DELAYED)
- all_delayed = false;
-
- if (ret == SCHEDULED)
- cnt = 0;
- else if (ret > 0)
- cnt = MIN2(cnt, ret);
- if (addr)
- addr_in_use = true;
- if (pred)
- pred_in_use = true;
-
- attempted++;
- }
+ struct ir3 *ir;
+ struct ir3_instruction *new_pred = NULL;
+ unsigned i;
- instr = next;
- }
+ debug_assert(ctx->pred);
- if (!addr_in_use)
- ctx->addr = NULL;
+ ir = ctx->pred->block->shader;
- if (!pred_in_use)
- ctx->pred = NULL;
+ for (i = 0; i < ir->predicates_count; i++) {
+ struct ir3_instruction *predicated = ir->predicates[i];
- /* detect if we've gotten ourselves into an impossible situation
- * and bail if needed
- */
- if (all_delayed && (attempted > 0))
- ctx->error = true;
+ /* skip instructions already scheduled: */
+ if (is_scheduled(predicated))
+ continue;
+
+ /* remap remaining instructions using current pred
+ * to new pred:
+ *
+ * TODO is there ever a case when pred isn't first
+ * (and only) src?
+ */
+ if (ssa(predicated->regs[1]) == ctx->pred) {
+ if (!new_pred) {
+ new_pred = ir3_instr_clone(ctx->pred);
+ /* original pred is scheduled, but new one isn't: */
+ new_pred->flags &= ~IR3_INSTR_MARK;
+ }
+ predicated->regs[1]->instr = new_pred;
+ }
+ }
+
+ /* all remaining predicated remapped to new pred: */
+ ctx->pred = NULL;
- return cnt;
+ return new_pred;
}
-static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
- struct ir3_instruction *instr;
+ struct list_head unscheduled_list;
- /* schedule all the shader input's (meta-instr) first so that
- * the RA step sees that the input registers contain a value
- * from the start of the shader:
+ ctx->block = block;
+
+ /* addr/pred writes are per-block: */
+ ctx->addr = NULL;
+ ctx->pred = NULL;
+
+ /* move all instructions to the unscheduled list, and
+ * empty the block's instruction list (to which we will
+ * be inserting).
+ */
+ list_replace(&block->instr_list, &unscheduled_list);
+ list_inithead(&block->instr_list);
+ list_inithead(&ctx->depth_list);
+
+ /* first a pre-pass to schedule all meta:input instructions
+ * (which need to appear first so that RA knows the register is
+ * occupied), and move remaining to depth sorted list:
*/
- if (!block->parent) {
- unsigned i;
- for (i = 0; i < block->ninputs; i++) {
- struct ir3_instruction *in = block->inputs[i];
- if (in)
- schedule(ctx, in, true);
+ list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
+ if (instr->opc == OPC_META_INPUT) {
+ schedule(ctx, instr);
+ } else {
+ ir3_insert_by_depth(instr, &ctx->depth_list);
}
}
- while ((instr = block->head) && !ctx->error) {
- /* NOTE: always grab next *before* trysched(), in case the
- * instruction is actually scheduled (and therefore moved
- * from depth list into scheduled list)
- */
- struct ir3_instruction *next = instr->next;
- int cnt = trysched(ctx, instr);
+ while (!list_empty(&ctx->depth_list)) {
+ struct ir3_sched_notes notes = {0};
+ struct ir3_instruction *instr;
- if (cnt == DELAYED)
- cnt = block_sched_undelayed(ctx, block);
+		instr = find_eligible_instr(ctx, &notes, true);
+		if (!instr)
+			instr = find_eligible_instr(ctx, &notes, false);
- /* -1 is signal to return up stack, but to us means same as 0: */
- cnt = MAX2(0, cnt);
- cnt += ctx->cnt;
- instr = next;
+ if (instr) {
+ unsigned delay = delay_calc(ctx->block, instr, false, false);
- /* if deepest remaining instruction cannot be scheduled, try
- * the increasingly more shallow instructions until needed
- * number of delay slots is filled:
- */
- while (instr && (cnt > ctx->cnt)) {
- next = instr->next;
- trysched(ctx, instr);
- instr = next;
+ /* and if we run out of instructions that can be scheduled,
+ * then it is time for nop's:
+ */
+ debug_assert(delay <= 6);
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
+ }
+
+ schedule(ctx, instr);
+ } else {
+ struct ir3_instruction *new_instr = NULL;
+
+ /* nothing available to schedule.. if we are blocked on
+ * address/predicate register conflict, then break the
+ * deadlock by cloning the instruction that wrote that
+ * reg:
+ */
+ if (notes.addr_conflict) {
+ new_instr = split_addr(ctx);
+ } else if (notes.pred_conflict) {
+ new_instr = split_pred(ctx);
+ } else {
+ debug_assert(0);
+ ctx->error = true;
+ return;
+ }
+
+ if (new_instr) {
+ /* clearing current addr/pred can change what is
+ * available to schedule, so clear cache..
+ */
+ clear_cache(ctx, NULL);
+
+ ir3_insert_by_depth(new_instr, &ctx->depth_list);
+ /* the original instr that wrote addr/pred may have
+ * originated from a different block:
+ */
+ new_instr->block = block;
+ }
+ }
+ }
+
+ /* And lastly, insert branch/jump instructions to take us to
+ * the next block. Later we'll strip back out the branches
+ * that simply jump to next instruction.
+ */
+ if (block->successors[1]) {
+ /* if/else, conditional branches to "then" or "else": */
+ struct ir3_instruction *br;
+ unsigned delay = 6;
+
+ debug_assert(ctx->pred);
+ debug_assert(block->condition);
+
+ delay -= distance(ctx->block, ctx->pred, delay, false);
+
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
}
- /* and if we run out of instructions that can be scheduled,
- * then it is time for nop's:
+ /* create "else" branch first (since "then" block should
+ * frequently/always end up being a fall-thru):
*/
- while (cnt > ctx->cnt)
- schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+ br = ir3_BR(block);
+ br->cat0.inv = true;
+ br->cat0.target = block->successors[1];
+
+ /* NOTE: we have to hard code delay of 6 above, since
+ * we want to insert the nop's before constructing the
+ * branch. Throw in an assert so we notice if this
+ * ever breaks on future generation:
+ */
+ debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+ br = ir3_BR(block);
+ br->cat0.target = block->successors[0];
+
+ } else if (block->successors[0]) {
+ /* otherwise unconditional jump to next block: */
+ struct ir3_instruction *jmp;
+
+ jmp = ir3_JUMP(block);
+ jmp->cat0.target = block->successors[0];
}
- /* at this point, scheduled list is in reverse order, so fix that: */
- block->head = reverse(ctx->scheduled);
+ /* NOTE: if we kept track of the predecessors, we could do a better
+ * job w/ (jp) flags.. every node w/ >1 predecessor is a join point.
+ * Note that as we eliminate blocks which contain only an unconditional
+ * jump we probably need to propagate (jp) flag..
+ */
}
-int ir3_block_sched(struct ir3_block *block)
+/* After scheduling individual blocks, we still could have cases where
+ * on one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots to be legal. We can't deal with this
+ * in the first pass, because of loops (ie. we can't ensure all
+ * predecessor blocks are already scheduled). All we can really do at
+ * this point is stuff in extra nop's until things are legal.
+ */
+static void
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+ unsigned n = 0;
+
+ ctx->block = block;
+
+ list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+ unsigned delay = 0;
+
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+ delay = MAX2(d, delay);
+ }
+
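+		/* 'n' counts the slots already in front of this instr within
+		 * the block (including nops inserted below), so only the
+		 * remaining shortfall needs to be filled:
+		 */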
+ while (delay > n) {
+ struct ir3_instruction *nop = ir3_NOP(block);
+
+ /* move to before instr: */
+ list_delinit(&nop->node);
+ list_addtail(&nop->node, &instr->node);
+
+ n++;
+ }
+
+ /* we can bail once we hit worst case delay: */
+ if (++n > 6)
+ break;
+ }
+}
+
+int ir3_sched(struct ir3 *ir)
{
struct ir3_sched_ctx ctx = {0};
- ir3_clear_mark(block->shader);
- block_sched(&ctx, block);
+
+ ir3_clear_mark(ir);
+
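+	/* first pass: schedule each block independently: */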
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_block(&ctx, block);
+ }
+
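+	/* second pass: insert nops to satisfy delay slots across block
+	 * boundaries:
+	 */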
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_intra_block(&ctx, block);
+ }
+
if (ctx.error)
return -1;
return 0;
}
+
+/* does instruction 'prior' need to be scheduled before 'instr'? */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+ /* TODO for dependencies that are related to a specific object, ie
+ * a specific SSBO/image/array, we could relax this constraint to
+ * make accesses to unrelated objects not depend on each other (at
+ * least as long as not declared coherent)
+ */
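+	/* a barrier class of "everything" conflicts with every other
+	 * barrier class:
+	 */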
+ if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
+ ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
+ return true;
+ return !!(instr->barrier_class & prior->barrier_conflict);
+}
+
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+ struct list_head *prev = instr->node.prev;
+ struct list_head *next = instr->node.next;
+
+ /* add dependencies on previous instructions that must be scheduled
+ * prior to the current instruction
+ */
+ while (prev != &block->instr_list) {
+ struct ir3_instruction *pi =
+ LIST_ENTRY(struct ir3_instruction, prev, node);
+
+ prev = prev->prev;
+
+ if (is_meta(pi))
+ continue;
+
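+		/* instructions of the same barrier class are kept in order
+		 * relative to each other, so a single dep on the closest
+		 * prior one transitively orders us after all earlier ones:
+		 */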
+ if (instr->barrier_class == pi->barrier_class) {
+ ir3_instr_add_dep(instr, pi);
+ break;
+ }
+
+ if (depends_on(instr, pi))
+ ir3_instr_add_dep(instr, pi);
+ }
+
+ /* add dependencies on this instruction to following instructions
+ * that must be scheduled after the current instruction:
+ */
+ while (next != &block->instr_list) {
+ struct ir3_instruction *ni =
+ LIST_ENTRY(struct ir3_instruction, next, node);
+
+ next = next->next;
+
+ if (is_meta(ni))
+ continue;
+
+ if (instr->barrier_class == ni->barrier_class) {
+ ir3_instr_add_dep(ni, instr);
+ break;
+ }
+
+ if (depends_on(ni, instr))
+ ir3_instr_add_dep(ni, instr);
+ }
+}
+
+/* before scheduling a block, we need to add any necessary false-dependencies
+ * to ensure that:
+ *
+ * (1) barriers are scheduled in the right order wrt instructions related
+ * to the barrier
+ *
+ * (2) reads that come before a write actually get scheduled before the
+ * write
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (instr->barrier_class) {
+ add_barrier_deps(block, instr);
+ }
+ }
+}
+
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ calculate_deps(block);
+ }
+}