--- /dev/null
+/*
+ * Copyright (C) 2019 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "ir3.h"
+
+/*
+ * Helpers to figure out the necessary delay slots between instructions. Used
+ * both in scheduling pass(es) and the final pass to insert any required nop's
+ * so that the shader program is valid.
+ *
+ * Note that this needs to work both pre and post RA, so we can't assume ssa
+ * src iterators work.
+ */
+
+/* generally don't count false dependencies, since this can just be
+ * something like a barrier, or SSBO store. The exception is array
+ * dependencies if the assigner is an array write and the consumer
+ * reads the same array.
+ */
+static bool
+ignore_dep(struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer, unsigned n)
+{
+	/* Real (data) dependencies always count: */
+	if (!__is_false_dep(consumer, n))
+		return false;
+
+	/* A false dep normally just enforces ordering (barrier, SSBO store,
+	 * etc) and needs no delay slots.  But if the assigner writes an
+	 * array and the consumer reads the same array, the dep carries a
+	 * real value and must be honored:
+	 */
+	if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
+		struct ir3_register *dst = assigner->regs[0];
+		struct ir3_register *src;
+
+		debug_assert(dst->flags & IR3_REG_ARRAY);
+
+		foreach_src (src, consumer) {
+			if ((src->flags & IR3_REG_ARRAY) &&
+					(dst->array.id == src->array.id)) {
+				return false;
+			}
+		}
+	}
+
+	return true;
+}
+
+/* calculate required # of delay slots between the instruction that
+ * assigns a value and the one that consumes
+ */
+int
+ir3_delayslots(struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer, unsigned n)
+{
+	/* @n is the 1-based src position in @consumer (callers pass 0 for
+	 * the address-register src).  Returns the # of delay slots required
+	 * between @assigner and @consumer.
+	 */
+	if (ignore_dep(assigner, consumer, n))
+		return 0;
+
+	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+	 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
+	 * handled with sync bits
+	 */
+
+	/* meta instructions don't correspond to real hw instructions, so
+	 * they never need delay slots:
+	 */
+	if (is_meta(assigner) || is_meta(consumer))
+		return 0;
+
+	/* address register writes need the full worst-case latency before
+	 * any consumer (relative access depends on a0 being valid):
+	 */
+	if (writes_addr(assigner))
+		return 6;
+
+	/* handled via sync flags: */
+	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
+		return 0;
+
+	/* assigner must be alu: */
+	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
+			is_mem(consumer)) {
+		return 6;
+	} else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
+			(n == 3)) {
+		/* special case, 3rd src to cat3 not required on first cycle */
+		return 1;
+	} else {
+		return 3;
+	}
+}
+
+/* Does this instruction occupy a slot for delay-counting purposes?
+ * (ie. does it consume a cycle in the issue stream)
+ */
+static bool
+count_instruction(struct ir3_instruction *n)
+{
+	/* NOTE: don't count branch/jump since we don't know yet if they will
+	 * be eliminated later in resolve_jumps().. really should do that
+	 * earlier so we don't have this constraint.
+	 */
+	return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR));
+}
+
+/**
+ * @block: the block to search in, starting from end; in first pass,
+ * this will be the block the instruction would be inserted into
+ * (but has not yet, ie. it only contains already scheduled
+ * instructions). For intra-block scheduling (second pass), this
+ * would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd: max distance, bail after searching this # of instruction
+ * slots, since it means the instruction we are looking for is
+ * far enough away
+ * @pred: if true, recursively search into predecessor blocks to
+ * find the worst case (shortest) distance (only possible after
+ * individual blocks are all scheduled)
+ */
+unsigned
+ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
+		unsigned maxd, bool pred)
+{
+	unsigned d = 0;
+
+	/* Note that this relies on incrementally building up the block's
+	 * instruction list.. but this is how scheduling and nopsched
+	 * work.
+	 */
+	foreach_instr_rev (n, &block->instr_list) {
+		if ((n == instr) || (d >= maxd))
+			return MIN2(maxd, d + n->nop);
+		if (count_instruction(n))
+			/* repeat and nop both extend the effective distance;
+			 * result is clamped so it never exceeds maxd:
+			 */
+			d = MIN2(maxd, d + 1 + n->repeat + n->nop);
+	}
+
+	/* if coming from a predecessor block, assume it is assigned far
+	 * enough away.. we'll fix up later.
+	 */
+	if (!pred)
+		return maxd;
+
+	if (pred && (block->data != block)) {
+		/* Search into predecessor blocks, finding the one with the
+		 * shortest distance, since that will be the worst case
+		 */
+		unsigned min = maxd - d;
+
+		/* (ab)use block->data to prevent recursion: */
+		block->data = block;
+
+		set_foreach (block->predecessors, entry) {
+			/* NOTE(review): this local 'pred' shadows the bool
+			 * parameter; passing it below relies on a non-NULL
+			 * pointer converting to true.  Works, but a rename
+			 * would be clearer.
+			 */
+			struct ir3_block *pred = (struct ir3_block *)entry->key;
+			unsigned n;
+
+			n = ir3_distance(pred, instr, min, pred);
+
+			min = MIN2(min, n);
+		}
+
+		block->data = NULL;
+		d += min;
+	}
+
+	return d;
+}
+
+/* calculate delay for specified src: */
+/* calculate delay for specified src: */
+static unsigned
+delay_calc_srcn(struct ir3_block *block,
+		struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer,
+		unsigned srcn, bool soft, bool pred)
+{
+	unsigned delay = 0;
+
+	if (is_meta(assigner)) {
+		/* meta instructions are not real, so recurse into their srcs
+		 * and take the worst-case delay of the actual assigners:
+		 */
+		struct ir3_register *src;
+		foreach_src (src, assigner) {
+			unsigned d;
+
+			/* post-RA srcs may have no instr pointer: */
+			if (!src->instr)
+				continue;
+
+			d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
+			delay = MAX2(delay, d);
+		}
+	} else {
+		if (soft) {
+			/* "soft" mode: pretend sfu results take 4 cycles, so the
+			 * scheduler prefers to fill the gap rather than stall on
+			 * the sync flag:
+			 */
+			if (is_sfu(assigner)) {
+				delay = 4;
+			} else {
+				delay = ir3_delayslots(assigner, consumer, srcn);
+			}
+		} else {
+			delay = ir3_delayslots(assigner, consumer, srcn);
+		}
+		/* subtract already-covered distance; ir3_distance() is clamped
+		 * to at most 'delay' (maxd) on every path, so no underflow:
+		 */
+		delay -= ir3_distance(block, assigner, delay, pred);
+	}
+
+	return delay;
+}
+
+/* Find the most recent write to the given array within the last @maxd
+ * counted instruction slots of @block, or NULL if none (or too far away
+ * to matter).
+ */
+static struct ir3_instruction *
+find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
+{
+	unsigned d = 0;
+
+	/* Note that this relies on incrementally building up the block's
+	 * instruction list.. but this is how scheduling and nopsched
+	 * work.
+	 */
+	foreach_instr_rev (n, &block->instr_list) {
+		if (d >= maxd)
+			return NULL;
+		if (count_instruction(n))
+			d++;
+		if (dest_regs(n) == 0)
+			continue;
+
+		/* note that a dest reg will never be an immediate */
+		/* NOTE(review): no IR3_REG_ARRAY flag check here — presumably
+		 * array.id is never a valid id for a non-array dst, so a false
+		 * match cannot happen.  TODO confirm against ir3_register.
+		 */
+		if (n->regs[0]->array.id == array_id)
+			return n;
+	}
+
+	return NULL;
+}
+
+/* like list_length() but only counts instructions which count in the
+ * delay determination:
+ */
+/* like list_length() but only counts instructions which count in the
+ * delay determination:
+ */
+static unsigned
+count_block_delay(struct ir3_block *block)
+{
+	unsigned delay = 0;
+	foreach_instr (n, &block->instr_list) {
+		if (!count_instruction(n))
+			continue;
+		delay++;
+	}
+	return delay;
+}
+
+/* Calculate the delay needed before @consumer for a relative (array)
+ * src, by locating the nearest prior write to @array_id.  @maxd bounds
+ * the search; if no write is found within that distance no delay is
+ * needed.
+ */
+static unsigned
+delay_calc_array(struct ir3_block *block, unsigned array_id,
+		struct ir3_instruction *consumer, unsigned srcn,
+		bool soft, bool pred, unsigned maxd)
+{
+	struct ir3_instruction *assigner;
+
+	assigner = find_array_write(block, array_id, maxd);
+	if (assigner)
+		return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);
+
+	/* no write in this block; only continue into predecessors when
+	 * requested (post-scheduling):
+	 */
+	if (!pred)
+		return 0;
+
+	unsigned len = count_block_delay(block);
+	if (maxd <= len)
+		return 0;
+
+	maxd -= len;
+
+	if (block->data == block) {
+		/* we have a loop, return worst case: */
+		return maxd;
+	}
+
+	/* If we need to search into predecessors, find the one with the
+	 * max delay.. the resulting delay is that minus the number of
+	 * counted instructions in this block:
+	 */
+	unsigned max = 0;
+
+	/* (ab)use block->data to prevent recursion: */
+	block->data = block;
+
+	set_foreach (block->predecessors, entry) {
+		/* NOTE(review): 'pred' here shadows the bool parameter of the
+		 * same name; the recursive call's bool arg is this non-NULL
+		 * pointer (ie. always true), matching the intent but worth a
+		 * rename for clarity.
+		 */
+		struct ir3_block *pred = (struct ir3_block *)entry->key;
+		unsigned delay =
+			delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd);
+
+		max = MAX2(max, delay);
+	}
+
+	block->data = NULL;
+
+	if (max < len)
+		return 0;
+
+	return max - len;
+}
+
+/**
+ * Calculate delay for instruction (maximum of delay for all srcs):
+ *
+ * @soft: If true, add additional delay for situations where they
+ * would not be strictly required because a sync flag would be
+ * used (but scheduler would prefer to schedule some other
+ * instructions first to avoid stalling on sync flag)
+ * @pred: If true, recurse into predecessor blocks
+ */
+unsigned
+ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+		bool soft, bool pred)
+{
+	unsigned delay = 0;
+	struct ir3_register *src;
+
+	/* take the max delay over all srcs; src numbers passed to the
+	 * helpers are 1-based (0 is reserved for the address src below):
+	 */
+	foreach_src_n (src, i, instr) {
+		unsigned d = 0;
+
+		/* relative non-const srcs read an array, whose last write may
+		 * not be reachable via src->instr, so handle them specially
+		 * (6 == worst-case delay):
+		 */
+		if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
+			d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
+		} else if (src->instr) {
+			d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
+		}
+
+		delay = MAX2(delay, d);
+	}
+
+	/* also account for the implicit address-register dependency: */
+	if (instr->address) {
+		unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
+		delay = MAX2(delay, d);
+	}
+
+	return delay;
+}
return d;
}
-/**
- * @block: the block to search in, starting from end; in first pass,
- * this will be the block the instruction would be inserted into
- * (but has not yet, ie. it only contains already scheduled
- * instructions). For intra-block scheduling (second pass), this
- * would be one of the predecessor blocks.
- * @instr: the instruction to search for
- * @maxd: max distance, bail after searching this # of instruction
- * slots, since it means the instruction we are looking for is
- * far enough away
- * @pred: if true, recursively search into predecessor blocks to
- * find the worst case (shortest) distance (only possible after
- * individual blocks are all scheduled
- */
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr,
- unsigned maxd, bool pred)
-{
- unsigned d = 0;
-
- foreach_instr_rev (n, &block->instr_list) {
- if ((n == instr) || (d >= maxd))
- return d;
- /* NOTE: don't count branch/jump since we don't know yet if they will
- * be eliminated later in resolve_jumps().. really should do that
- * earlier so we don't have this constraint.
- */
- if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
- d++;
- }
-
- /* if coming from a predecessor block, assume it is assigned far
- * enough away.. we'll fix up later.
- */
- if (!pred)
- return maxd;
-
- if (pred && (block->data != block)) {
- /* Search into predecessor blocks, finding the one with the
- * shortest distance, since that will be the worst case
- */
- unsigned min = maxd - d;
-
- /* (ab)use block->data to prevent recursion: */
- block->data = block;
-
- set_foreach(block->predecessors, entry) {
- struct ir3_block *pred = (struct ir3_block *)entry->key;
- unsigned n;
-
- n = distance(pred, instr, min, pred);
-
- min = MIN2(min, n);
- }
-
- block->data = NULL;
- d += min;
- }
-
- return d;
-}
-
-/* calculate delay for specified src: */
-static unsigned
-delay_calc_srcn(struct ir3_block *block,
- struct ir3_instruction *assigner,
- struct ir3_instruction *consumer,
- unsigned srcn, bool soft, bool pred)
-{
- unsigned delay = 0;
-
- if (is_meta(assigner)) {
- struct ir3_instruction *src;
- foreach_ssa_src(src, assigner) {
- unsigned d;
- d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
- delay = MAX2(delay, d);
- }
- } else {
- if (soft) {
- if (is_sfu(assigner)) {
- delay = 4;
- } else {
- delay = ir3_delayslots(assigner, consumer, srcn);
- }
- } else {
- delay = ir3_delayslots(assigner, consumer, srcn);
- }
- delay -= distance(block, assigner, delay, pred);
- }
-
- return delay;
-}
-
-/* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned
-delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
- bool soft, bool pred)
-{
- unsigned delay = 0;
- struct ir3_instruction *src;
-
- foreach_ssa_src_n(src, i, instr) {
- unsigned d;
- d = delay_calc_srcn(block, src, instr, i, soft, pred);
- delay = MAX2(delay, d);
- }
-
- return delay;
-}
-
struct ir3_sched_notes {
/* there is at least one kill which could be scheduled, except
* for unscheduled bary.f's:
continue;
}
- int rank = delay_calc(ctx->block, candidate, soft, false);
+ int rank = ir3_delay_calc(ctx->block, candidate, soft, false);
/* if too many live values, prioritize instructions that reduce the
* number of live values:
instr = find_eligible_instr(ctx, ¬es, false);
if (instr) {
- unsigned delay = delay_calc(ctx->block, instr, false, false);
+ unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
d("delay=%u", delay);
debug_assert(ctx->pred);
debug_assert(block->condition);
- delay -= distance(ctx->block, ctx->pred, delay, false);
+ delay -= ir3_distance(ctx->block, ctx->pred, delay, false);
while (delay > 0) {
ir3_NOP(block);
set_foreach(block->predecessors, entry) {
struct ir3_block *pred = (struct ir3_block *)entry->key;
- unsigned d = delay_calc(pred, instr, false, true);
+ unsigned d = ir3_delay_calc(pred, instr, false, true);
delay = MAX2(d, delay);
}