From: Rob Clark Date: Wed, 18 Dec 2019 19:10:12 +0000 (-0800) Subject: freedreno/ir3: split out delay helpers X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=c803c662f990621acefd2f002d9df0d42ad8a3a0;p=mesa.git freedreno/ir3: split out delay helpers We're going to want these also for a post-RA sched pass. And also to split nop stuffing out into its own pass. Signed-off-by: Rob Clark Part-of: --- diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index a8ce9ea12a6..03abaafa393 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1113,10 +1113,16 @@ static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n) void ir3_print(struct ir3 *ir); void ir3_print_instr(struct ir3_instruction *instr); -/* depth calculation: */ -struct ir3_shader_variant; +/* delay calculation: */ int ir3_delayslots(struct ir3_instruction *assigner, struct ir3_instruction *consumer, unsigned n); +unsigned ir3_distance(struct ir3_block *block, struct ir3_instruction *instr, + unsigned maxd, bool pred); +unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr, + bool soft, bool pred); + +/* depth calculation: */ +struct ir3_shader_variant; void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list); void ir3_depth(struct ir3 *ir, struct ir3_shader_variant *so); diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c new file mode 100644 index 00000000000..506e2969326 --- /dev/null +++ b/src/freedreno/ir3/ir3_delay.c @@ -0,0 +1,337 @@ +/* + * Copyright (C) 2019 Google, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include "ir3.h" + +/* + * Helpers to figure out the necessary delay slots between instructions. Used + * both in scheduling pass(es) and the final pass to insert any required nop's + * so that the shader program is valid. + * + * Note that this needs to work both pre and post RA, so we can't assume ssa + * src iterators work. + */ + +/* generally don't count false dependencies, since this can just be + * something like a barrier, or SSBO store. The exception is array + * dependencies if the assigner is an array write and the consumer + * reads the same array. 
+ */ +static bool +ignore_dep(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n) +{ + if (!__is_false_dep(consumer, n)) + return false; + + if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) { + struct ir3_register *dst = assigner->regs[0]; + struct ir3_register *src; + + debug_assert(dst->flags & IR3_REG_ARRAY); + + foreach_src (src, consumer) { + if ((src->flags & IR3_REG_ARRAY) && + (dst->array.id == src->array.id)) { + return false; + } + } + } + + return true; +} + +/* calculate required # of delay slots between the instruction that + * assigns a value and the one that consumes + */ +int +ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n) +{ + if (ignore_dep(assigner, consumer, n)) + return 0; + + /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal + * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch + * handled with sync bits + */ + + if (is_meta(assigner) || is_meta(consumer)) + return 0; + + if (writes_addr(assigner)) + return 6; + + /* handled via sync flags: */ + if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner)) + return 0; + + /* assigner must be alu: */ + if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) || + is_mem(consumer)) { + return 6; + } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && + (n == 3)) { + /* special case, 3rd src to cat3 not required on first cycle */ + return 1; + } else { + return 3; + } +} + +static bool +count_instruction(struct ir3_instruction *n) +{ + /* NOTE: don't count branch/jump since we don't know yet if they will + * be eliminated later in resolve_jumps().. really should do that + * earlier so we don't have this constraint. + */ + return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)); +} + +/** + * @block: the block to search in, starting from end; in first pass, + * this will be the block the instruction would be inserted into + * (but has not yet, ie. 
it only contains already scheduled + * instructions). For intra-block scheduling (second pass), this + * would be one of the predecessor blocks. + * @instr: the instruction to search for + * @maxd: max distance, bail after searching this # of instruction + * slots, since it means the instruction we are looking for is + * far enough away + * @pred: if true, recursively search into predecessor blocks to + * find the worst case (shortest) distance (only possible after + * individual blocks are all scheduled) + */ +unsigned +ir3_distance(struct ir3_block *block, struct ir3_instruction *instr, + unsigned maxd, bool pred) +{ + unsigned d = 0; + + /* Note that this relies on incrementally building up the block's + * instruction list.. but this is how scheduling and nopsched + * work. + */ + foreach_instr_rev (n, &block->instr_list) { + if ((n == instr) || (d >= maxd)) + return MIN2(maxd, d + n->nop); + if (count_instruction(n)) + d = MIN2(maxd, d + 1 + n->repeat + n->nop); + } + + /* if coming from a predecessor block, assume it is assigned far + * enough away.. we'll fix up later. 
+ */ + if (!pred) + return maxd; + + if (pred && (block->data != block)) { + /* Search into predecessor blocks, finding the one with the + * shortest distance, since that will be the worst case + */ + unsigned min = maxd - d; + + /* (ab)use block->data to prevent recursion: */ + block->data = block; + + set_foreach (block->predecessors, entry) { + struct ir3_block *pred = (struct ir3_block *)entry->key; + unsigned n; + + n = ir3_distance(pred, instr, min, pred); + + min = MIN2(min, n); + } + + block->data = NULL; + d += min; + } + + return d; +} + +/* calculate delay for specified src: */ +static unsigned +delay_calc_srcn(struct ir3_block *block, + struct ir3_instruction *assigner, + struct ir3_instruction *consumer, + unsigned srcn, bool soft, bool pred) +{ + unsigned delay = 0; + + if (is_meta(assigner)) { + struct ir3_register *src; + foreach_src (src, assigner) { + unsigned d; + + if (!src->instr) + continue; + + d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred); + delay = MAX2(delay, d); + } + } else { + if (soft) { + if (is_sfu(assigner)) { + delay = 4; + } else { + delay = ir3_delayslots(assigner, consumer, srcn); + } + } else { + delay = ir3_delayslots(assigner, consumer, srcn); + } + delay -= ir3_distance(block, assigner, delay, pred); + } + + return delay; +} + +static struct ir3_instruction * +find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd) +{ + unsigned d = 0; + + /* Note that this relies on incrementally building up the block's + * instruction list.. but this is how scheduling and nopsched + * work. 
+ */ + foreach_instr_rev (n, &block->instr_list) { + if (d >= maxd) + return NULL; + if (count_instruction(n)) + d++; + if (dest_regs(n) == 0) + continue; + + /* note that a dest reg will never be an immediate */ + if (n->regs[0]->array.id == array_id) + return n; + } + + return NULL; +} + +/* like list_length() but only counts instructions which count in the + * delay determination: + */ +static unsigned +count_block_delay(struct ir3_block *block) +{ + unsigned delay = 0; + foreach_instr (n, &block->instr_list) { + if (!count_instruction(n)) + continue; + delay++; + } + return delay; +} + +static unsigned +delay_calc_array(struct ir3_block *block, unsigned array_id, + struct ir3_instruction *consumer, unsigned srcn, + bool soft, bool pred, unsigned maxd) +{ + struct ir3_instruction *assigner; + + assigner = find_array_write(block, array_id, maxd); + if (assigner) + return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred); + + if (!pred) + return 0; + + unsigned len = count_block_delay(block); + if (maxd <= len) + return 0; + + maxd -= len; + + if (block->data == block) { + /* we have a loop, return worst case: */ + return maxd; + } + + /* If we need to search into predecessors, find the one with the + * max delay.. 
the resulting delay is that minus the number of + * counted instructions in this block: + */ + unsigned max = 0; + + /* (ab)use block->data to prevent recursion: */ + block->data = block; + + set_foreach (block->predecessors, entry) { + struct ir3_block *pred = (struct ir3_block *)entry->key; + unsigned delay = + delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd); + + max = MAX2(max, delay); + } + + block->data = NULL; + + if (max < len) + return 0; + + return max - len; +} + +/** + * Calculate delay for instruction (maximum of delay for all srcs): + * + * @soft: If true, add additional delay for situations where they + * would not be strictly required because a sync flag would be + * used (but scheduler would prefer to schedule some other + * instructions first to avoid stalling on sync flag) + * @pred: If true, recurse into predecessor blocks + */ +unsigned +ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr, + bool soft, bool pred) +{ + unsigned delay = 0; + struct ir3_register *src; + + foreach_src_n (src, i, instr) { + unsigned d = 0; + + if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) { + d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6); + } else if (src->instr) { + d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred); + } + + delay = MAX2(delay, d); + } + + if (instr->address) { + unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred); + delay = MAX2(delay, d); + } + + return delay; +} diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c index f1f7b94b2fc..59ea3af407b 100644 --- a/src/freedreno/ir3/ir3_depth.c +++ b/src/freedreno/ir3/ir3_depth.c @@ -48,72 +48,6 @@ * blocks depth sorted list, which is used by the scheduling pass. */ -/* generally don't count false dependencies, since this can just be - * something like a barrier, or SSBO store. 
The exception is array - * dependencies if the assigner is an array write and the consumer - * reads the same array. - */ -static bool -ignore_dep(struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned n) -{ - if (!__is_false_dep(consumer, n)) - return false; - - if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) { - struct ir3_register *dst = assigner->regs[0]; - struct ir3_register *src; - - debug_assert(dst->flags & IR3_REG_ARRAY); - - foreach_src(src, consumer) { - if ((src->flags & IR3_REG_ARRAY) && - (dst->array.id == src->array.id)) { - return false; - } - } - } - - return true; -} - -/* calculate required # of delay slots between the instruction that - * assigns a value and the one that consumes - */ -int ir3_delayslots(struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned n) -{ - if (ignore_dep(assigner, consumer, n)) - return 0; - - /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal - * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch - * handled with sync bits - */ - - if (is_meta(assigner) || is_meta(consumer)) - return 0; - - if (writes_addr(assigner)) - return 6; - - /* handled via sync flags: */ - if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner)) - return 0; - - /* assigner must be alu: */ - if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) || - is_mem(consumer)) { - return 6; - } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && - (n == 3)) { - /* special case, 3rd src to cat3 not required on first cycle */ - return 1; - } else { - return 3; - } -} - void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list) { diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c index c2f6b3e020d..ec5ad6e872e 100644 --- a/src/freedreno/ir3/ir3_sched.c +++ b/src/freedreno/ir3/ir3_sched.c @@ -265,117 +265,6 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs) return d; } -/** - * @block: the block to search in, 
starting from end; in first pass, - * this will be the block the instruction would be inserted into - * (but has not yet, ie. it only contains already scheduled - * instructions). For intra-block scheduling (second pass), this - * would be one of the predecessor blocks. - * @instr: the instruction to search for - * @maxd: max distance, bail after searching this # of instruction - * slots, since it means the instruction we are looking for is - * far enough away - * @pred: if true, recursively search into predecessor blocks to - * find the worst case (shortest) distance (only possible after - * individual blocks are all scheduled - */ -static unsigned -distance(struct ir3_block *block, struct ir3_instruction *instr, - unsigned maxd, bool pred) -{ - unsigned d = 0; - - foreach_instr_rev (n, &block->instr_list) { - if ((n == instr) || (d >= maxd)) - return d; - /* NOTE: don't count branch/jump since we don't know yet if they will - * be eliminated later in resolve_jumps().. really should do that - * earlier so we don't have this constraint. - */ - if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR))) - d++; - } - - /* if coming from a predecessor block, assume it is assigned far - * enough away.. we'll fix up later. 
- */ - if (!pred) - return maxd; - - if (pred && (block->data != block)) { - /* Search into predecessor blocks, finding the one with the - * shortest distance, since that will be the worst case - */ - unsigned min = maxd - d; - - /* (ab)use block->data to prevent recursion: */ - block->data = block; - - set_foreach(block->predecessors, entry) { - struct ir3_block *pred = (struct ir3_block *)entry->key; - unsigned n; - - n = distance(pred, instr, min, pred); - - min = MIN2(min, n); - } - - block->data = NULL; - d += min; - } - - return d; -} - -/* calculate delay for specified src: */ -static unsigned -delay_calc_srcn(struct ir3_block *block, - struct ir3_instruction *assigner, - struct ir3_instruction *consumer, - unsigned srcn, bool soft, bool pred) -{ - unsigned delay = 0; - - if (is_meta(assigner)) { - struct ir3_instruction *src; - foreach_ssa_src(src, assigner) { - unsigned d; - d = delay_calc_srcn(block, src, consumer, srcn, soft, pred); - delay = MAX2(delay, d); - } - } else { - if (soft) { - if (is_sfu(assigner)) { - delay = 4; - } else { - delay = ir3_delayslots(assigner, consumer, srcn); - } - } else { - delay = ir3_delayslots(assigner, consumer, srcn); - } - delay -= distance(block, assigner, delay, pred); - } - - return delay; -} - -/* calculate delay for instruction (maximum of delay for all srcs): */ -static unsigned -delay_calc(struct ir3_block *block, struct ir3_instruction *instr, - bool soft, bool pred) -{ - unsigned delay = 0; - struct ir3_instruction *src; - - foreach_ssa_src_n(src, i, instr) { - unsigned d; - d = delay_calc_srcn(block, src, instr, i, soft, pred); - delay = MAX2(delay, d); - } - - return delay; -} - struct ir3_sched_notes { /* there is at least one kill which could be scheduled, except * for unscheduled bary.f's: @@ -658,7 +547,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, continue; } - int rank = delay_calc(ctx->block, candidate, soft, false); + int rank = ir3_delay_calc(ctx->block, 
candidate, soft, false); /* if too many live values, prioritize instructions that reduce the * number of live values: @@ -827,7 +716,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) instr = find_eligible_instr(ctx, &notes, false); if (instr) { - unsigned delay = delay_calc(ctx->block, instr, false, false); + unsigned delay = ir3_delay_calc(ctx->block, instr, false, false); d("delay=%u", delay); @@ -886,7 +775,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) debug_assert(ctx->pred); debug_assert(block->condition); - delay -= distance(ctx->block, ctx->pred, delay, false); + delay -= ir3_distance(ctx->block, ctx->pred, delay, false); while (delay > 0) { ir3_NOP(block); @@ -944,7 +833,7 @@ sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) set_foreach(block->predecessors, entry) { struct ir3_block *pred = (struct ir3_block *)entry->key; - unsigned d = delay_calc(pred, instr, false, true); + unsigned d = ir3_delay_calc(pred, instr, false, true); delay = MAX2(d, delay); } diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index c377767084c..0c2cb21930a 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -54,6 +54,7 @@ libfreedreno_ir3_files = files( 'ir3_context.c', 'ir3_context.h', 'ir3_cp.c', + 'ir3_delay.c', 'ir3_depth.c', 'ir3_group.c', 'ir3_image.c',