2 * Copyright (C) 2019 Google, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * Rob Clark <robclark@freedesktop.org>
/*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre and post RA, so we can't assume ssa
 * src iterators work.
 */
38 /* generally don't count false dependencies, since this can just be
39 * something like a barrier, or SSBO store. The exception is array
40 * dependencies if the assigner is an array write and the consumer
41 * reads the same array.
44 ignore_dep(struct ir3_instruction
*assigner
,
45 struct ir3_instruction
*consumer
, unsigned n
)
47 if (!__is_false_dep(consumer
, n
))
50 if (assigner
->barrier_class
& IR3_BARRIER_ARRAY_W
) {
51 struct ir3_register
*dst
= assigner
->regs
[0];
53 debug_assert(dst
->flags
& IR3_REG_ARRAY
);
55 foreach_src (src
, consumer
) {
56 if ((src
->flags
& IR3_REG_ARRAY
) &&
57 (dst
->array
.id
== src
->array
.id
)) {
66 /* calculate required # of delay slots between the instruction that
67 * assigns a value and the one that consumes
70 ir3_delayslots(struct ir3_instruction
*assigner
,
71 struct ir3_instruction
*consumer
, unsigned n
, bool soft
)
73 if (ignore_dep(assigner
, consumer
, n
))
76 /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
77 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
78 * handled with sync bits
81 if (is_meta(assigner
) || is_meta(consumer
))
84 if (writes_addr0(assigner
) || writes_addr1(assigner
))
87 /* On a6xx, it takes the number of delay slots to get a SFU result
88 * back (ie. using nop's instead of (ss) is:
94 * and so on. Not quite sure where it tapers out (ie. how many
95 * warps share an SFU unit). But 10 seems like a reasonable #
98 if (soft
&& is_sfu(assigner
))
101 /* handled via sync flags: */
102 if (is_sfu(assigner
) || is_tex(assigner
) || is_mem(assigner
))
105 /* assigner must be alu: */
106 if (is_flow(consumer
) || is_sfu(consumer
) || is_tex(consumer
) ||
109 } else if ((is_mad(consumer
->opc
) || is_madsh(consumer
->opc
)) &&
111 /* special case, 3rd src to cat3 not required on first cycle */
119 count_instruction(struct ir3_instruction
*n
)
121 /* NOTE: don't count branch/jump since we don't know yet if they will
122 * be eliminated later in resolve_jumps().. really should do that
123 * earlier so we don't have this constraint.
125 return is_alu(n
) || (is_flow(n
) && (n
->opc
!= OPC_JUMP
) && (n
->opc
!= OPC_B
));
129 * @block: the block to search in, starting from end; in first pass,
130 * this will be the block the instruction would be inserted into
131 * (but has not yet, ie. it only contains already scheduled
132 * instructions). For intra-block scheduling (second pass), this
133 * would be one of the predecessor blocks.
134 * @instr: the instruction to search for
135 * @maxd: max distance, bail after searching this # of instruction
136 * slots, since it means the instruction we are looking for is
138 * @pred: if true, recursively search into predecessor blocks to
139 * find the worst case (shortest) distance (only possible after
140 * individual blocks are all scheduled)
143 distance(struct ir3_block
*block
, struct ir3_instruction
*instr
,
144 unsigned maxd
, bool pred
)
148 /* Note that this relies on incrementally building up the block's
149 * instruction list.. but this is how scheduling and nopsched
152 foreach_instr_rev (n
, &block
->instr_list
) {
153 if ((n
== instr
) || (d
>= maxd
))
154 return MIN2(maxd
, d
+ n
->nop
);
155 if (count_instruction(n
))
156 d
= MIN2(maxd
, d
+ 1 + n
->repeat
+ n
->nop
);
159 /* if coming from a predecessor block, assume it is assigned far
160 * enough away.. we'll fix up later.
165 if (pred
&& (block
->data
!= block
)) {
166 /* Search into predecessor blocks, finding the one with the
167 * shortest distance, since that will be the worst case
169 unsigned min
= maxd
- d
;
171 /* (ab)use block->data to prevent recursion: */
174 set_foreach (block
->predecessors
, entry
) {
175 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
178 n
= distance(pred
, instr
, min
, pred
);
190 /* calculate delay for specified src: */
192 delay_calc_srcn(struct ir3_block
*block
,
193 struct ir3_instruction
*assigner
,
194 struct ir3_instruction
*consumer
,
195 unsigned srcn
, bool soft
, bool pred
)
199 if (is_meta(assigner
)) {
200 foreach_src_n (src
, n
, assigner
) {
206 d
= delay_calc_srcn(block
, src
->instr
, consumer
, srcn
, soft
, pred
);
208 /* A (rptN) instruction executes in consecutive cycles so
209 * it's outputs are written in successive cycles. And
210 * likewise for it's (r)'d (incremented) inputs, they are
211 * read on successive cycles.
213 * So we need to adjust the delay for (rptN)'s assigners
214 * and consumers accordingly.
216 * Note that the dst of a (rptN) instruction is implicitly
217 * (r) (the assigner case), although that is not the case
218 * for src registers. There is exactly one case, bary.f,
219 * which has a vecN (collect) src that is not (r)'d.
221 if ((assigner
->opc
== OPC_META_SPLIT
) && src
->instr
->repeat
) {
222 /* (rptN) assigner case: */
223 d
-= MIN2(d
, src
->instr
->repeat
- assigner
->split
.off
);
224 } else if ((assigner
->opc
== OPC_META_COLLECT
) && consumer
->repeat
&&
225 (consumer
->regs
[srcn
]->flags
& IR3_REG_R
)) {
229 delay
= MAX2(delay
, d
);
232 delay
= ir3_delayslots(assigner
, consumer
, srcn
, soft
);
233 delay
-= distance(block
, assigner
, delay
, pred
);
239 static struct ir3_instruction
*
240 find_array_write(struct ir3_block
*block
, unsigned array_id
, unsigned maxd
)
244 /* Note that this relies on incrementally building up the block's
245 * instruction list.. but this is how scheduling and nopsched
248 foreach_instr_rev (n
, &block
->instr_list
) {
251 if (count_instruction(n
))
253 if (dest_regs(n
) == 0)
256 /* note that a dest reg will never be an immediate */
257 if (n
->regs
[0]->array
.id
== array_id
)
264 /* like list_length() but only counts instructions which count in the
265 * delay determination:
268 count_block_delay(struct ir3_block
*block
)
271 foreach_instr (n
, &block
->instr_list
) {
272 if (!count_instruction(n
))
280 delay_calc_array(struct ir3_block
*block
, unsigned array_id
,
281 struct ir3_instruction
*consumer
, unsigned srcn
,
282 bool soft
, bool pred
, unsigned maxd
)
284 struct ir3_instruction
*assigner
;
286 assigner
= find_array_write(block
, array_id
, maxd
);
288 return delay_calc_srcn(block
, assigner
, consumer
, srcn
, soft
, pred
);
293 unsigned len
= count_block_delay(block
);
299 if (block
->data
== block
) {
300 /* we have a loop, return worst case: */
304 /* If we need to search into predecessors, find the one with the
305 * max delay.. the resulting delay is that minus the number of
306 * counted instructions in this block:
310 /* (ab)use block->data to prevent recursion: */
313 set_foreach (block
->predecessors
, entry
) {
314 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
316 delay_calc_array(pred
, array_id
, consumer
, srcn
, soft
, pred
, maxd
);
318 max
= MAX2(max
, delay
);
330 * Calculate delay for instruction (maximum of delay for all srcs):
332 * @soft: If true, add additional delay for situations where they
333 * would not be strictly required because a sync flag would be
334 * used (but scheduler would prefer to schedule some other
335 * instructions first to avoid stalling on sync flag)
336 * @pred: If true, recurse into predecessor blocks
339 ir3_delay_calc(struct ir3_block
*block
, struct ir3_instruction
*instr
,
340 bool soft
, bool pred
)
344 foreach_src_n (src
, i
, instr
) {
347 if ((src
->flags
& IR3_REG_RELATIV
) && !(src
->flags
& IR3_REG_CONST
)) {
348 d
= delay_calc_array(block
, src
->array
.id
, instr
, i
+1, soft
, pred
, 6);
349 } else if (src
->instr
) {
350 d
= delay_calc_srcn(block
, src
->instr
, instr
, i
+1, soft
, pred
);
353 delay
= MAX2(delay
, d
);
356 if (instr
->address
) {
357 unsigned d
= delay_calc_srcn(block
, instr
->address
, instr
, 0, soft
, pred
);
358 delay
= MAX2(delay
, d
);
365 * Remove nop instructions. The scheduler can insert placeholder nop's
366 * so that ir3_delay_calc() can account for nop's that won't be needed
367 * due to nop's triggered by a previous instruction. However, before
368 * legalize, we want to remove these. The legalize pass can insert
369 * some nop's if needed to hold (for example) sync flags. This final
370 * remaining nops are inserted by legalize after this.
373 ir3_remove_nops(struct ir3
*ir
)
375 foreach_block (block
, &ir
->block_list
) {
376 foreach_instr_safe (instr
, &block
->instr_list
) {
377 if (instr
->opc
== OPC_NOP
) {
378 list_del(&instr
->node
);