/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
/*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre and post RA, so we can't assume ssa
 * src iterators work.
 */
38 /* generally don't count false dependencies, since this can just be
39 * something like a barrier, or SSBO store. The exception is array
40 * dependencies if the assigner is an array write and the consumer
41 * reads the same array.
44 ignore_dep(struct ir3_instruction
*assigner
,
45 struct ir3_instruction
*consumer
, unsigned n
)
47 if (!__is_false_dep(consumer
, n
))
50 if (assigner
->barrier_class
& IR3_BARRIER_ARRAY_W
) {
51 struct ir3_register
*dst
= assigner
->regs
[0];
52 struct ir3_register
*src
;
54 debug_assert(dst
->flags
& IR3_REG_ARRAY
);
56 foreach_src (src
, consumer
) {
57 if ((src
->flags
& IR3_REG_ARRAY
) &&
58 (dst
->array
.id
== src
->array
.id
)) {
67 /* calculate required # of delay slots between the instruction that
68 * assigns a value and the one that consumes
71 ir3_delayslots(struct ir3_instruction
*assigner
,
72 struct ir3_instruction
*consumer
, unsigned n
)
74 if (ignore_dep(assigner
, consumer
, n
))
77 /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
78 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
79 * handled with sync bits
82 if (is_meta(assigner
) || is_meta(consumer
))
85 if (writes_addr(assigner
))
88 /* handled via sync flags: */
89 if (is_sfu(assigner
) || is_tex(assigner
) || is_mem(assigner
))
92 /* assigner must be alu: */
93 if (is_flow(consumer
) || is_sfu(consumer
) || is_tex(consumer
) ||
96 } else if ((is_mad(consumer
->opc
) || is_madsh(consumer
->opc
)) &&
98 /* special case, 3rd src to cat3 not required on first cycle */
106 count_instruction(struct ir3_instruction
*n
)
108 /* NOTE: don't count branch/jump since we don't know yet if they will
109 * be eliminated later in resolve_jumps().. really should do that
110 * earlier so we don't have this constraint.
112 return is_alu(n
) || (is_flow(n
) && (n
->opc
!= OPC_JUMP
) && (n
->opc
!= OPC_BR
));
116 * @block: the block to search in, starting from end; in first pass,
117 * this will be the block the instruction would be inserted into
118 * (but has not yet, ie. it only contains already scheduled
119 * instructions). For intra-block scheduling (second pass), this
120 * would be one of the predecessor blocks.
121 * @instr: the instruction to search for
122 * @maxd: max distance, bail after searching this # of instruction
123 * slots, since it means the instruction we are looking for is
125 * @pred: if true, recursively search into predecessor blocks to
126 * find the worst case (shortest) distance (only possible after
127 * individual blocks are all scheduled)
130 ir3_distance(struct ir3_block
*block
, struct ir3_instruction
*instr
,
131 unsigned maxd
, bool pred
)
135 /* Note that this relies on incrementally building up the block's
136 * instruction list.. but this is how scheduling and nopsched
139 foreach_instr_rev (n
, &block
->instr_list
) {
140 if ((n
== instr
) || (d
>= maxd
))
141 return MIN2(maxd
, d
+ n
->nop
);
142 if (count_instruction(n
))
143 d
= MIN2(maxd
, d
+ 1 + n
->repeat
+ n
->nop
);
146 /* if coming from a predecessor block, assume it is assigned far
147 * enough away.. we'll fix up later.
152 if (pred
&& (block
->data
!= block
)) {
153 /* Search into predecessor blocks, finding the one with the
154 * shortest distance, since that will be the worst case
156 unsigned min
= maxd
- d
;
158 /* (ab)use block->data to prevent recursion: */
161 set_foreach (block
->predecessors
, entry
) {
162 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
165 n
= ir3_distance(pred
, instr
, min
, pred
);
177 /* calculate delay for specified src: */
179 delay_calc_srcn(struct ir3_block
*block
,
180 struct ir3_instruction
*assigner
,
181 struct ir3_instruction
*consumer
,
182 unsigned srcn
, bool soft
, bool pred
)
186 if (is_meta(assigner
)) {
187 struct ir3_register
*src
;
188 foreach_src (src
, assigner
) {
194 d
= delay_calc_srcn(block
, src
->instr
, consumer
, srcn
, soft
, pred
);
195 delay
= MAX2(delay
, d
);
199 if (is_sfu(assigner
)) {
202 delay
= ir3_delayslots(assigner
, consumer
, srcn
);
205 delay
= ir3_delayslots(assigner
, consumer
, srcn
);
207 delay
-= ir3_distance(block
, assigner
, delay
, pred
);
213 static struct ir3_instruction
*
214 find_array_write(struct ir3_block
*block
, unsigned array_id
, unsigned maxd
)
218 /* Note that this relies on incrementally building up the block's
219 * instruction list.. but this is how scheduling and nopsched
222 foreach_instr_rev (n
, &block
->instr_list
) {
225 if (count_instruction(n
))
227 if (dest_regs(n
) == 0)
230 /* note that a dest reg will never be an immediate */
231 if (n
->regs
[0]->array
.id
== array_id
)
238 /* like list_length() but only counts instructions which count in the
239 * delay determination:
242 count_block_delay(struct ir3_block
*block
)
245 foreach_instr (n
, &block
->instr_list
) {
246 if (!count_instruction(n
))
254 delay_calc_array(struct ir3_block
*block
, unsigned array_id
,
255 struct ir3_instruction
*consumer
, unsigned srcn
,
256 bool soft
, bool pred
, unsigned maxd
)
258 struct ir3_instruction
*assigner
;
260 assigner
= find_array_write(block
, array_id
, maxd
);
262 return delay_calc_srcn(block
, assigner
, consumer
, srcn
, soft
, pred
);
267 unsigned len
= count_block_delay(block
);
273 if (block
->data
== block
) {
274 /* we have a loop, return worst case: */
278 /* If we need to search into predecessors, find the one with the
279 * max delay.. the resulting delay is that minus the number of
280 * counted instructions in this block:
284 /* (ab)use block->data to prevent recursion: */
287 set_foreach (block
->predecessors
, entry
) {
288 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
290 delay_calc_array(pred
, array_id
, consumer
, srcn
, soft
, pred
, maxd
);
292 max
= MAX2(max
, delay
);
304 * Calculate delay for instruction (maximum of delay for all srcs):
306 * @soft: If true, add additional delay for situations where they
307 * would not be strictly required because a sync flag would be
308 * used (but scheduler would prefer to schedule some other
309 * instructions first to avoid stalling on sync flag)
310 * @pred: If true, recurse into predecessor blocks
313 ir3_delay_calc(struct ir3_block
*block
, struct ir3_instruction
*instr
,
314 bool soft
, bool pred
)
317 struct ir3_register
*src
;
319 foreach_src_n (src
, i
, instr
) {
322 if ((src
->flags
& IR3_REG_RELATIV
) && !(src
->flags
& IR3_REG_CONST
)) {
323 d
= delay_calc_array(block
, src
->array
.id
, instr
, i
+1, soft
, pred
, 6);
324 } else if (src
->instr
) {
325 d
= delay_calc_srcn(block
, src
->instr
, instr
, i
+1, soft
, pred
);
328 delay
= MAX2(delay
, d
);
331 if (instr
->address
) {
332 unsigned d
= delay_calc_srcn(block
, instr
->address
, instr
, 0, soft
, pred
);
333 delay
= MAX2(delay
, d
);
340 * Remove nop instructions. The scheduler can insert placeholder nop's
341 * so that ir3_delay_calc() can account for nop's that won't be needed
342 * due to nop's triggered by a previous instruction. However, before
343 * legalize, we want to remove these. The legalize pass can insert
344 * some nop's if needed to hold (for example) sync flags. This final
345 * remaining nops are inserted by legalize after this.
348 ir3_remove_nops(struct ir3
*ir
)
350 foreach_block (block
, &ir
->block_list
) {
351 foreach_instr_safe (instr
, &block
->instr_list
) {
352 if (instr
->opc
== OPC_NOP
) {
353 list_del(&instr
->node
);