/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
/*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre and post RA, so we can't assume ssa
 * src iterators work.
 */
38 /* generally don't count false dependencies, since this can just be
39 * something like a barrier, or SSBO store. The exception is array
40 * dependencies if the assigner is an array write and the consumer
41 * reads the same array.
44 ignore_dep(struct ir3_instruction
*assigner
,
45 struct ir3_instruction
*consumer
, unsigned n
)
47 if (!__is_false_dep(consumer
, n
))
50 if (assigner
->barrier_class
& IR3_BARRIER_ARRAY_W
) {
51 struct ir3_register
*dst
= assigner
->regs
[0];
53 debug_assert(dst
->flags
& IR3_REG_ARRAY
);
55 foreach_src (src
, consumer
) {
56 if ((src
->flags
& IR3_REG_ARRAY
) &&
57 (dst
->array
.id
== src
->array
.id
)) {
66 /* calculate required # of delay slots between the instruction that
67 * assigns a value and the one that consumes
70 ir3_delayslots(struct ir3_instruction
*assigner
,
71 struct ir3_instruction
*consumer
, unsigned n
, bool soft
)
73 if (ignore_dep(assigner
, consumer
, n
))
76 /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
77 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
78 * handled with sync bits
81 if (is_meta(assigner
) || is_meta(consumer
))
84 if (writes_addr0(assigner
) || writes_addr1(assigner
))
87 /* On a6xx, it takes the number of delay slots to get a SFU result
88 * back (ie. using nop's instead of (ss) is:
94 * and so on. Not quite sure where it tapers out (ie. how many
95 * warps share an SFU unit). But 10 seems like a reasonable #
98 if (soft
&& is_sfu(assigner
))
101 /* handled via sync flags: */
102 if (is_sfu(assigner
) || is_tex(assigner
) || is_mem(assigner
))
105 /* assigner must be alu: */
106 if (is_flow(consumer
) || is_sfu(consumer
) || is_tex(consumer
) ||
109 } else if ((is_mad(consumer
->opc
) || is_madsh(consumer
->opc
)) &&
111 /* special case, 3rd src to cat3 not required on first cycle */
119 count_instruction(struct ir3_instruction
*n
)
121 /* NOTE: don't count branch/jump since we don't know yet if they will
122 * be eliminated later in resolve_jumps().. really should do that
123 * earlier so we don't have this constraint.
125 return is_alu(n
) || (is_flow(n
) && (n
->opc
!= OPC_JUMP
) && (n
->opc
!= OPC_B
));
129 * @block: the block to search in, starting from end; in first pass,
130 * this will be the block the instruction would be inserted into
131 * (but has not yet, ie. it only contains already scheduled
132 * instructions). For intra-block scheduling (second pass), this
133 * would be one of the predecessor blocks.
134 * @instr: the instruction to search for
135 * @maxd: max distance, bail after searching this # of instruction
136 * slots, since it means the instruction we are looking for is
138 * @pred: if true, recursively search into predecessor blocks to
139 * find the worst case (shortest) distance (only possible after
140 * individual blocks are all scheduled)
143 distance(struct ir3_block
*block
, struct ir3_instruction
*instr
,
144 unsigned maxd
, bool pred
)
148 /* Note that this relies on incrementally building up the block's
149 * instruction list.. but this is how scheduling and nopsched
152 foreach_instr_rev (n
, &block
->instr_list
) {
153 if ((n
== instr
) || (d
>= maxd
))
154 return MIN2(maxd
, d
+ n
->nop
);
155 if (count_instruction(n
))
156 d
= MIN2(maxd
, d
+ 1 + n
->repeat
+ n
->nop
);
159 /* if coming from a predecessor block, assume it is assigned far
160 * enough away.. we'll fix up later.
165 if (pred
&& (block
->data
!= block
)) {
166 /* Search into predecessor blocks, finding the one with the
167 * shortest distance, since that will be the worst case
169 unsigned min
= maxd
- d
;
171 /* (ab)use block->data to prevent recursion: */
174 set_foreach (block
->predecessors
, entry
) {
175 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
178 n
= distance(pred
, instr
, min
, pred
);
190 /* calculate delay for specified src: */
192 delay_calc_srcn(struct ir3_block
*block
,
193 struct ir3_instruction
*assigner
,
194 struct ir3_instruction
*consumer
,
195 unsigned srcn
, bool soft
, bool pred
)
199 if (is_meta(assigner
)) {
200 foreach_src (src
, assigner
) {
206 d
= delay_calc_srcn(block
, src
->instr
, consumer
, srcn
, soft
, pred
);
207 delay
= MAX2(delay
, d
);
210 delay
= ir3_delayslots(assigner
, consumer
, srcn
, soft
);
211 delay
-= distance(block
, assigner
, delay
, pred
);
217 static struct ir3_instruction
*
218 find_array_write(struct ir3_block
*block
, unsigned array_id
, unsigned maxd
)
222 /* Note that this relies on incrementally building up the block's
223 * instruction list.. but this is how scheduling and nopsched
226 foreach_instr_rev (n
, &block
->instr_list
) {
229 if (count_instruction(n
))
231 if (dest_regs(n
) == 0)
234 /* note that a dest reg will never be an immediate */
235 if (n
->regs
[0]->array
.id
== array_id
)
242 /* like list_length() but only counts instructions which count in the
243 * delay determination:
246 count_block_delay(struct ir3_block
*block
)
249 foreach_instr (n
, &block
->instr_list
) {
250 if (!count_instruction(n
))
258 delay_calc_array(struct ir3_block
*block
, unsigned array_id
,
259 struct ir3_instruction
*consumer
, unsigned srcn
,
260 bool soft
, bool pred
, unsigned maxd
)
262 struct ir3_instruction
*assigner
;
264 assigner
= find_array_write(block
, array_id
, maxd
);
266 return delay_calc_srcn(block
, assigner
, consumer
, srcn
, soft
, pred
);
271 unsigned len
= count_block_delay(block
);
277 if (block
->data
== block
) {
278 /* we have a loop, return worst case: */
282 /* If we need to search into predecessors, find the one with the
283 * max delay.. the resulting delay is that minus the number of
284 * counted instructions in this block:
288 /* (ab)use block->data to prevent recursion: */
291 set_foreach (block
->predecessors
, entry
) {
292 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
294 delay_calc_array(pred
, array_id
, consumer
, srcn
, soft
, pred
, maxd
);
296 max
= MAX2(max
, delay
);
308 * Calculate delay for instruction (maximum of delay for all srcs):
310 * @soft: If true, add additional delay for situations where they
311 * would not be strictly required because a sync flag would be
312 * used (but scheduler would prefer to schedule some other
313 * instructions first to avoid stalling on sync flag)
314 * @pred: If true, recurse into predecessor blocks
317 ir3_delay_calc(struct ir3_block
*block
, struct ir3_instruction
*instr
,
318 bool soft
, bool pred
)
322 foreach_src_n (src
, i
, instr
) {
325 if ((src
->flags
& IR3_REG_RELATIV
) && !(src
->flags
& IR3_REG_CONST
)) {
326 d
= delay_calc_array(block
, src
->array
.id
, instr
, i
+1, soft
, pred
, 6);
327 } else if (src
->instr
) {
328 d
= delay_calc_srcn(block
, src
->instr
, instr
, i
+1, soft
, pred
);
331 delay
= MAX2(delay
, d
);
334 if (instr
->address
) {
335 unsigned d
= delay_calc_srcn(block
, instr
->address
, instr
, 0, soft
, pred
);
336 delay
= MAX2(delay
, d
);
343 * Remove nop instructions. The scheduler can insert placeholder nop's
344 * so that ir3_delay_calc() can account for nop's that won't be needed
345 * due to nop's triggered by a previous instruction. However, before
346 * legalize, we want to remove these. The legalize pass can insert
347 * some nop's if needed to hold (for example) sync flags. This final
348 * remaining nops are inserted by legalize after this.
351 ir3_remove_nops(struct ir3
*ir
)
353 foreach_block (block
, &ir
->block_list
) {
354 foreach_instr_safe (instr
, &block
->instr_list
) {
355 if (instr
->opc
== OPC_NOP
) {
356 list_del(&instr
->node
);