/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
/*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre and post RA, so we can't assume
 * ssa form.  (NOTE(review): the tail of this sentence was lost in
 * extraction -- confirm exact wording against upstream.)
 */
38 /* generally don't count false dependencies, since this can just be
39 * something like a barrier, or SSBO store. The exception is array
40 * dependencies if the assigner is an array write and the consumer
41 * reads the same array.
44 ignore_dep(struct ir3_instruction
*assigner
,
45 struct ir3_instruction
*consumer
, unsigned n
)
47 if (!__is_false_dep(consumer
, n
))
50 if (assigner
->barrier_class
& IR3_BARRIER_ARRAY_W
) {
51 struct ir3_register
*dst
= assigner
->regs
[0];
52 struct ir3_register
*src
;
54 debug_assert(dst
->flags
& IR3_REG_ARRAY
);
56 foreach_src (src
, consumer
) {
57 if ((src
->flags
& IR3_REG_ARRAY
) &&
58 (dst
->array
.id
== src
->array
.id
)) {
67 /* calculate required # of delay slots between the instruction that
68 * assigns a value and the one that consumes
71 ir3_delayslots(struct ir3_instruction
*assigner
,
72 struct ir3_instruction
*consumer
, unsigned n
, bool soft
)
74 if (ignore_dep(assigner
, consumer
, n
))
77 /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
78 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
79 * handled with sync bits
82 if (is_meta(assigner
) || is_meta(consumer
))
85 if (writes_addr(assigner
))
88 /* On a6xx, it takes the number of delay slots to get a SFU result
89 * back (ie. using nop's instead of (ss) is:
95 * and so on. Not quite sure where it tapers out (ie. how many
96 * warps share an SFU unit). But 10 seems like a reasonable #
99 if (soft
&& is_sfu(assigner
))
102 /* handled via sync flags: */
103 if (is_sfu(assigner
) || is_tex(assigner
) || is_mem(assigner
))
106 /* assigner must be alu: */
107 if (is_flow(consumer
) || is_sfu(consumer
) || is_tex(consumer
) ||
110 } else if ((is_mad(consumer
->opc
) || is_madsh(consumer
->opc
)) &&
112 /* special case, 3rd src to cat3 not required on first cycle */
120 count_instruction(struct ir3_instruction
*n
)
122 /* NOTE: don't count branch/jump since we don't know yet if they will
123 * be eliminated later in resolve_jumps().. really should do that
124 * earlier so we don't have this constraint.
126 return is_alu(n
) || (is_flow(n
) && (n
->opc
!= OPC_JUMP
) && (n
->opc
!= OPC_BR
));
130 * @block: the block to search in, starting from end; in first pass,
131 * this will be the block the instruction would be inserted into
132 * (but has not yet, ie. it only contains already scheduled
133 * instructions). For intra-block scheduling (second pass), this
134 * would be one of the predecessor blocks.
135 * @instr: the instruction to search for
136 * @maxd: max distance, bail after searching this # of instruction
137 * slots, since it means the instruction we are looking for is
139 * @pred: if true, recursively search into predecessor blocks to
140 * find the worst case (shortest) distance (only possible after
141 * individual blocks are all scheduled)
144 distance(struct ir3_block
*block
, struct ir3_instruction
*instr
,
145 unsigned maxd
, bool pred
)
149 /* Note that this relies on incrementally building up the block's
150 * instruction list.. but this is how scheduling and nopsched
153 foreach_instr_rev (n
, &block
->instr_list
) {
154 if ((n
== instr
) || (d
>= maxd
))
155 return MIN2(maxd
, d
+ n
->nop
);
156 if (count_instruction(n
))
157 d
= MIN2(maxd
, d
+ 1 + n
->repeat
+ n
->nop
);
160 /* if coming from a predecessor block, assume it is assigned far
161 * enough away.. we'll fix up later.
166 if (pred
&& (block
->data
!= block
)) {
167 /* Search into predecessor blocks, finding the one with the
168 * shortest distance, since that will be the worst case
170 unsigned min
= maxd
- d
;
172 /* (ab)use block->data to prevent recursion: */
175 set_foreach (block
->predecessors
, entry
) {
176 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
179 n
= distance(pred
, instr
, min
, pred
);
191 /* calculate delay for specified src: */
193 delay_calc_srcn(struct ir3_block
*block
,
194 struct ir3_instruction
*assigner
,
195 struct ir3_instruction
*consumer
,
196 unsigned srcn
, bool soft
, bool pred
)
200 if (is_meta(assigner
)) {
201 struct ir3_register
*src
;
202 foreach_src (src
, assigner
) {
208 d
= delay_calc_srcn(block
, src
->instr
, consumer
, srcn
, soft
, pred
);
209 delay
= MAX2(delay
, d
);
212 delay
= ir3_delayslots(assigner
, consumer
, srcn
, soft
);
213 delay
-= distance(block
, assigner
, delay
, pred
);
219 static struct ir3_instruction
*
220 find_array_write(struct ir3_block
*block
, unsigned array_id
, unsigned maxd
)
224 /* Note that this relies on incrementally building up the block's
225 * instruction list.. but this is how scheduling and nopsched
228 foreach_instr_rev (n
, &block
->instr_list
) {
231 if (count_instruction(n
))
233 if (dest_regs(n
) == 0)
236 /* note that a dest reg will never be an immediate */
237 if (n
->regs
[0]->array
.id
== array_id
)
244 /* like list_length() but only counts instructions which count in the
245 * delay determination:
248 count_block_delay(struct ir3_block
*block
)
251 foreach_instr (n
, &block
->instr_list
) {
252 if (!count_instruction(n
))
260 delay_calc_array(struct ir3_block
*block
, unsigned array_id
,
261 struct ir3_instruction
*consumer
, unsigned srcn
,
262 bool soft
, bool pred
, unsigned maxd
)
264 struct ir3_instruction
*assigner
;
266 assigner
= find_array_write(block
, array_id
, maxd
);
268 return delay_calc_srcn(block
, assigner
, consumer
, srcn
, soft
, pred
);
273 unsigned len
= count_block_delay(block
);
279 if (block
->data
== block
) {
280 /* we have a loop, return worst case: */
284 /* If we need to search into predecessors, find the one with the
285 * max delay.. the resulting delay is that minus the number of
286 * counted instructions in this block:
290 /* (ab)use block->data to prevent recursion: */
293 set_foreach (block
->predecessors
, entry
) {
294 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
296 delay_calc_array(pred
, array_id
, consumer
, srcn
, soft
, pred
, maxd
);
298 max
= MAX2(max
, delay
);
310 * Calculate delay for instruction (maximum of delay for all srcs):
312 * @soft: If true, add additional delay for situations where they
313 * would not be strictly required because a sync flag would be
314 * used (but scheduler would prefer to schedule some other
315 * instructions first to avoid stalling on sync flag)
316 * @pred: If true, recurse into predecessor blocks
319 ir3_delay_calc(struct ir3_block
*block
, struct ir3_instruction
*instr
,
320 bool soft
, bool pred
)
323 struct ir3_register
*src
;
325 foreach_src_n (src
, i
, instr
) {
328 if ((src
->flags
& IR3_REG_RELATIV
) && !(src
->flags
& IR3_REG_CONST
)) {
329 d
= delay_calc_array(block
, src
->array
.id
, instr
, i
+1, soft
, pred
, 6);
330 } else if (src
->instr
) {
331 d
= delay_calc_srcn(block
, src
->instr
, instr
, i
+1, soft
, pred
);
334 delay
= MAX2(delay
, d
);
337 if (instr
->address
) {
338 unsigned d
= delay_calc_srcn(block
, instr
->address
, instr
, 0, soft
, pred
);
339 delay
= MAX2(delay
, d
);
346 * Remove nop instructions. The scheduler can insert placeholder nop's
347 * so that ir3_delay_calc() can account for nop's that won't be needed
348 * due to nop's triggered by a previous instruction. However, before
349 * legalize, we want to remove these. The legalize pass can insert
350 * some nop's if needed to hold (for example) sync flags. This final
351 * remaining nops are inserted by legalize after this.
354 ir3_remove_nops(struct ir3
*ir
)
356 foreach_block (block
, &ir
->block_list
) {
357 foreach_instr_safe (instr
, &block
->instr_list
) {
358 if (instr
->opc
== OPC_NOP
) {
359 list_del(&instr
->node
);