/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
30 #include "util/u_math.h"
35 * Instruction Scheduling:
37 * A priority-queue based scheduling algo. Add eligible instructions,
38 * ie. ones with all their dependencies scheduled, to the priority
39 * (depth) sorted queue (list). Pop highest priority instruction off
40 * the queue and schedule it, add newly eligible instructions to the
41 * priority queue, rinse, repeat.
43 * There are a few special cases that need to be handled, since sched
44 * is currently independent of register allocation. Usages of address
45 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
46 * if you have two pairs of instructions that write the same special
47 * register and then read it, then those pairs cannot be interleaved.
48 * To solve this, when we are in such a scheduling "critical section",
49 * and we encounter a conflicting write to a special register, we try
50 * to schedule any remaining instructions that use that value first.
/* Per-shader scheduling state, threaded through all the helpers below. */
struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
};
/* sfu and mem instructions cannot be scheduled back-to-back without a
 * nop in between; group them for the check in schedule().
 */
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}
67 schedule(struct ir3_sched_ctx
*ctx
, struct ir3_instruction
*instr
)
69 debug_assert(ctx
->block
== instr
->block
);
71 /* maybe there is a better way to handle this than just stuffing
72 * a nop.. ideally we'd know about this constraint in the
73 * scheduling and depth calculation..
75 if (ctx
->scheduled
&& is_sfu_or_mem(ctx
->scheduled
) && is_sfu_or_mem(instr
))
78 /* remove from depth list:
80 list_delinit(&instr
->node
);
82 if (writes_addr(instr
)) {
83 debug_assert(ctx
->addr
== NULL
);
87 if (writes_pred(instr
)) {
88 debug_assert(ctx
->pred
== NULL
);
92 instr
->flags
|= IR3_INSTR_MARK
;
94 list_addtail(&instr
->node
, &instr
->block
->instr_list
);
95 ctx
->scheduled
= instr
;
99 distance(struct ir3_sched_ctx
*ctx
, struct ir3_instruction
*instr
,
102 struct list_head
*instr_list
= &ctx
->block
->instr_list
;
105 list_for_each_entry_rev (struct ir3_instruction
, n
, instr_list
, node
) {
106 if ((n
== instr
) || (d
>= maxd
))
108 if (is_alu(n
) || is_flow(n
))
115 /* calculate delay for specified src: */
117 delay_calc_srcn(struct ir3_sched_ctx
*ctx
,
118 struct ir3_instruction
*assigner
,
119 struct ir3_instruction
*consumer
, unsigned srcn
)
123 if (is_meta(assigner
)) {
124 struct ir3_instruction
*src
;
125 foreach_ssa_src(src
, assigner
) {
127 if (src
->block
!= assigner
->block
)
129 d
= delay_calc_srcn(ctx
, src
, consumer
, srcn
);
130 delay
= MAX2(delay
, d
);
133 delay
= ir3_delayslots(assigner
, consumer
, srcn
);
134 delay
-= distance(ctx
, assigner
, delay
);
140 /* calculate delay for instruction (maximum of delay for all srcs): */
142 delay_calc(struct ir3_sched_ctx
*ctx
, struct ir3_instruction
*instr
)
145 struct ir3_instruction
*src
;
147 foreach_ssa_src_n(src
, i
, instr
) {
149 if (src
->block
!= instr
->block
)
151 d
= delay_calc_srcn(ctx
, src
, instr
, i
);
152 delay
= MAX2(delay
, d
);
/* Feedback from a scheduling pass: why eligible-looking instructions
 * could not actually be scheduled, so the caller can break deadlocks.
 */
struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};
169 static bool is_scheduled(struct ir3_instruction
*instr
)
171 return !!(instr
->flags
& IR3_INSTR_MARK
);
175 check_conflict(struct ir3_sched_ctx
*ctx
, struct ir3_sched_notes
*notes
,
176 struct ir3_instruction
*instr
)
178 /* if this is a write to address/predicate register, and that
179 * register is currently in use, we need to defer until it is
182 if (writes_addr(instr
) && ctx
->addr
) {
183 debug_assert(ctx
->addr
!= instr
);
184 notes
->addr_conflict
= true;
188 if (writes_pred(instr
) && ctx
->pred
) {
189 debug_assert(ctx
->pred
!= instr
);
190 notes
->pred_conflict
= true;
197 /* is this instruction ready to be scheduled? Return negative for not
198 * ready (updating notes if needed), or >= 0 to indicate number of
199 * delay slots needed.
202 instr_eligibility(struct ir3_sched_ctx
*ctx
, struct ir3_sched_notes
*notes
,
203 struct ir3_instruction
*instr
)
205 struct ir3_instruction
*src
;
208 /* Phi instructions can have a dependency on something not
209 * scheduled yet (for ex, loops). But OTOH we don't really
210 * care. By definition phi's should appear at the top of
211 * the block, and it's sources should be values from the
212 * previously executing block, so they are always ready to
215 if (is_meta(instr
) && (instr
->opc
== OPC_META_PHI
))
218 foreach_ssa_src(src
, instr
) {
219 /* if dependency not scheduled, we aren't ready yet: */
220 if (!is_scheduled(src
))
224 /* all our dependents are scheduled, figure out if
225 * we have enough delay slots to schedule ourself:
227 delay
= delay_calc(ctx
, instr
);
231 /* if the instruction is a kill, we need to ensure *every*
232 * bary.f is scheduled. The hw seems unhappy if the thread
233 * gets killed before the end-input (ei) flag is hit.
235 * We could do this by adding each bary.f instruction as
236 * virtual ssa src for the kill instruction. But we have
237 * fixed length instr->regs[].
239 * TODO this wouldn't be quite right if we had multiple
240 * basic blocks, if any block was conditional. We'd need
241 * to schedule the bary.f's outside of any block which
242 * was conditional that contained a kill.. I think..
244 if (is_kill(instr
)) {
245 struct ir3
*ir
= instr
->block
->shader
;
247 for (unsigned i
= 0; i
< ir
->baryfs_count
; i
++) {
248 struct ir3_instruction
*baryf
= ir
->baryfs
[i
];
249 if (baryf
->depth
== DEPTH_UNUSED
)
251 if (!is_scheduled(baryf
)) {
252 notes
->blocked_kill
= true;
258 if (check_conflict(ctx
, notes
, instr
))
264 /* could an instruction be scheduled if specified ssa src was scheduled? */
266 could_sched(struct ir3_instruction
*instr
, struct ir3_instruction
*src
)
268 struct ir3_instruction
*other_src
;
269 foreach_ssa_src(other_src
, instr
) {
270 /* if dependency not scheduled, we aren't ready yet: */
271 if ((src
!= other_src
) && !is_scheduled(other_src
)) {
278 /* move eligible instructions to the priority list: */
280 add_eligible_instrs(struct ir3_sched_ctx
*ctx
, struct ir3_sched_notes
*notes
,
281 struct list_head
*prio_queue
, struct list_head
*unscheduled_list
)
283 unsigned min_delay
= ~0;
285 list_for_each_entry_safe (struct ir3_instruction
, instr
, unscheduled_list
, node
) {
286 int e
= instr_eligibility(ctx
, notes
, instr
);
290 /* For instructions that write address register we need to
291 * make sure there is at least one instruction that uses the
292 * addr value which is otherwise ready.
294 * TODO if any instructions use pred register and have other
295 * src args, we would need to do the same for writes_pred()..
297 if (unlikely(writes_addr(instr
))) {
298 struct ir3
*ir
= instr
->block
->shader
;
300 for (unsigned i
= 0; (i
< ir
->indirects_count
) && !ready
; i
++) {
301 struct ir3_instruction
*indirect
= ir
->indirects
[i
];
302 if (indirect
->address
!= instr
)
304 ready
= could_sched(indirect
, instr
);
307 /* nothing could be scheduled, so keep looking: */
312 min_delay
= MIN2(min_delay
, e
);
314 /* remove from unscheduled list and into priority queue: */
315 list_delinit(&instr
->node
);
316 ir3_insert_by_depth(instr
, prio_queue
);
323 /* "spill" the address register by remapping any unscheduled
324 * instructions which depend on the current address register
325 * to a clone of the instruction which wrote the address reg.
327 static struct ir3_instruction
*
328 split_addr(struct ir3_sched_ctx
*ctx
)
331 struct ir3_instruction
*new_addr
= NULL
;
334 debug_assert(ctx
->addr
);
336 ir
= ctx
->addr
->block
->shader
;
338 for (i
= 0; i
< ir
->indirects_count
; i
++) {
339 struct ir3_instruction
*indirect
= ir
->indirects
[i
];
341 /* skip instructions already scheduled: */
342 if (is_scheduled(indirect
))
345 /* remap remaining instructions using current addr
348 if (indirect
->address
== ctx
->addr
) {
350 new_addr
= ir3_instr_clone(ctx
->addr
);
351 /* original addr is scheduled, but new one isn't: */
352 new_addr
->flags
&= ~IR3_INSTR_MARK
;
354 ir3_instr_set_address(indirect
, new_addr
);
358 /* all remaining indirects remapped to new addr: */
364 /* "spill" the predicate register by remapping any unscheduled
365 * instructions which depend on the current predicate register
366 * to a clone of the instruction which wrote the address reg.
368 static struct ir3_instruction
*
369 split_pred(struct ir3_sched_ctx
*ctx
)
372 struct ir3_instruction
*new_pred
= NULL
;
375 debug_assert(ctx
->pred
);
377 ir
= ctx
->pred
->block
->shader
;
379 for (i
= 0; i
< ir
->predicates_count
; i
++) {
380 struct ir3_instruction
*predicated
= ir
->predicates
[i
];
382 /* skip instructions already scheduled: */
383 if (is_scheduled(predicated
))
386 /* remap remaining instructions using current pred
389 * TODO is there ever a case when pred isn't first
392 if (ssa(predicated
->regs
[1]) == ctx
->pred
) {
394 new_pred
= ir3_instr_clone(ctx
->pred
);
395 /* original pred is scheduled, but new one isn't: */
396 new_pred
->flags
&= ~IR3_INSTR_MARK
;
398 predicated
->regs
[1]->instr
= new_pred
;
402 /* all remaining predicated remapped to new pred: */
409 sched_block(struct ir3_sched_ctx
*ctx
, struct ir3_block
*block
)
411 struct list_head unscheduled_list
, prio_queue
;
415 /* move all instructions to the unscheduled list, and
416 * empty the block's instruction list (to which we will
419 list_replace(&block
->instr_list
, &unscheduled_list
);
420 list_inithead(&block
->instr_list
);
421 list_inithead(&prio_queue
);
423 /* first a pre-pass to schedule all meta:input/phi instructions
424 * (which need to appear first so that RA knows the register is
427 list_for_each_entry_safe (struct ir3_instruction
, instr
, &unscheduled_list
, node
) {
428 if (is_meta(instr
) && ((instr
->opc
== OPC_META_INPUT
) ||
429 (instr
->opc
== OPC_META_PHI
)))
430 schedule(ctx
, instr
);
433 while (!(list_empty(&unscheduled_list
) &&
434 list_empty(&prio_queue
))) {
435 struct ir3_sched_notes notes
= {0};
438 delay
= add_eligible_instrs(ctx
, ¬es
, &prio_queue
, &unscheduled_list
);
440 if (!list_empty(&prio_queue
)) {
441 struct ir3_instruction
*instr
= list_last_entry(&prio_queue
,
442 struct ir3_instruction
, node
);
443 /* ugg, this is a bit ugly, but between the time when
444 * the instruction became eligible and now, a new
445 * conflict may have arose..
447 if (check_conflict(ctx
, ¬es
, instr
)) {
448 list_del(&instr
->node
);
449 list_addtail(&instr
->node
, &unscheduled_list
);
453 schedule(ctx
, instr
);
454 } else if (delay
== ~0) {
455 struct ir3_instruction
*new_instr
= NULL
;
457 /* nothing available to schedule.. if we are blocked on
458 * address/predicate register conflict, then break the
459 * deadlock by cloning the instruction that wrote that
462 if (notes
.addr_conflict
) {
463 new_instr
= split_addr(ctx
);
464 } else if (notes
.pred_conflict
) {
465 new_instr
= split_pred(ctx
);
473 list_del(&new_instr
->node
);
474 list_addtail(&new_instr
->node
, &unscheduled_list
);
478 /* and if we run out of instructions that can be scheduled,
479 * then it is time for nop's:
481 debug_assert(delay
<= 6);
489 /* And lastly, insert branch/jump instructions to take us to
490 * the next block. Later we'll strip back out the branches
491 * that simply jump to next instruction.
493 if (block
->successors
[1]) {
494 /* if/else, conditional branches to "then" or "else": */
495 struct ir3_instruction
*br
;
498 debug_assert(ctx
->pred
);
499 debug_assert(block
->condition
);
501 delay
-= distance(ctx
, ctx
->pred
, delay
);
508 /* create "else" branch first (since "then" block should
509 * frequently/always end up being a fall-thru):
513 br
->cat0
.target
= block
->successors
[1];
515 /* NOTE: we have to hard code delay of 6 above, since
516 * we want to insert the nop's before constructing the
517 * branch. Throw in an assert so we notice if this
518 * ever breaks on future generation:
520 debug_assert(ir3_delayslots(ctx
->pred
, br
, 0) == 6);
523 br
->cat0
.target
= block
->successors
[0];
525 } else if (block
->successors
[0]) {
526 /* otherwise unconditional jump to next block: */
527 struct ir3_instruction
*jmp
;
529 jmp
= ir3_JUMP(block
);
530 jmp
->cat0
.target
= block
->successors
[0];
533 /* NOTE: if we kept track of the predecessors, we could do a better
534 * job w/ (jp) flags.. every node w/ > predecessor is a join point.
535 * Note that as we eliminate blocks which contain only an unconditional
536 * jump we probably need to propagate (jp) flag..
540 /* this is needed to ensure later RA stage succeeds: */
542 sched_insert_parallel_copies(struct ir3_block
*block
)
544 list_for_each_entry (struct ir3_instruction
, instr
, &block
->instr_list
, node
) {
545 if (is_meta(instr
) && (instr
->opc
== OPC_META_PHI
)) {
546 struct ir3_register
*reg
;
547 foreach_src(reg
, instr
) {
548 struct ir3_instruction
*src
= reg
->instr
;
549 struct ir3_instruction
*mov
=
550 ir3_MOV(src
->block
, src
, TYPE_U32
);
551 mov
->regs
[0]->flags
|= IR3_REG_PHI_SRC
;
552 mov
->regs
[0]->instr
= instr
;
559 int ir3_sched(struct ir3
*ir
)
561 struct ir3_sched_ctx ctx
= {0};
562 list_for_each_entry (struct ir3_block
, block
, &ir
->block_list
, node
) {
563 sched_insert_parallel_copies(block
);
566 list_for_each_entry (struct ir3_block
, block
, &ir
->block_list
, node
) {
567 sched_block(&ctx
, block
);