/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
#include "util/u_math.h"

#include "ir3.h"
/*
 * Instruction Scheduling:
 *
 * A recursive depth based scheduling algo.  Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions).  Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
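
/* For example, given two write/use pairs of a0.x:
 *
 *    (1) write a0.x
 *    (2)   ... use a0.x
 *    (3) write a0.x
 *    (4)   ... use a0.x
 *
 * (3) cannot be scheduled between (1) and (2), since there is only a
 * single a0.x.  So once (1) is scheduled, remaining users of its value
 * (such as (2)) are prioritized until a0.x is free again.
 */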
struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
};
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}
#define NULL_INSTR ((void *)~0)
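
/* NULL_INSTR is a sentinel stored in instr->data meaning "recursion found
 * no candidate", as opposed to NULL meaning "not cached yet".  clear_cache()
 * below drops cached entries that scheduling may have invalidated: all of
 * them if instr is NULL, otherwise just entries that resolved to instr (or
 * to NULL_INSTR, since a previously blocked subtree may have become
 * schedulable).
 */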
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}
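
/* return the src w/ the greatest depth, NULL'ing out its slot in srcs[]
 * so that successive calls hand back the remaining srcs in decreasing
 * depth order (and eventually NULL once the array is exhausted):
 */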
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}
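
/* count how many alu/flow instructions separate instr from the end of
 * the block's scheduled list (capped at maxd), ie. cycles that have
 * already elapsed and can be credited against required delay slots:
 */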
static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
		unsigned maxd)
{
	struct list_head *instr_list = &ctx->block->instr_list;
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
		if ((n == instr) || (d >= maxd))
			break;
		if (is_alu(n) || is_flow(n))
			d++;
	}

	return d;
}
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			if (src->block != assigner->block)
				continue;
			d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}
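
/* e.g. if the assigner needs 3 delay slots w.r.t. the consumer, but two
 * alu instructions were already scheduled after the assigner, only
 * 3 - 2 = 1 slot remains to be filled (with a nop, if nothing else is
 * eligible).
 */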
/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		/* for array writes, no need to delay on previous write: */
		if (i == 0)
			continue;
		if (src->block != instr->block)
			continue;
		d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}
struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};
static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}
/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}
/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}
/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data == NULL_INSTR)
		return NULL;
	if (instr->data)
		return instr->data;

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}
/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_instruction *best_instr = NULL;
	unsigned min_delay = ~0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		delay = delay_calc(ctx, candidate);
		if (delay < min_delay) {
			best_instr = candidate;
			min_delay = delay;
		}
	}

	return best_instr;
}
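
/* When nothing can be scheduled due to an addr/pred conflict, we "spill"
 * the special register: the writer instruction gets cloned and remaining
 * unscheduled users are pointed at the clone, which frees up the original
 * register.  The two helpers below do this for a0.x and p0.x.
 */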
402 /* "spill" the address register by remapping any unscheduled
403 * instructions which depend on the current address register
404 * to a clone of the instruction which wrote the address reg.
406 static struct ir3_instruction
*
407 split_addr(struct ir3_sched_ctx
*ctx
)
410 struct ir3_instruction
*new_addr
= NULL
;
413 debug_assert(ctx
->addr
);
415 ir
= ctx
->addr
->block
->shader
;
417 for (i
= 0; i
< ir
->indirects_count
; i
++) {
418 struct ir3_instruction
*indirect
= ir
->indirects
[i
];
423 /* skip instructions already scheduled: */
424 if (is_scheduled(indirect
))
427 /* remap remaining instructions using current addr
430 if (indirect
->address
== ctx
->addr
) {
432 new_addr
= ir3_instr_clone(ctx
->addr
);
433 /* original addr is scheduled, but new one isn't: */
434 new_addr
->flags
&= ~IR3_INSTR_MARK
;
436 ir3_instr_set_address(indirect
, new_addr
);
440 /* all remaining indirects remapped to new addr: */
446 /* "spill" the predicate register by remapping any unscheduled
447 * instructions which depend on the current predicate register
448 * to a clone of the instruction which wrote the address reg.
450 static struct ir3_instruction
*
451 split_pred(struct ir3_sched_ctx
*ctx
)
454 struct ir3_instruction
*new_pred
= NULL
;
457 debug_assert(ctx
->pred
);
459 ir
= ctx
->pred
->block
->shader
;
461 for (i
= 0; i
< ir
->predicates_count
; i
++) {
462 struct ir3_instruction
*predicated
= ir
->predicates
[i
];
464 /* skip instructions already scheduled: */
465 if (is_scheduled(predicated
))
468 /* remap remaining instructions using current pred
471 * TODO is there ever a case when pred isn't first
474 if (ssa(predicated
->regs
[1]) == ctx
->pred
) {
476 new_pred
= ir3_instr_clone(ctx
->pred
);
477 /* original pred is scheduled, but new one isn't: */
478 new_pred
->flags
&= ~IR3_INSTR_MARK
;
480 predicated
->regs
[1]->instr
= new_pred
;
484 /* all remaining predicated remapped to new pred: */
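
/* Schedule a single basic block: inputs/phis are scheduled first (so RA
 * sees their registers as occupied from the start), then we repeatedly
 * pick the eligible instruction w/ the smallest required delay, padding
 * w/ nop's whenever nothing can legally issue yet.
 */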
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting):
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input/phi instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if ((instr->opc == OPC_META_INPUT) || (instr->opc == OPC_META_PHI)) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}
	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes);

		if (instr) {
			unsigned delay = delay_calc(ctx, instr);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * register:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}
	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx, ctx->pred, delay);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}
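
/* Phi srcs may end up assigned different registers by RA, so give each
 * phi src a parallel-copy mov in its src block; the phi then reads the
 * movs, which RA can place/coalesce appropriately.
 */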
/* this is needed to ensure later RA stage succeeds: */
static void
sched_insert_parallel_copies(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->opc == OPC_META_PHI) {
			struct ir3_register *reg, *reg2;
			foreach_src(reg, instr) {
				struct ir3_instruction *src = reg->instr;
				struct ir3_instruction *mov = NULL;

				/* after CP we could end up w/ duplicate phi srcs: */
				foreach_src(reg2, instr) {
					if (reg == reg2)
						break;
					/* reg2 is before reg1 so already an inserted mov: */
					else if (reg2->instr->regs[1]->instr == src) {
						mov = reg2->instr;
						break;
					}
				}

				if (!mov) {
					mov = ir3_MOV(src->block, src, TYPE_U32);
					mov->regs[0]->flags |= IR3_REG_PHI_SRC;
					mov->regs[0]->instr = instr;
				}

				reg->instr = mov;
			}
		}
	}
}
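
/* Entry point: insert parallel copies for all phis, clear the MARK flags
 * (which this pass re-uses to mean "scheduled"), then schedule each block
 * in turn.
 */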
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_insert_parallel_copies(block);
	}
	ir3_clear_mark(ir);
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_block(&ctx, block);
	}
	return 0;
}