/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)
/*
 * Instruction Scheduling:
 *
 * A recursive depth based scheduling algo.  Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions).  Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  I.e.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
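/*
 * Example of the a0.x serialization constraint (illustrative pseudo-asm,
 * not taken from a real shader): given two address-register pairs
 *
 *    (1) mov a0.x, r0.x          (2) mov a0.x, r1.x
 *        ldg r2.x, g[a0.x]           ldg r3.x, g[a0.x]
 *
 * the load of pair (1) must be scheduled before the a0.x write of pair (2),
 * otherwise the second write would clobber the value the first load needs.
 */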
struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove */
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
};
static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}
static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void use_instr(struct ir3_instruction *instr);
/* transfers a use-count to new instruction, for cases where we
 * "spill" address or predicate.  Note this might cause the
 * previous instruction that loaded a0.x/p0.x to become live
 * again, when we previously thought it was dead.
 */
static void
transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
		struct ir3_instruction *new_instr)
{
	struct ir3_instruction *src;

	debug_assert(is_scheduled(orig_instr));

	foreach_ssa_src_n(src, n, new_instr) {
		if (__is_false_dep(new_instr, n))
			continue;
		ctx->live_values += dest_regs(src);
		use_instr(src);
	}

	clear_cache(ctx, orig_instr);
}
static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}
static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}
static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}
static void
update_use_count(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			instr->use_count = 0;
		}
	}

	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	struct ir3_instruction *out;
	foreach_output(out, ir)
		use_instr(out);
}
#define NULL_INSTR ((void *)~0)
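/* Cache protocol for instr->data during scheduling: NULL means the result
 * has not been computed yet, NULL_INSTR means "nothing schedulable was found
 * from here", and any other value is the candidate cached by
 * find_instr_recursive() below.
 */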
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	foreach_instr (instr2, &ctx->depth_list) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	if (d)
		srcs[id] = NULL;

	return d;
}
struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};
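/* These notes are filled in by check_instr()/find_eligible_instr() and let
 * sched_block() react when nothing could be scheduled: an addr/pred conflict
 * is broken by cloning the a0.x/p0.x producer (see split_addr()/split_pred()
 * below).
 */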
/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}
/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	debug_assert(!is_scheduled(instr));

	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;

		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}
/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data == NULL_INSTR)
		return NULL;
	if (instr->data)
		return instr->data;

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}
/* find net change to live values if instruction were scheduled: */
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int new_live = dest_regs(instr);
	int old_live = 0;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		/* for split, just pass things along to the real src: */
		if (src->opc == OPC_META_SPLIT)
			src = ssa(src->regs[1]);

		/* for collect, if this is the last use of *each* src,
		 * then it will decrease the live values, since RA treats
		 * them as a whole:
		 */
		if (src->opc == OPC_META_COLLECT) {
			struct ir3_instruction *src2;
			bool last_use = true;

			foreach_ssa_src(src2, src) {
				if (src2->use_count > 1) {
					last_use = false;
					break;
				}
			}

			if (last_use)
				old_live += dest_regs(src);
		} else {
			debug_assert(src->use_count > 0);

			if (src->use_count == 1) {
				old_live += dest_regs(src);
			}
		}
	}

	return new_live - old_live;
}
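/* For instance (illustrative values, not from a real shader): an add whose
 * two scalar srcs are both at their last use has new_live = 1 and
 * old_live = 2, so live_effect() returns -1, i.e. scheduling it reduces the
 * estimated register pressure by one value.
 */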
/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	int best_rank = INT_MAX;      /* lower is better */
	unsigned deepest = 0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	foreach_instr_rev (instr, &ctx->depth_list) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (is_meta(candidate))
			return candidate;

		deepest = MAX2(deepest, candidate->depth);
	}

	/* traverse the list a second time.. but since we cache the result of
	 * find_instr_recursive() it isn't as bad as it looks.
	 */
	foreach_instr_rev (instr, &ctx->depth_list) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		/* determine net change to # of live values: */
		int le = live_effect(candidate);

		/* if there is a net increase in # of live values, then apply some
		 * threshold to avoid instructions getting scheduled *too* early
		 * and increasing register pressure.
		 */
		if (le >= 1) {
			unsigned threshold;

			if (ctx->live_values > 4*4) {
				threshold = 4;
			} else {
				threshold = 6;
			}

			/* Filter out any "shallow" instructions which would otherwise
			 * tend to get scheduled too early to fill delay slots even
			 * when they are not needed for a while.  There will probably
			 * be later delay slots that they could just as easily fill.
			 *
			 * A classic case where this comes up is frag shaders that
			 * write a constant value (like 1.0f) to one of the channels
			 * of the output color(s).  Since the mov from immed has no
			 * dependencies, it would otherwise get scheduled early to
			 * fill delay slots, occupying a register until the end of
			 * the program.
			 */
			if ((deepest - candidate->depth) > threshold)
				continue;
		}

		int rank = ir3_delay_calc(ctx->block, candidate, soft, false);

		/* if too many live values, prioritize instructions that reduce the
		 * number of live values:
		 */
		if (ctx->live_values > 16*4) {
			rank = le;
		} else if (ctx->live_values > 4*4) {
			rank += le;
		}

		if (rank < best_rank) {
			best_instr = candidate;
			best_rank = rank;
		}
	}

	return best_instr;
}
static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
	ir3_insert_by_depth(new_instr, &ctx->depth_list);
	transfer_use(ctx, orig_instr, new_instr);
	return new_instr;
}
579 /* "spill" the address register by remapping any unscheduled
580 * instructions which depend on the current address register
581 * to a clone of the instruction which wrote the address reg.
583 static struct ir3_instruction
*
584 split_addr(struct ir3_sched_ctx
*ctx
)
587 struct ir3_instruction
*new_addr
= NULL
;
590 debug_assert(ctx
->addr
);
592 ir
= ctx
->addr
->block
->shader
;
594 for (i
= 0; i
< ir
->indirects_count
; i
++) {
595 struct ir3_instruction
*indirect
= ir
->indirects
[i
];
600 /* skip instructions already scheduled: */
601 if (is_scheduled(indirect
))
604 /* remap remaining instructions using current addr
607 if (indirect
->address
== ctx
->addr
) {
609 new_addr
= split_instr(ctx
, ctx
->addr
);
610 /* original addr is scheduled, but new one isn't: */
611 new_addr
->flags
&= ~IR3_INSTR_MARK
;
613 indirect
->address
= NULL
;
614 ir3_instr_set_address(indirect
, new_addr
);
618 /* all remaining indirects remapped to new addr: */
624 /* "spill" the predicate register by remapping any unscheduled
625 * instructions which depend on the current predicate register
626 * to a clone of the instruction which wrote the address reg.
628 static struct ir3_instruction
*
629 split_pred(struct ir3_sched_ctx
*ctx
)
632 struct ir3_instruction
*new_pred
= NULL
;
635 debug_assert(ctx
->pred
);
637 ir
= ctx
->pred
->block
->shader
;
639 for (i
= 0; i
< ir
->predicates_count
; i
++) {
640 struct ir3_instruction
*predicated
= ir
->predicates
[i
];
642 /* skip instructions already scheduled: */
643 if (is_scheduled(predicated
))
646 /* remap remaining instructions using current pred
649 * TODO is there ever a case when pred isn't first
652 if (ssa(predicated
->regs
[1]) == ctx
->pred
) {
654 new_pred
= split_instr(ctx
, ctx
->pred
);
655 /* original pred is scheduled, but new one isn't: */
656 new_pred
->flags
&= ~IR3_INSTR_MARK
;
658 predicated
->regs
[1]->instr
= new_pred
;
662 /* all remaining predicated remapped to new pred: */
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 *
	 * Finally, move all the remaining instructions to the depth-
	 * list.
	 */
	foreach_instr_safe (instr, &unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &unscheduled_list)
		ir3_insert_by_depth(instr, &ctx->depth_list);

	while (!list_is_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
			d("delay=%u", delay);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}
}
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	foreach_block (block, &ir->block_list) {
		ctx.live_values = 0;
		sched_block(&ctx, block);
	}

	return 0;
}
static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}
/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}
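/* E.g. (illustrative): a load from an SSBO (IR3_BARRIER_BUFFER_R) depends on
 * an earlier store to an SSBO (IR3_BARRIER_BUFFER_W) via barrier_conflict,
 * while a read and a write to two *different* ir3 arrays get no false-dep
 * thanks to the array-id check above.
 */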
static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			continue;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			continue;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}
/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	foreach_instr (instr, &block->instr_list) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		calculate_deps(block);
	}
}