/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)

#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)
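/* Both macros are no-ops unless the IR3_DBG_SCHEDMSGS debug flag is set:
 * d() prints a single "SCHED:"-prefixed line, while di() prints the
 * message and then dumps the instruction itself via ir3_print_instr().
 */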
/*
 * Instruction Scheduling:
 *
 * A recursive depth based scheduling algo.  Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions).  Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
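/* For example: if instruction A writes a0.x which is consumed by B, and
 * instruction C later re-writes a0.x for consumer D, then A and B must
 * both be emitted before C and D.  When C shows up as a candidate while
 * A's a0.x value is still live, the scheduler first tries to schedule B
 * (the remaining a0.x user) so that C's write no longer conflicts.
 */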
struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
	bool error;
};
static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}
static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void use_instr(struct ir3_instruction *instr);
/* transfers a use-count to new instruction, for cases where we
 * "spill" address or predicate.  Note this might cause the
 * previous instruction that loaded a0.x/p0.x to become live
 * again, when we previously thought it was dead.
 */
static void
transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
		struct ir3_instruction *new_instr)
{
	struct ir3_instruction *src;

	debug_assert(is_scheduled(orig_instr));

	foreach_ssa_src_n(src, n, new_instr) {
		if (__is_false_dep(new_instr, n))
			continue;
		ctx->live_values += dest_regs(src);
		use_instr(src);
	}

	clear_cache(ctx, orig_instr);
}
static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}
static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}
static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}
static void
update_use_count(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			instr->use_count = 0;
		}
	}

	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	struct ir3_instruction *out;
	foreach_output(out, ir)
		use_instr(out);
}
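/* Sentinel stored in instr->data meaning "cached result: nothing schedulable
 * beneath this instruction", as opposed to NULL which means "no cached result
 * yet" (see find_instr_recursive() and clear_cache()).
 */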
#define NULL_INSTR ((void *)~0)
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	foreach_instr (instr2, &ctx->depth_list) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}
/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	foreach_instr_rev (n, &block->instr_list) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		set_foreach(block->predecessors, entry) {
			struct ir3_block *pred = (struct ir3_block *)entry->key;
			unsigned n;

			n = distance(pred, instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}
/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}
struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};
/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}
/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	debug_assert(!is_scheduled(instr));

	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;

		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}
/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data == NULL_INSTR)
		return NULL;

	if (instr->data)
		return instr->data;

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}
/* find net change to live values if instruction were scheduled: */
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int new_live = dest_regs(instr);
	int old_live = 0;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		/* for split, just pass things along to the real src: */
		if (src->opc == OPC_META_SPLIT)
			src = ssa(src->regs[1]);

		/* for collect, if this is the last use of *each* src,
		 * then it will decrease the live values, since RA treats
		 * them as a whole:
		 */
		if (src->opc == OPC_META_COLLECT) {
			struct ir3_instruction *src2;
			bool last_use = true;

			foreach_ssa_src(src2, src) {
				if (src2->use_count > 1) {
					last_use = false;
					break;
				}
			}

			if (last_use)
				old_live += dest_regs(src);

		} else {
			debug_assert(src->use_count > 0);

			if (src->use_count == 1) {
				old_live += dest_regs(src);
			}
		}
	}

	return new_live - old_live;
}
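/* For example (assuming dest_regs() returns 1 for a scalar value): an ALU
 * op whose two scalar srcs are each used only here has a live_effect of
 * 1 - 2 = -1, ie. scheduling it frees a register, while a mov from an
 * immediate has no ssa srcs and so has a live_effect of +1.
 */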
/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	int best_rank = INT_MAX;      /* lower is better */
	unsigned deepest = 0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	foreach_instr_rev (instr, &ctx->depth_list) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (is_meta(candidate))
			return candidate;

		deepest = MAX2(deepest, candidate->depth);
	}

	/* traverse the list a second time.. but since we cache the result of
	 * find_instr_recursive() it isn't as bad as it looks.
	 */
	foreach_instr_rev (instr, &ctx->depth_list) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		/* determine net change to # of live values: */
		int le = live_effect(candidate);

		/* if there is a net increase in # of live values, then apply some
		 * threshold to avoid instructions getting scheduled *too* early
		 * and increasing register pressure.
		 */
		if (le >= 1) {
			unsigned threshold;

			if (ctx->live_values > 4*4) {
				threshold = 4;
			} else {
				threshold = 6;
			}

			/* Filter out any "shallow" instructions which would otherwise
			 * tend to get scheduled too early to fill delay slots even
			 * when they are not needed for a while.  There will probably
			 * be later delay slots that they could just as easily fill.
			 *
			 * A classic case where this comes up is frag shaders that
			 * write a constant value (like 1.0f) to one of the channels
			 * of the output color(s).  Since the mov from immed has no
			 * dependencies, it would otherwise get scheduled early to
			 * fill delay slots, occupying a register until the end of
			 * the program.
			 */
			if ((deepest - candidate->depth) > threshold)
				continue;
		}

		int rank = delay_calc(ctx->block, candidate, soft, false);

		/* if too many live values, prioritize instructions that reduce the
		 * number of live values:
		 */
		if (ctx->live_values > 16*4) {
			rank = le;
		} else if (ctx->live_values > 4*4) {
			rank += le;
		}

		if (rank < best_rank) {
			best_instr = candidate;
			best_rank = rank;
		}
	}

	return best_instr;
}
static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
	ir3_insert_by_depth(new_instr, &ctx->depth_list);
	transfer_use(ctx, orig_instr, new_instr);
	return new_instr;
}
/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = split_instr(ctx, ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			indirect->address = NULL;
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}
/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = split_instr(ctx, ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 *
	 * Finally, move all the remaining instructions to the depth-
	 * list
	 */
	foreach_instr_safe (instr, &unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &unscheduled_list)
		ir3_insert_by_depth(instr, &ctx->depth_list);

	while (!list_is_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			d("delay=%u", delay);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > predecessor is a join point.
	 *
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}
/* After scheduling individual blocks, we still could have cases where,
 * on one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal.  We can't deal with this in
 * the first pass, because of loops (ie. we can't ensure all predecessor
 * blocks are already scheduled in the first pass).  All we can really do
 * at this point is stuff in extra nop's until things are legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	foreach_instr_safe (instr, &block->instr_list) {
		unsigned delay = 0;

		set_foreach(block->predecessors, entry) {
			struct ir3_block *pred = (struct ir3_block *)entry->key;
			unsigned d = delay_calc(pred, instr, false, true);
			delay = MAX2(d, delay);
		}

		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	foreach_block (block, &ir->block_list) {
		sched_block(&ctx, block);
	}

	foreach_block (block, &ir->block_list) {
		sched_intra_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}
static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}
/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}
static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}
/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	foreach_instr (instr, &block->instr_list) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}
void
ir3_sched_add_deps(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		calculate_deps(block);
	}
}