/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
#include "util/u_math.h"

#include "ir3.h"
/*
 * Instruction Scheduling:
 *
 * A recursive depth based scheduling algo.  Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions).  Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
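/* A rough illustration of the serialization constraint above (pseudo
 * notation, not actual ir3 assembly): if pair A = {write a0.x, read a0.x}
 * and pair B = {write a0.x, read a0.x}, the only legal schedules are
 * "A.write A.read B.write B.read" or the reverse -- B's write must not
 * land between A's write and A's read.
 */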
struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
};
static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}
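/* The following helpers maintain the register pressure estimate: each SSA
 * value carries a use_count, and ctx->live_values approximates how many
 * destination registers are live at the current point in the schedule.
 */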
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}
static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void use_instr(struct ir3_instruction *instr);
/* transfers a use-count to new instruction, for cases where we
 * "spill" address or predicate.  Note this might cause the
 * previous instruction that loaded a0.x/p0.x to become live
 * again, when we previously thought it was dead.
 */
static void
transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
		struct ir3_instruction *new_instr)
{
	struct ir3_instruction *src;

	debug_assert(is_scheduled(orig_instr));

	foreach_ssa_src_n(src, n, new_instr) {
		if (__is_false_dep(new_instr, n))
			continue;
		ctx->live_values += dest_regs(src);
		use_instr(src);
	}

	clear_cache(ctx, orig_instr);
}
static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}
static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}
static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}
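/* (Re)compute use_count for every instruction before scheduling starts,
 * counting each SSA use plus shader outputs:
 */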
static void
update_use_count(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			instr->use_count = 0;
		}
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *out = ir->outputs[i];

		use_instr(out);
	}
}
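/* instr->data is used to cache the result of find_instr_recursive();
 * NULL_INSTR is a sentinel meaning "already searched, nothing schedulable
 * below this instruction", as opposed to NULL meaning "not cached":
 */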
#define NULL_INSTR ((void *)~0)
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}
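/* Return the unscheduled src with the greatest depth and clear its slot in
 * srcs[], so repeated calls walk the remaining candidates deepest-first:
 */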
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}
/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		set_foreach(block->predecessors, entry) {
			struct ir3_block *pred = (struct ir3_block *)entry->key;
			unsigned n;

			n = distance(pred, instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}
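/* Informally, the pieces above fit together like this: if a consumer needs
 * its src N cycles after the assigner, ir3_delayslots() returns N,
 * distance() counts how many already-scheduled slots separate the two, and
 * whatever remains is the number of nop slots still required.
 */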
/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}
struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};
/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}
/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	debug_assert(!is_scheduled(instr));

	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;

		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}
/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data == NULL_INSTR)
		return NULL;
	if (instr->data)
		return instr->data;

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}
/* find net change to live values if instruction were scheduled: */
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int new_live = dest_regs(instr);
	int old_live = 0;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		/* for fanout/split, just pass things along to the real src: */
		if (src->opc == OPC_META_FO)
			src = ssa(src->regs[1]);

		/* for fanin/collect, if this is the last use of *each* src,
		 * then it will decrease the live values, since RA treats
		 * them as a whole:
		 */
		if (src->opc == OPC_META_FI) {
			struct ir3_instruction *src2;
			bool last_use = true;

			foreach_ssa_src(src2, src) {
				if (src2->use_count > 1) {
					last_use = false;
					break;
				}
			}

			if (last_use)
				old_live += dest_regs(src);
		} else {
			debug_assert(src->use_count > 0);

			if (src->use_count == 1) {
				old_live += dest_regs(src);
			}
		}
	}

	return new_live - old_live;
}
/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	int best_rank = INT_MAX;      /* lower is better */
	unsigned deepest = 0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (is_meta(candidate))
			return candidate;

		deepest = MAX2(deepest, candidate->depth);
	}

	/* traverse the list a second time.. but since we cache the result of
	 * find_instr_recursive() it isn't as bad as it looks.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		/* determine net change to # of live values: */
		int le = live_effect(candidate);

		/* if there is a net increase in # of live values, then apply some
		 * threshold to avoid instructions getting scheduled *too* early
		 * and increasing register pressure.
		 */
		if (le >= 1) {
			unsigned threshold;

			if (ctx->live_values > 4*4) {
				threshold = 4;
			} else {
				threshold = 6;
			}

			/* Filter out any "shallow" instructions which would otherwise
			 * tend to get scheduled too early to fill delay slots even
			 * when they are not needed for a while.  There will probably
			 * be later delay slots that they could just as easily fill.
			 *
			 * A classic case where this comes up is frag shaders that
			 * write a constant value (like 1.0f) to one of the channels
			 * of the output color(s).  Since the mov from immed has no
			 * dependencies, it would otherwise get scheduled early to
			 * fill delay slots, occupying a register until the end of
			 * the program.
			 */
			if ((deepest - candidate->depth) > threshold)
				continue;
		}

		int rank = delay_calc(ctx->block, candidate, soft, false);

		/* if too many live values, prioritize instructions that reduce the
		 * number of live values:
		 */
		if (ctx->live_values > 16*4) {
			rank = le;
		} else if (ctx->live_values > 4*4) {
			rank += le;
		}

		if (rank < best_rank) {
			best_instr = candidate;
			best_rank = rank;
		}
	}

	return best_instr;
}
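/* Note how sched_block() below calls find_eligible_instr() twice: first
 * with soft=true (respecting the softer sfu/tex latencies) and, if nothing
 * qualifies, again with soft=false.
 */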
static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
	ir3_insert_by_depth(new_instr, &ctx->depth_list);
	transfer_use(ctx, orig_instr, new_instr);
	return new_instr;
}
678 /* "spill" the address register by remapping any unscheduled
679 * instructions which depend on the current address register
680 * to a clone of the instruction which wrote the address reg.
682 static struct ir3_instruction
*
683 split_addr(struct ir3_sched_ctx
*ctx
)
686 struct ir3_instruction
*new_addr
= NULL
;
689 debug_assert(ctx
->addr
);
691 ir
= ctx
->addr
->block
->shader
;
693 for (i
= 0; i
< ir
->indirects_count
; i
++) {
694 struct ir3_instruction
*indirect
= ir
->indirects
[i
];
699 /* skip instructions already scheduled: */
700 if (is_scheduled(indirect
))
703 /* remap remaining instructions using current addr
706 if (indirect
->address
== ctx
->addr
) {
708 new_addr
= split_instr(ctx
, ctx
->addr
);
709 /* original addr is scheduled, but new one isn't: */
710 new_addr
->flags
&= ~IR3_INSTR_MARK
;
712 indirect
->address
= NULL
;
713 ir3_instr_set_address(indirect
, new_addr
);
717 /* all remaining indirects remapped to new addr: */
723 /* "spill" the predicate register by remapping any unscheduled
724 * instructions which depend on the current predicate register
725 * to a clone of the instruction which wrote the address reg.
727 static struct ir3_instruction
*
728 split_pred(struct ir3_sched_ctx
*ctx
)
731 struct ir3_instruction
*new_pred
= NULL
;
734 debug_assert(ctx
->pred
);
736 ir
= ctx
->pred
->block
->shader
;
738 for (i
= 0; i
< ir
->predicates_count
; i
++) {
739 struct ir3_instruction
*predicated
= ir
->predicates
[i
];
741 /* skip instructions already scheduled: */
742 if (is_scheduled(predicated
))
745 /* remap remaining instructions using current pred
748 * TODO is there ever a case when pred isn't first
751 if (ssa(predicated
->regs
[1]) == ctx
->pred
) {
753 new_pred
= split_instr(ctx
, ctx
->pred
);
754 /* original pred is scheduled, but new one isn't: */
755 new_pred
->flags
&= ~IR3_INSTR_MARK
;
757 predicated
->regs
[1]->instr
= new_pred
;
761 /* all remaining predicated remapped to new pred: */
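/* Schedule a single basic block: greedily pick eligible instructions from
 * the depth list, insert nops for any delay slots that remain unfilled,
 * and finish the block with the branch/jump to its successor(s).
 */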
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be appending).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (instr->opc == OPC_META_INPUT) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}
/* After scheduling individual blocks, we still could have cases where,
 * on one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal.  We can't deal with this in
 * the first pass, because of loops (ie. we can't ensure all predecessor
 * blocks are already scheduled in the first pass).  All we can really do
 * at this point is stuff in extra nop's until things are legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		unsigned delay = 0;

		set_foreach(block->predecessors, entry) {
			struct ir3_block *pred = (struct ir3_block *)entry->key;
			unsigned d = delay_calc(pred, instr, false, true);
			delay = MAX2(d, delay);
		}

		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ctx.live_values = 0;
		sched_block(&ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_intra_block(&ctx, block);
	}

	return 0;
}
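/* Figure out which array an instruction's (single) array src or dst refers
 * to, so that accesses to different arrays below need not be treated as
 * conflicting:
 */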
static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}
/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}
static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}
/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}
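/* Entry point: walk every block and add the false-dependencies described
 * above.  This is expected to run before ir3_sched() so the scheduler sees
 * the extra deps.
 */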
void
ir3_sched_add_deps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		calculate_deps(block);
	}
}