/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)

#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)
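/* Example usage (illustrative only -- these calls appear further down in
 * this file):
 *
 *    d("delay=%u", delay);       // plain debug message
 *    di(instr, "schedule");      // message followed by a dump of instr
 */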
/*
 * Instruction Scheduling:
 *
 * A block-level pre-RA scheduler, which works by creating a DAG of
 * instruction dependencies, and heuristically picking a DAG head
 * (instruction with no unscheduled dependencies).
 *
 * Where possible, it tries to pick instructions that avoid nop delay
 * slots, but it will prefer to pick instructions that reduce (or do
 * not increase) the number of live values.
 *
 * If the only possible choices are instructions that increase the
 * number of live values, it will try to pick the one with the earliest
 * consumer (based on pre-sched program order).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 *
 * TODO we can detect too-large live_values here.. would be a good place
 * to "spill" cheap things, like move from uniform/immed.  (Constructing
 * a list of ssa def consumers before the sched pass would make this
 * easier.  Also, in general it might be best not to re-use load_immed
 * across blocks.)
 *
 * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
 * the # of immediates in play (or at least that would help with
 * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
 * do this in a nir pass that inserts fneg/etc?  The cp pass should fold
 * these into src modifiers..
 */
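/* High-level flow of scheduling a single block, as implemented by
 * sched_block() below (illustrative summary, not a separate code path):
 *
 *    list_replace(&block->instr_list, &ctx->unscheduled_list); // pull instrs out
 *    sched_dag_init(ctx);                                      // build dependency DAG
 *    while (!list_is_empty(&ctx->unscheduled_list)) {
 *       instr = choose_instr(ctx, &notes);                     // apply heuristics
 *       schedule(ctx, instr);                                  // append to block
 *    }
 *    sched_dag_destroy(ctx);
 */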
struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct dag *dag;

	struct list_head unscheduled_list; /* unscheduled instructions */
	struct ir3_instruction *scheduled; /* last scheduled instr */
	struct ir3_instruction *addr0;     /* current a0.x user, if any */
	struct ir3_instruction *addr1;     /* current a1.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */

	int remaining_kills;
	int remaining_tex;

	bool error;

	int sfu_delay;
	int tex_delay;
};
struct ir3_sched_node {
	struct dag_node dag;     /* must be first for util_dynarray_foreach */
	struct ir3_instruction *instr;

	unsigned delay;
	unsigned max_delay;

	/* For instructions that are a meta:collect src, once we schedule
	 * the first src of the collect, the entire vecN is live (at least
	 * from the PoV of the first RA pass.. the 2nd scalar pass can fill
	 * in some of the gaps, but often not all).  So we want to help out
	 * RA, and realize that as soon as we schedule the first collect
	 * src, there is no penalty to schedule the remainder (ie. they
	 * don't make additional values live).  In fact we'd prefer to
	 * schedule the rest ASAP to minimize the live range of the vecN.
	 *
	 * For instructions that are the src of a collect, we track the
	 * corresponding collect, and mark them as partially live as soon
	 * as any one of the srcs is scheduled.
	 */
	struct ir3_instruction *collect;
	bool partially_live;

	/* Is this instruction a direct or indirect dependency for a kill?
	 * If so, we should prioritize it when possible.
	 */
	bool kill_path;

	/* This node represents a shader output.  A semi-common pattern in
	 * shaders is something along the lines of:
	 *
	 *    fragcolor.w = 1.0
	 *
	 * which we'd prefer to schedule as late as possible, since it
	 * produces a live value that is never killed/consumed.  So detect
	 * outputs up-front, and avoid scheduling them unless they reduce
	 * register pressure (or at least are neutral).
	 */
	bool output;
};
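/* Each instruction in the block gets one of these nodes, reached via
 * instr->data (see sched_node_init() below), so the common pattern
 * throughout this file is roughly:
 *
 *    struct ir3_sched_node *n = instr->data;
 *    ... n->max_delay, n->partially_live, etc ...
 *
 * ie. "node" and "instruction" are used more or less interchangeably in
 * the comments that follow.
 */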
#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link)
static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void sched_node_add_dep(struct ir3_instruction *instr,
		struct ir3_instruction *src, int i);
static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* remove from depth list: */
	list_delinit(&instr->node);

	if (writes_addr0(instr)) {
		debug_assert(ctx->addr0 == NULL);
		ctx->addr0 = instr;
	}

	if (writes_addr1(instr)) {
		debug_assert(ctx->addr1 == NULL);
		ctx->addr1 = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	if (is_kill(instr)) {
		assert(ctx->remaining_kills > 0);
		ctx->remaining_kills--;
	}

	struct ir3_sched_node *n = instr->data;

	/* If this instruction is a meta:collect src, mark the remaining
	 * collect srcs as partially live.
	 */
	if (n->collect) {
		foreach_ssa_src (src, n->collect) {
			if (src->block != instr->block)
				continue;
			struct ir3_sched_node *sn = src->data;
			sn->partially_live = true;
		}
	}

	dag_prune_head(ctx->dag, &n->dag);

	if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
		return;
	if (is_sfu(instr)) {
		ctx->sfu_delay = 8;
	} else if (check_src_cond(instr, is_sfu)) {
		ctx->sfu_delay = 0;
	} else if (ctx->sfu_delay > 0) {
		ctx->sfu_delay--;
	}

	if (is_tex_or_prefetch(instr)) {
		/* NOTE that this isn't an attempt to hide texture fetch latency,
		 * but an attempt to hide the cost of switching to another warp.
		 * If we can, we'd like to try to schedule another texture fetch
		 * before scheduling something that would sync.
		 */
		ctx->tex_delay = 10;
		assert(ctx->remaining_tex > 0);
		ctx->remaining_tex--;
	} else if (check_src_cond(instr, is_tex_or_prefetch)) {
		ctx->tex_delay = 0;
	} else if (ctx->tex_delay > 0) {
		ctx->tex_delay--;
	}
}
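/* Rough intuition for the two counters above (descriptive only): sfu_delay
 * and tex_delay are bumped up when an SFU or tex/prefetch result is produced,
 * cleared when something consumes such a result (ie. a sync happens anyway),
 * and otherwise tick down by one per scheduled instruction.  would_sync()
 * below treats a non-zero counter as "a sync here would still be costly".
 */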
struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;

	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr0_conflict, addr1_conflict, pred_conflict;
};
/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	foreach_ssa_src (other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}
/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	debug_assert(!is_scheduled(instr));

	if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
		/* avoid texture/memory access if we have unscheduled kills
		 * that could make the expensive operation unnecessary.  By
		 * definition, if there are remaining kills, and this instr
		 * is not a dependency of a kill, there are other instructions
		 * that we can choose from.
		 */
		struct ir3_sched_node *n = instr->data;
		if (!n->kill_path)
			return false;
	}

	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * NOTE if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr0(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;

		for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->a0_users[i];
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	if (writes_addr1(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;

		for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->a1_users[i];
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr0(instr) && ctx->addr0) {
		debug_assert(ctx->addr0 != instr);
		notes->addr0_conflict = true;
		return false;
	}

	if (writes_addr1(instr) && ctx->addr1) {
		debug_assert(ctx->addr1 != instr);
		notes->addr1_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO we could handle this by false-deps now, probably.
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}
/* Find the instr->ip of the closest use of an instruction, in
 * pre-sched order.  This isn't going to be the same as post-sched
 * order, but it is a reasonable approximation to limit scheduling
 * instructions *too* early.  This is mostly to prevent bad behavior
 * in cases where we have a large number of possible instructions
 * to choose from, to avoid creating too much parallelism (ie. blowing
 * up register pressure).
 *
 * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
 */
static unsigned
nearest_use(struct ir3_instruction *instr)
{
	unsigned nearest = ~0;
	foreach_ssa_use (use, instr)
		if (!is_scheduled(use))
			nearest = MIN2(nearest, use->ip);

	/* slight hack.. this heuristic tends to push bary.f's to later
	 * in the shader, closer to their uses.  But we actually would
	 * prefer to get these scheduled earlier, to unlock varying
	 * storage for more VS jobs:
	 */
	if (is_input(instr))
		return 0;

	return nearest;
}
static unsigned
use_count(struct ir3_instruction *instr)
{
	unsigned cnt = 0;

	foreach_ssa_use (use, instr)
		if (!is_scheduled(use))
			cnt++;

	return cnt;
}
/* find net change to live values if instruction were scheduled: */
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_sched_node *n = instr->data;
	int new_live = n->partially_live ? 0 : dest_regs(instr);
	int freed_live = 0;

	/* if we schedule something that causes a vecN to be live,
	 * then count all its other components too:
	 */
	if (n->collect)
		new_live *= n->collect->regs_count - 1;

	foreach_ssa_src_n (src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		if (use_count(src) == 1)
			freed_live += dest_regs(src);
	}

	return new_live - freed_live;
}
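/* Worked example (illustrative numbers): scheduling the first src of a vec4
 * meta:collect makes the whole vec4 live from RA's point of view, so
 * new_live = 1 * (5 - 1) = 4.  If that src was also the last unscheduled use
 * of one of its own operands, that operand is freed, so freed_live = 1 and
 * the net effect is +3.  Once partially_live is set, scheduling the remaining
 * collect srcs costs nothing (new_live == 0), which is why schedule() marks
 * them eagerly.
 */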
/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss)/(sy) sync.  This is limited by the
 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
 * the last SFU/tex, the less costly a sync would be.
 */
static bool
would_sync(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if (ctx->sfu_delay) {
		if (check_src_cond(instr, is_sfu))
			return true;
	}

	/* We mostly just want to try to schedule another texture fetch
	 * before scheduling something that would (sy) sync, so we can
	 * limit this rule to cases where there are remaining texture
	 * fetches:
	 */
	if (ctx->tex_delay && ctx->remaining_tex) {
		if (check_src_cond(instr, is_tex_or_prefetch))
			return true;
	}

	return false;
}
static struct ir3_sched_node *
choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool avoid_sync, bool avoid_output);
/*
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 *
 * Only handles the case of choosing instructions that reduce register
 * pressure or are neutral.
 */
static struct ir3_sched_node *
choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool avoid_sync)
{
	const char *mode = avoid_sync ? "-as" : "";
	struct ir3_sched_node *chosen = NULL;

	/* Find a ready inst with regs freed and pick the one with max cost. */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (avoid_sync && would_sync(ctx, n->instr))
			continue;

		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (live_effect(n->instr) > -1)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		if (!chosen || chosen->max_delay < n->max_delay) {
			chosen = n;
		}
	}

	if (chosen) {
		di(chosen->instr, "dec%s: chose (freed+ready)", mode);
		return chosen;
	}

	/* Find a leader with regs freed and pick the one with max cost. */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (avoid_sync && would_sync(ctx, n->instr))
			continue;

		if (live_effect(n->instr) > -1)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		if (!chosen || chosen->max_delay < n->max_delay) {
			chosen = n;
		}
	}

	if (chosen) {
		di(chosen->instr, "dec%s: chose (freed)", mode);
		return chosen;
	}

	/* Contra the paper, pick a leader with no effect on used regs.  This
	 * may open up new opportunities, as otherwise a single-operand instr
	 * consuming a value will tend to block freeing that value.  This had
	 * a massive effect on reducing spilling on V3D.
	 *
	 * XXX: Should this prioritize ready?
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (avoid_sync && would_sync(ctx, n->instr))
			continue;

		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (live_effect(n->instr) > 0)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "dec%s: chose (neutral+ready)", mode);
		return chosen;
	}

	foreach_sched_node (n, &ctx->dag->heads) {
		if (avoid_sync && would_sync(ctx, n->instr))
			continue;

		if (live_effect(n->instr) > 0)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "dec%s: chose (neutral)", mode);
		return chosen;
	}

	return choose_instr_inc(ctx, notes, avoid_sync, true);
}
/*
 * When we can't choose an instruction that reduces register pressure or
 * is neutral, we end up here to try and pick the least bad option.
 */
static struct ir3_sched_node *
choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool avoid_sync, bool avoid_output)
{
	const char *mode = avoid_sync ? "-as" : "";
	struct ir3_sched_node *chosen = NULL;

	/*
	 * From here on out, we are picking something that increases
	 * register pressure.  So try to pick something whose value
	 * will be consumed soonest:
	 */
	unsigned chosen_distance = 0;

	/* Pick the ready instruction with the nearest use: */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (avoid_output && n->output)
			continue;

		if (avoid_sync && would_sync(ctx, n->instr))
			continue;

		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		unsigned distance = nearest_use(n->instr);

		if (!chosen || distance < chosen_distance) {
			chosen = n;
			chosen_distance = distance;
		}
	}

	if (chosen) {
		di(chosen->instr, "inc%s: chose (distance+ready)", mode);
		return chosen;
	}

	/* Same, but among the remaining leaders (not necessarily ready): */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (avoid_output && n->output)
			continue;

		if (avoid_sync && would_sync(ctx, n->instr))
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		unsigned distance = nearest_use(n->instr);

		if (!chosen || distance < chosen_distance) {
			chosen = n;
			chosen_distance = distance;
		}
	}

	if (chosen) {
		di(chosen->instr, "inc%s: chose (distance)", mode);
		return chosen;
	}

	return NULL;
}
/* Handles instruction selections for instructions we want to prioritize
 * even if csp/csr would not pick them.
 */
static struct ir3_sched_node *
choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_sched_node *chosen = NULL;

	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_meta(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (meta)");
		return chosen;
	}

	return NULL;
}
static void
dump_state(struct ir3_sched_ctx *ctx)
{
	if (!SCHED_DEBUG)
		return;

	foreach_sched_node (n, &ctx->dag->heads) {
		di(n->instr, "maxdel=%3d le=%d del=%u ",
				n->max_delay, live_effect(n->instr),
				ir3_delay_calc(ctx->block, n->instr, false, false));

		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
			struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;

			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
		}
	}
}
/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_sched_node *chosen;

	dump_state(ctx);

	chosen = choose_instr_prio(ctx, notes);
	if (chosen)
		return chosen->instr;

	chosen = choose_instr_dec(ctx, notes, true);
	if (chosen)
		return chosen->instr;

	chosen = choose_instr_dec(ctx, notes, false);
	if (chosen)
		return chosen->instr;

	chosen = choose_instr_inc(ctx, notes, false, false);
	if (chosen)
		return chosen->instr;

	return NULL;
}
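/* To summarize the selection order implemented above (descriptive only):
 * meta instructions first (choose_instr_prio), then pressure-reducing or
 * neutral instructions while trying to avoid an (ss)/(sy) sync, then the
 * same without the sync restriction, and only as a last resort something
 * that increases register pressure (choose_instr_inc), preferring the
 * candidate whose result is consumed soonest in pre-sched order.
 */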
static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
	di(new_instr, "split instruction");
	sched_node_init(ctx, new_instr);
	return new_instr;
}
734 /* "spill" the address registers by remapping any unscheduled
735 * instructions which depend on the current address register
736 * to a clone of the instruction which wrote the address reg.
738 static struct ir3_instruction
*
739 split_addr(struct ir3_sched_ctx
*ctx
, struct ir3_instruction
**addr
,
740 struct ir3_instruction
**users
, unsigned users_count
)
742 struct ir3_instruction
*new_addr
= NULL
;
747 for (i
= 0; i
< users_count
; i
++) {
748 struct ir3_instruction
*indirect
= users
[i
];
753 /* skip instructions already scheduled: */
754 if (is_scheduled(indirect
))
757 /* remap remaining instructions using current addr
760 if (indirect
->address
== *addr
) {
762 new_addr
= split_instr(ctx
, *addr
);
763 /* original addr is scheduled, but new one isn't: */
764 new_addr
->flags
&= ~IR3_INSTR_MARK
;
766 indirect
->address
= new_addr
;
767 /* don't need to remove old dag edge since old addr is
770 sched_node_add_dep(indirect
, new_addr
, 0);
771 di(indirect
, "new address");
775 /* all remaining indirects remapped to new addr: */
781 /* "spill" the predicate register by remapping any unscheduled
782 * instructions which depend on the current predicate register
783 * to a clone of the instruction which wrote the address reg.
785 static struct ir3_instruction
*
786 split_pred(struct ir3_sched_ctx
*ctx
)
789 struct ir3_instruction
*new_pred
= NULL
;
792 debug_assert(ctx
->pred
);
794 ir
= ctx
->pred
->block
->shader
;
796 for (i
= 0; i
< ir
->predicates_count
; i
++) {
797 struct ir3_instruction
*predicated
= ir
->predicates
[i
];
799 /* skip instructions already scheduled: */
800 if (is_scheduled(predicated
))
803 /* remap remaining instructions using current pred
806 * TODO is there ever a case when pred isn't first
809 if (ssa(predicated
->regs
[1]) == ctx
->pred
) {
811 new_pred
= split_instr(ctx
, ctx
->pred
);
812 /* original pred is scheduled, but new one isn't: */
813 new_pred
->flags
&= ~IR3_INSTR_MARK
;
815 predicated
->regs
[1]->instr
= new_pred
;
816 /* don't need to remove old dag edge since old pred is
819 sched_node_add_dep(predicated
, new_pred
, 0);
820 di(predicated
, "new predicate");
824 /* all remaining predicated remapped to new pred: */
static void
sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);

	dag_init_node(ctx->dag, &n->dag);

	n->instr = instr;
	instr->data = n;
}
static void
sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i)
{
	/* don't consider dependencies in other blocks: */
	if (src->block != instr->block)
		return;

	/* we could have false-dep's that end up unused: */
	if (src->flags & IR3_INSTR_UNUSED) {
		debug_assert(__is_false_dep(instr, i));
		return;
	}

	struct ir3_sched_node *n = instr->data;
	struct ir3_sched_node *sn = src->data;

	/* If src is consumed by a collect, track that to realize that once
	 * any of the collect srcs are live, we should hurry up and schedule
	 * the rest:
	 */
	if (instr->opc == OPC_META_COLLECT)
		sn->collect = instr;

	dag_add_edge(&sn->dag, &n->dag, NULL);

	unsigned d = ir3_delayslots(src, instr, i, true);
	n->delay = MAX2(n->delay, d);
}
static void
mark_kill_path(struct ir3_instruction *instr)
{
	struct ir3_sched_node *n = instr->data;
	n->kill_path = true;

	foreach_ssa_src (src, instr) {
		if (src->block != instr->block)
			continue;

		mark_kill_path(src);
	}
}
/* Is it an output? */
static bool
is_output_collect(struct ir3_instruction *instr)
{
	struct ir3 *ir = instr->block->shader;

	for (unsigned i = 0; i < ir->outputs_count; i++) {
		struct ir3_instruction *collect = ir->outputs[i];
		assert(collect->opc == OPC_META_COLLECT);
		if (instr == collect)
			return true;
	}

	return false;
}

/* Is its only use as output? */
static bool
is_output_only(struct ir3_instruction *instr)
{
	if (!writes_gpr(instr))
		return false;

	if (!(instr->regs[0]->flags & IR3_REG_SSA))
		return false;

	foreach_ssa_use (use, instr)
		if (!is_output_collect(use))
			return false;

	return true;
}
static void
sched_node_add_deps(struct ir3_instruction *instr)
{
	/* Since foreach_ssa_src() already handles false-dep's we can construct
	 * the DAG easily in a single pass.
	 */
	foreach_ssa_src_n (src, i, instr) {
		sched_node_add_dep(instr, src, i);
	}

	/* NOTE that all inputs must be scheduled before a kill, so
	 * mark these to be prioritized as well:
	 */
	if (is_kill(instr) || is_input(instr)) {
		mark_kill_path(instr);
	}

	if (is_output_only(instr)) {
		struct ir3_sched_node *n = instr->data;
		n->output = true;
	}
}
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
	struct ir3_sched_node *n = (struct ir3_sched_node *)node;
	uint32_t max_delay = 0;

	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
		struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
		max_delay = MAX2(child->max_delay, max_delay);
	}

	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
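/* Illustrative example (made-up delays): for a chain a -> b -> c, where b
 * needs 3 delay slots after a's result and c needs 2 after b's result, the
 * bottom-up traversal computes c.max_delay = 2, b.max_delay = 2 + 3 = 5 and
 * a.max_delay = 5, ie. roughly the delay remaining on the longest path below
 * each node.  The choose_instr_* heuristics break ties in favor of the
 * larger max_delay.
 */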
static void
sched_dag_init(struct ir3_sched_ctx *ctx)
{
	ctx->dag = dag_create(ctx);

	foreach_instr (instr, &ctx->unscheduled_list) {
		sched_node_init(ctx, instr);
		sched_node_add_deps(instr);
	}

	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_sched_ctx *ctx)
{
	ralloc_free(ctx->dag);
	ctx->dag = NULL;
}
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr0 = NULL;
	ctx->addr1 = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * then add instructions as we schedule them):
	 */
	list_replace(&block->instr_list, &ctx->unscheduled_list);
	list_inithead(&block->instr_list);

	sched_dag_init(ctx);

	ctx->remaining_kills = 0;
	ctx->remaining_tex = 0;
	foreach_instr_safe (instr, &ctx->unscheduled_list) {
		if (is_kill(instr))
			ctx->remaining_kills++;
		if (is_tex_or_prefetch(instr))
			ctx->remaining_tex++;
	}

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch).
	 */
	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	while (!list_is_empty(&ctx->unscheduled_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = choose_instr(ctx, &notes);
		if (instr) {
			unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
			d("delay=%u", delay);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;
			struct ir3 *ir = block->shader;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * register:
			 */
			if (notes.addr0_conflict) {
				new_instr = split_addr(ctx, &ctx->addr0,
						ir->a0_users, ir->a0_users_count);
			} else if (notes.addr1_conflict) {
				new_instr = split_addr(ctx, &ctx->addr1,
						ir->a1_users, ir->a1_users_count);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				d("unscheduled_list:");
				foreach_instr (instr, &ctx->unscheduled_list)
					di(instr, "unscheduled: ");
				ctx->error = true;
				return;
			}

			if (new_instr) {
				list_delinit(&new_instr->node);
				list_addtail(&new_instr->node, &ctx->unscheduled_list);
			}
		}
	}

	sched_dag_destroy(ctx);
}
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);

	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			instr->data = NULL;
		}
	}

	ir3_count_instructions(ir);
	ir3_find_ssa_uses(ir, ctx, false);

	foreach_block (block, &ir->block_list) {
		sched_block(ctx, block);
	}

	int ret = ctx->error ? -1 : 0;

	ralloc_free(ctx);

	return ret;
}
static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */
	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}
/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}
static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction:
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			continue;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			continue;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}
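/* Note that deps are added in both directions: the backwards scan orders this
 * instruction after earlier conflicting work, and the forwards scan orders
 * later conflicting work after this instruction, so whatever the
 * barrier_class/barrier_conflict masks say must stay ordered ends up fully
 * serialized by false-deps (descriptive summary of the two loops above).
 */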
/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
bool
ir3_sched_add_deps(struct ir3 *ir)
{
	bool progress = false;

	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			if (instr->barrier_class) {
				add_barrier_deps(block, instr);
				progress = true;
			}
		}
	}

	return progress;
}