2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * Rob Clark <robclark@freedesktop.org>
29 #include "util/u_math.h"
32 #include "ir3_compiler.h"
/* Gate for scheduler debug output (set via IR3_SHADER_DEBUG=sched). */
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)

/* Debug printf, only active when SCHED_DEBUG is set. */
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

/* Debug printf followed by a dump of the instruction. */
#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)
49 * Instruction Scheduling:
51 * A block-level pre-RA scheduler, which works by creating a DAG of
52 * instruction dependencies, and heuristically picking a DAG head
53 * (instruction with no unscheduled dependencies).
55 * Where possible, it tries to pick instructions that avoid nop delay
56 * slots, but it will prefer to pick instructions that reduce (or do
57 * not increase) the number of live values.
59 * If the only possible choices are instructions that increase the
60 * number of live values, it will try to pick the one with the earliest
61 * consumer (based on pre-sched program order).
63 * There are a few special cases that need to be handled, since sched
64 * is currently independent of register allocation. Usages of address
65 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
66 * if you have two pairs of instructions that write the same special
67 * register and then read it, then those pairs cannot be interleaved.
68 * To solve this, when we are in such a scheduling "critical section",
69 * and we encounter a conflicting write to a special register, we try
70 * to schedule any remaining instructions that use that value first.
72 * TODO we can detect too-large live_values here.. would be a good place
73 * to "spill" cheap things, like move from uniform/immed. (Constructing
74 * list of ssa def consumers before sched pass would make this easier.
75 * Also, in general it might be best not to re-use load_immed
78 * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
79 * the # of immediates in play (or at least that would help with
80 * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
81 * do this in a nir pass that inserts fneg/etc? The cp pass should fold
82 * these into src modifiers..
85 struct ir3_sched_ctx
{
86 struct ir3_block
*block
; /* the current block */
89 struct list_head unscheduled_list
; /* unscheduled instructions */
90 struct ir3_instruction
*scheduled
; /* last scheduled instr */
91 struct ir3_instruction
*addr0
; /* current a0.x user, if any */
92 struct ir3_instruction
*addr1
; /* current a1.x user, if any */
93 struct ir3_instruction
*pred
; /* current p0.x user, if any */
104 struct ir3_sched_node
{
105 struct dag_node dag
; /* must be first for util_dynarray_foreach */
106 struct ir3_instruction
*instr
;
111 /* For instructions that are a meta:collect src, once we schedule
112 * the first src of the collect, the entire vecN is live (at least
113 * from the PoV of the first RA pass.. the 2nd scalar pass can fill
114 * in some of the gaps, but often not all). So we want to help out
115 * RA, and realize that as soon as we schedule the first collect
116 * src, there is no penalty to schedule the remainder (ie. they
117 * don't make additional values live). In fact we'd prefer to
118 * schedule the rest ASAP to minimize the live range of the vecN.
120 * For instructions that are the src of a collect, we track the
121 * corresponding collect, and mark them as partially live as soon
122 * as any one of the src's is scheduled.
124 struct ir3_instruction
*collect
;
127 /* Is this instruction a direct or indirect dependency for a kill?
128 * If so, we should prioritize it when possible
132 /* This node represents a shader output. A semi-common pattern in
133 * shaders is something along the lines of:
137 * Which we'd prefer to schedule as late as possible, since it
138 * produces a live value that is never killed/consumed. So detect
139 * outputs up-front, and avoid scheduling them unless the reduce
140 * register pressure (or at least are neutral)
/* Iterate sched nodes in a DAG list (e.g. the ready "heads"). */
#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link)
/* Forward declarations (needed by split_instr()/split_addr()/split_pred()). */
static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i);
151 static bool is_scheduled(struct ir3_instruction
*instr
)
153 return !!(instr
->flags
& IR3_INSTR_MARK
);
157 schedule(struct ir3_sched_ctx
*ctx
, struct ir3_instruction
*instr
)
159 debug_assert(ctx
->block
== instr
->block
);
161 /* remove from depth list:
163 list_delinit(&instr
->node
);
165 if (writes_addr0(instr
)) {
166 debug_assert(ctx
->addr0
== NULL
);
170 if (writes_addr1(instr
)) {
171 debug_assert(ctx
->addr1
== NULL
);
175 if (writes_pred(instr
)) {
176 debug_assert(ctx
->pred
== NULL
);
180 instr
->flags
|= IR3_INSTR_MARK
;
182 di(instr
, "schedule");
184 list_addtail(&instr
->node
, &instr
->block
->instr_list
);
185 ctx
->scheduled
= instr
;
188 assert(ctx
->remaining_kills
> 0);
189 ctx
->remaining_kills
--;
192 struct ir3_sched_node
*n
= instr
->data
;
194 /* If this instruction is a meta:collect src, mark the remaining
195 * collect srcs as partially live.
198 struct ir3_instruction
*src
;
199 foreach_ssa_src (src
, n
->collect
) {
200 if (src
->block
!= instr
->block
)
202 struct ir3_sched_node
*sn
= src
->data
;
203 sn
->partially_live
= true;
207 dag_prune_head(ctx
->dag
, &n
->dag
);
209 if (is_meta(instr
) && (instr
->opc
!= OPC_META_TEX_PREFETCH
))
214 } else if (check_src_cond(instr
, is_sfu
)) {
216 } else if (ctx
->sfu_delay
> 0) {
220 if (is_tex_or_prefetch(instr
)) {
221 /* NOTE that this isn't an attempt to hide texture fetch latency,
222 * but an attempt to hide the cost of switching to another warp.
223 * If we can, we'd like to try to schedule another texture fetch
224 * before scheduling something that would sync.
227 assert(ctx
->remaining_tex
> 0);
228 ctx
->remaining_tex
--;
229 } else if (check_src_cond(instr
, is_tex_or_prefetch
)) {
231 } else if (ctx
->tex_delay
> 0) {
236 struct ir3_sched_notes
{
237 /* there is at least one kill which could be scheduled, except
238 * for unscheduled bary.f's:
241 /* there is at least one instruction that could be scheduled,
242 * except for conflicting address/predicate register usage:
244 bool addr0_conflict
, addr1_conflict
, pred_conflict
;
247 /* could an instruction be scheduled if specified ssa src was scheduled? */
249 could_sched(struct ir3_instruction
*instr
, struct ir3_instruction
*src
)
251 struct ir3_instruction
*other_src
;
252 foreach_ssa_src (other_src
, instr
) {
253 /* if dependency not scheduled, we aren't ready yet: */
254 if ((src
!= other_src
) && !is_scheduled(other_src
)) {
261 /* Check if instruction is ok to schedule. Make sure it is not blocked
262 * by use of addr/predicate register, etc.
265 check_instr(struct ir3_sched_ctx
*ctx
, struct ir3_sched_notes
*notes
,
266 struct ir3_instruction
*instr
)
268 debug_assert(!is_scheduled(instr
));
270 if (ctx
->remaining_kills
&& (is_tex(instr
) || is_mem(instr
))) {
271 /* avoid texture/memory access if we have unscheduled kills
272 * that could make the expensive operation unnecessary. By
273 * definition, if there are remaining kills, and this instr
274 * is not a dependency of a kill, there are other instructions
275 * that we can choose from.
277 struct ir3_sched_node
*n
= instr
->data
;
282 /* For instructions that write address register we need to
283 * make sure there is at least one instruction that uses the
284 * addr value which is otherwise ready.
286 * NOTE if any instructions use pred register and have other
287 * src args, we would need to do the same for writes_pred()..
289 if (writes_addr0(instr
)) {
290 struct ir3
*ir
= instr
->block
->shader
;
292 for (unsigned i
= 0; (i
< ir
->a0_users_count
) && !ready
; i
++) {
293 struct ir3_instruction
*indirect
= ir
->a0_users
[i
];
296 if (indirect
->address
!= instr
)
298 ready
= could_sched(indirect
, instr
);
301 /* nothing could be scheduled, so keep looking: */
306 if (writes_addr1(instr
)) {
307 struct ir3
*ir
= instr
->block
->shader
;
309 for (unsigned i
= 0; (i
< ir
->a1_users_count
) && !ready
; i
++) {
310 struct ir3_instruction
*indirect
= ir
->a1_users
[i
];
313 if (indirect
->address
!= instr
)
315 ready
= could_sched(indirect
, instr
);
318 /* nothing could be scheduled, so keep looking: */
323 /* if this is a write to address/predicate register, and that
324 * register is currently in use, we need to defer until it is
327 if (writes_addr0(instr
) && ctx
->addr0
) {
328 debug_assert(ctx
->addr0
!= instr
);
329 notes
->addr0_conflict
= true;
333 if (writes_addr1(instr
) && ctx
->addr1
) {
334 debug_assert(ctx
->addr1
!= instr
);
335 notes
->addr1_conflict
= true;
339 if (writes_pred(instr
) && ctx
->pred
) {
340 debug_assert(ctx
->pred
!= instr
);
341 notes
->pred_conflict
= true;
345 /* if the instruction is a kill, we need to ensure *every*
346 * bary.f is scheduled. The hw seems unhappy if the thread
347 * gets killed before the end-input (ei) flag is hit.
349 * We could do this by adding each bary.f instruction as
350 * virtual ssa src for the kill instruction. But we have
351 * fixed length instr->regs[].
353 * TODO we could handle this by false-deps now, probably.
355 if (is_kill(instr
)) {
356 struct ir3
*ir
= instr
->block
->shader
;
358 for (unsigned i
= 0; i
< ir
->baryfs_count
; i
++) {
359 struct ir3_instruction
*baryf
= ir
->baryfs
[i
];
360 if (baryf
->flags
& IR3_INSTR_UNUSED
)
362 if (!is_scheduled(baryf
)) {
363 notes
->blocked_kill
= true;
372 /* Find the instr->ip of the closest use of an instruction, in
373 * pre-sched order. This isn't going to be the same as post-sched
374 * order, but it is a reasonable approximation to limit scheduling
375 * instructions *too* early. This is mostly to prevent bad behavior
376 * in cases where we have a large number of possible instructions
377 * to choose, to avoid creating too much parallelism (ie. blowing
378 * up register pressure)
380 * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
383 nearest_use(struct ir3_instruction
*instr
)
385 unsigned nearest
= ~0;
386 foreach_ssa_use (use
, instr
)
387 if (!is_scheduled(use
))
388 nearest
= MIN2(nearest
, use
->ip
);
390 /* slight hack.. this heuristic tends to push bary.f's to later
391 * in the shader, closer to their uses. But we actually would
392 * prefer to get these scheduled earlier, to unlock varying
393 * storage for more VS jobs:
402 use_count(struct ir3_instruction
*instr
)
405 foreach_ssa_use (use
, instr
)
406 if (!is_scheduled(use
))
411 /* find net change to live values if instruction were scheduled: */
413 live_effect(struct ir3_instruction
*instr
)
415 struct ir3_sched_node
*n
= instr
->data
;
416 struct ir3_instruction
*src
;
417 int new_live
= n
->partially_live
? 0 : dest_regs(instr
);
420 /* if we schedule something that causes a vecN to be live,
421 * then count all it's other components too:
424 new_live
*= n
->collect
->regs_count
- 1;
426 foreach_ssa_src_n (src
, n
, instr
) {
427 if (__is_false_dep(instr
, n
))
430 if (instr
->block
!= src
->block
)
433 if (use_count(src
) == 1)
434 freed_live
+= dest_regs(src
);
437 return new_live
- freed_live
;
440 /* Determine if this is an instruction that we'd prefer not to schedule
441 * yet, in order to avoid an (ss)/(sy) sync. This is limited by the
442 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
443 * the last SFU/tex, the less costly a sync would be.
446 would_sync(struct ir3_sched_ctx
*ctx
, struct ir3_instruction
*instr
)
448 if (ctx
->sfu_delay
) {
449 if (check_src_cond(instr
, is_sfu
))
453 /* We mostly just want to try to schedule another texture fetch
454 * before scheduling something that would (sy) sync, so we can
455 * limit this rule to cases where there are remaining texture
458 if (ctx
->tex_delay
&& ctx
->remaining_tex
) {
459 if (check_src_cond(instr
, is_tex_or_prefetch
))
466 static struct ir3_sched_node
*
467 choose_instr_inc(struct ir3_sched_ctx
*ctx
, struct ir3_sched_notes
*notes
,
468 bool avoid_sync
, bool avoid_output
);
471 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
472 * Scheduling for Register pressure) heuristic.
474 * Only handles the case of choosing instructions that reduce register pressure
477 static struct ir3_sched_node
*
478 choose_instr_dec(struct ir3_sched_ctx
*ctx
, struct ir3_sched_notes
*notes
,
481 const char *mode
= avoid_sync
? "-as" : "";
482 struct ir3_sched_node
*chosen
= NULL
;
484 /* Find a ready inst with regs freed and pick the one with max cost. */
485 foreach_sched_node (n
, &ctx
->dag
->heads
) {
486 if (avoid_sync
&& would_sync(ctx
, n
->instr
))
489 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, false, false);
494 if (live_effect(n
->instr
) > -1)
497 if (!check_instr(ctx
, notes
, n
->instr
))
500 if (!chosen
|| chosen
->max_delay
< n
->max_delay
) {
506 di(chosen
->instr
, "dec%s: chose (freed+ready)", mode
);
510 /* Find a leader with regs freed and pick the one with max cost. */
511 foreach_sched_node (n
, &ctx
->dag
->heads
) {
512 if (avoid_sync
&& would_sync(ctx
, n
->instr
))
515 if (live_effect(n
->instr
) > -1)
518 if (!check_instr(ctx
, notes
, n
->instr
))
521 if (!chosen
|| chosen
->max_delay
< n
->max_delay
) {
527 di(chosen
->instr
, "dec%s: chose (freed)", mode
);
531 /* Contra the paper, pick a leader with no effect on used regs. This may
532 * open up new opportunities, as otherwise a single-operand instr consuming
533 * a value will tend to block finding freeing that value. This had a
534 * massive effect on reducing spilling on V3D.
536 * XXX: Should this prioritize ready?
538 foreach_sched_node (n
, &ctx
->dag
->heads
) {
539 if (avoid_sync
&& would_sync(ctx
, n
->instr
))
542 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, false, false);
547 if (live_effect(n
->instr
) > 0)
550 if (!check_instr(ctx
, notes
, n
->instr
))
553 if (!chosen
|| chosen
->max_delay
< n
->max_delay
)
558 di(chosen
->instr
, "dec%s: chose (neutral+ready)", mode
);
562 foreach_sched_node (n
, &ctx
->dag
->heads
) {
563 if (avoid_sync
&& would_sync(ctx
, n
->instr
))
566 if (live_effect(n
->instr
) > 0)
569 if (!check_instr(ctx
, notes
, n
->instr
))
572 if (!chosen
|| chosen
->max_delay
< n
->max_delay
)
577 di(chosen
->instr
, "dec%s: chose (neutral)", mode
);
581 return choose_instr_inc(ctx
, notes
, avoid_sync
, true);
585 * When we can't choose an instruction that reduces register pressure or
586 * is neutral, we end up here to try and pick the least bad option.
588 static struct ir3_sched_node
*
589 choose_instr_inc(struct ir3_sched_ctx
*ctx
, struct ir3_sched_notes
*notes
,
590 bool avoid_sync
, bool avoid_output
)
592 const char *mode
= avoid_sync
? "-as" : "";
593 struct ir3_sched_node
*chosen
= NULL
;
596 * From hear on out, we are picking something that increases
597 * register pressure. So try to pick something which will
600 unsigned chosen_distance
= 0;
602 /* Pick the max delay of the remaining ready set. */
603 foreach_sched_node (n
, &ctx
->dag
->heads
) {
604 if (avoid_output
&& n
->output
)
607 if (avoid_sync
&& would_sync(ctx
, n
->instr
))
610 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, false, false);
615 if (!check_instr(ctx
, notes
, n
->instr
))
618 unsigned distance
= nearest_use(n
->instr
);
620 if (!chosen
|| distance
< chosen_distance
) {
622 chosen_distance
= distance
;
627 di(chosen
->instr
, "inc%s: chose (distance+ready)", mode
);
631 /* Pick the max delay of the remaining leaders. */
632 foreach_sched_node (n
, &ctx
->dag
->heads
) {
633 if (avoid_output
&& n
->output
)
636 if (avoid_sync
&& would_sync(ctx
, n
->instr
))
639 if (!check_instr(ctx
, notes
, n
->instr
))
642 unsigned distance
= nearest_use(n
->instr
);
644 if (!chosen
|| distance
< chosen_distance
) {
646 chosen_distance
= distance
;
651 di(chosen
->instr
, "inc%s: chose (distance)", mode
);
658 /* Handles instruction selections for instructions we want to prioritize
659 * even if csp/csr would not pick them.
661 static struct ir3_sched_node
*
662 choose_instr_prio(struct ir3_sched_ctx
*ctx
, struct ir3_sched_notes
*notes
)
664 struct ir3_sched_node
*chosen
= NULL
;
666 foreach_sched_node (n
, &ctx
->dag
->heads
) {
667 if (!is_meta(n
->instr
))
670 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
675 di(chosen
->instr
, "prio: chose (meta)");
683 dump_state(struct ir3_sched_ctx
*ctx
)
688 foreach_sched_node (n
, &ctx
->dag
->heads
) {
689 di(n
->instr
, "maxdel=%3d le=%d del=%u ",
690 n
->max_delay
, live_effect(n
->instr
),
691 ir3_delay_calc(ctx
->block
, n
->instr
, false, false));
693 util_dynarray_foreach(&n
->dag
.edges
, struct dag_edge
, edge
) {
694 struct ir3_sched_node
*child
= (struct ir3_sched_node
*)edge
->child
;
696 di(child
->instr
, " -> (%d parents) ", child
->dag
.parent_count
);
701 /* find instruction to schedule: */
702 static struct ir3_instruction
*
703 choose_instr(struct ir3_sched_ctx
*ctx
, struct ir3_sched_notes
*notes
)
705 struct ir3_sched_node
*chosen
;
709 chosen
= choose_instr_prio(ctx
, notes
);
711 return chosen
->instr
;
713 chosen
= choose_instr_dec(ctx
, notes
, true);
715 return chosen
->instr
;
717 chosen
= choose_instr_dec(ctx
, notes
, false);
719 return chosen
->instr
;
721 chosen
= choose_instr_inc(ctx
, notes
, false, false);
723 return chosen
->instr
;
/* Clone an instruction and create a sched node for the clone, so it can
 * re-materialize an addr/pred write that is already scheduled.
 */
static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
	di(new_instr, "split instruction");
	sched_node_init(ctx, new_instr);
	return new_instr;
}
737 /* "spill" the address registers by remapping any unscheduled
738 * instructions which depend on the current address register
739 * to a clone of the instruction which wrote the address reg.
741 static struct ir3_instruction
*
742 split_addr(struct ir3_sched_ctx
*ctx
, struct ir3_instruction
**addr
,
743 struct ir3_instruction
**users
, unsigned users_count
)
745 struct ir3_instruction
*new_addr
= NULL
;
750 for (i
= 0; i
< users_count
; i
++) {
751 struct ir3_instruction
*indirect
= users
[i
];
756 /* skip instructions already scheduled: */
757 if (is_scheduled(indirect
))
760 /* remap remaining instructions using current addr
763 if (indirect
->address
== *addr
) {
765 new_addr
= split_instr(ctx
, *addr
);
766 /* original addr is scheduled, but new one isn't: */
767 new_addr
->flags
&= ~IR3_INSTR_MARK
;
769 indirect
->address
= new_addr
;
770 /* don't need to remove old dag edge since old addr is
773 sched_node_add_dep(indirect
, new_addr
, 0);
774 di(indirect
, "new address");
778 /* all remaining indirects remapped to new addr: */
784 /* "spill" the predicate register by remapping any unscheduled
785 * instructions which depend on the current predicate register
786 * to a clone of the instruction which wrote the address reg.
788 static struct ir3_instruction
*
789 split_pred(struct ir3_sched_ctx
*ctx
)
792 struct ir3_instruction
*new_pred
= NULL
;
795 debug_assert(ctx
->pred
);
797 ir
= ctx
->pred
->block
->shader
;
799 for (i
= 0; i
< ir
->predicates_count
; i
++) {
800 struct ir3_instruction
*predicated
= ir
->predicates
[i
];
802 /* skip instructions already scheduled: */
803 if (is_scheduled(predicated
))
806 /* remap remaining instructions using current pred
809 * TODO is there ever a case when pred isn't first
812 if (ssa(predicated
->regs
[1]) == ctx
->pred
) {
814 new_pred
= split_instr(ctx
, ctx
->pred
);
815 /* original pred is scheduled, but new one isn't: */
816 new_pred
->flags
&= ~IR3_INSTR_MARK
;
818 predicated
->regs
[1]->instr
= new_pred
;
819 /* don't need to remove old dag edge since old pred is
822 sched_node_add_dep(predicated
, new_pred
, 0);
823 di(predicated
, "new predicate");
827 /* all remaining predicated remapped to new pred: */
834 sched_node_init(struct ir3_sched_ctx
*ctx
, struct ir3_instruction
*instr
)
836 struct ir3_sched_node
*n
= rzalloc(ctx
->dag
, struct ir3_sched_node
);
838 dag_init_node(ctx
->dag
, &n
->dag
);
845 sched_node_add_dep(struct ir3_instruction
*instr
, struct ir3_instruction
*src
, int i
)
847 /* don't consider dependencies in other blocks: */
848 if (src
->block
!= instr
->block
)
851 /* we could have false-dep's that end up unused: */
852 if (src
->flags
& IR3_INSTR_UNUSED
) {
853 debug_assert(__is_false_dep(instr
, i
));
857 struct ir3_sched_node
*n
= instr
->data
;
858 struct ir3_sched_node
*sn
= src
->data
;
860 /* If src is consumed by a collect, track that to realize that once
861 * any of the collect srcs are live, we should hurry up and schedule
864 if (instr
->opc
== OPC_META_COLLECT
)
867 dag_add_edge(&sn
->dag
, &n
->dag
, NULL
);
869 unsigned d
= ir3_delayslots(src
, instr
, i
, true);
870 n
->delay
= MAX2(n
->delay
, d
);
874 mark_kill_path(struct ir3_instruction
*instr
)
876 struct ir3_sched_node
*n
= instr
->data
;
878 struct ir3_instruction
*src
;
879 foreach_ssa_src (src
, instr
) {
880 if (src
->block
!= instr
->block
)
886 /* Is it an output? */
888 is_output_collect(struct ir3_instruction
*instr
)
890 struct ir3
*ir
= instr
->block
->shader
;
892 for (unsigned i
= 0; i
< ir
->outputs_count
; i
++) {
893 struct ir3_instruction
*collect
= ir
->outputs
[i
];
894 assert(collect
->opc
== OPC_META_COLLECT
);
895 if (instr
== collect
)
902 /* Is it's only use as output? */
904 is_output_only(struct ir3_instruction
*instr
)
906 if (!writes_gpr(instr
))
909 if (!(instr
->regs
[0]->flags
& IR3_REG_SSA
))
912 foreach_ssa_use (use
, instr
)
913 if (!is_output_collect(use
))
920 sched_node_add_deps(struct ir3_instruction
*instr
)
922 struct ir3_instruction
*src
;
924 /* Since foreach_ssa_src() already handles false-dep's we can construct
925 * the DAG easily in a single pass.
927 foreach_ssa_src_n (src
, i
, instr
) {
928 sched_node_add_dep(instr
, src
, i
);
931 /* NOTE that all inputs must be scheduled before a kill, so
932 * mark these to be prioritized as well:
934 if (is_kill(instr
) || is_input(instr
)) {
935 mark_kill_path(instr
);
938 if (is_output_only(instr
)) {
939 struct ir3_sched_node
*n
= instr
->data
;
945 sched_dag_max_delay_cb(struct dag_node
*node
, void *state
)
947 struct ir3_sched_node
*n
= (struct ir3_sched_node
*)node
;
948 uint32_t max_delay
= 0;
950 util_dynarray_foreach(&n
->dag
.edges
, struct dag_edge
, edge
) {
951 struct ir3_sched_node
*child
= (struct ir3_sched_node
*)edge
->child
;
952 max_delay
= MAX2(child
->max_delay
, max_delay
);
955 n
->max_delay
= MAX2(n
->max_delay
, max_delay
+ n
->delay
);
959 sched_dag_init(struct ir3_sched_ctx
*ctx
)
961 ctx
->dag
= dag_create(ctx
);
963 foreach_instr (instr
, &ctx
->unscheduled_list
) {
964 sched_node_init(ctx
, instr
);
965 sched_node_add_deps(instr
);
968 dag_traverse_bottom_up(ctx
->dag
, sched_dag_max_delay_cb
, NULL
);
972 sched_dag_destroy(struct ir3_sched_ctx
*ctx
)
974 ralloc_free(ctx
->dag
);
979 sched_block(struct ir3_sched_ctx
*ctx
, struct ir3_block
*block
)
983 /* addr/pred writes are per-block: */
988 /* move all instructions to the unscheduled list, and
989 * empty the block's instruction list (to which we will
992 list_replace(&block
->instr_list
, &ctx
->unscheduled_list
);
993 list_inithead(&block
->instr_list
);
997 ctx
->remaining_kills
= 0;
998 ctx
->remaining_tex
= 0;
999 foreach_instr_safe (instr
, &ctx
->unscheduled_list
) {
1001 ctx
->remaining_kills
++;
1002 if (is_tex_or_prefetch(instr
))
1003 ctx
->remaining_tex
++;
1006 /* First schedule all meta:input instructions, followed by
1007 * tex-prefetch. We want all of the instructions that load
1008 * values into registers before the shader starts to go
1009 * before any other instructions. But in particular we
1010 * want inputs to come before prefetches. This is because
1011 * a FS's bary_ij input may not actually be live in the
1012 * shader, but it should not be scheduled on top of any
1013 * other input (but can be overwritten by a tex prefetch)
1015 foreach_instr_safe (instr
, &ctx
->unscheduled_list
)
1016 if (instr
->opc
== OPC_META_INPUT
)
1017 schedule(ctx
, instr
);
1019 foreach_instr_safe (instr
, &ctx
->unscheduled_list
)
1020 if (instr
->opc
== OPC_META_TEX_PREFETCH
)
1021 schedule(ctx
, instr
);
1023 while (!list_is_empty(&ctx
->unscheduled_list
)) {
1024 struct ir3_sched_notes notes
= {0};
1025 struct ir3_instruction
*instr
;
1027 instr
= choose_instr(ctx
, ¬es
);
1029 unsigned delay
= ir3_delay_calc(ctx
->block
, instr
, false, false);
1030 d("delay=%u", delay
);
1032 /* and if we run out of instructions that can be scheduled,
1033 * then it is time for nop's:
1035 debug_assert(delay
<= 6);
1041 schedule(ctx
, instr
);
1043 struct ir3_instruction
*new_instr
= NULL
;
1044 struct ir3
*ir
= block
->shader
;
1046 /* nothing available to schedule.. if we are blocked on
1047 * address/predicate register conflict, then break the
1048 * deadlock by cloning the instruction that wrote that
1051 if (notes
.addr0_conflict
) {
1052 new_instr
= split_addr(ctx
, &ctx
->addr0
,
1053 ir
->a0_users
, ir
->a0_users_count
);
1054 } else if (notes
.addr1_conflict
) {
1055 new_instr
= split_addr(ctx
, &ctx
->addr1
,
1056 ir
->a1_users
, ir
->a1_users_count
);
1057 } else if (notes
.pred_conflict
) {
1058 new_instr
= split_pred(ctx
);
1060 d("unscheduled_list:");
1061 foreach_instr (instr
, &ctx
->unscheduled_list
)
1062 di(instr
, "unscheduled: ");
1069 list_delinit(&new_instr
->node
);
1070 list_addtail(&new_instr
->node
, &ctx
->unscheduled_list
);
1075 sched_dag_destroy(ctx
);
1078 int ir3_sched(struct ir3
*ir
)
1080 struct ir3_sched_ctx
*ctx
= rzalloc(NULL
, struct ir3_sched_ctx
);
1082 foreach_block (block
, &ir
->block_list
) {
1083 foreach_instr (instr
, &block
->instr_list
) {
1088 ir3_count_instructions(ir
);
1090 ir3_find_ssa_uses(ir
, ctx
, false);
1092 foreach_block (block
, &ir
->block_list
) {
1093 sched_block(ctx
, block
);
1096 int ret
= ctx
->error
? -1 : 0;
1104 get_array_id(struct ir3_instruction
*instr
)
1106 /* The expectation is that there is only a single array
1107 * src or dst, ir3_cp should enforce this.
1110 for (unsigned i
= 0; i
< instr
->regs_count
; i
++)
1111 if (instr
->regs
[i
]->flags
& IR3_REG_ARRAY
)
1112 return instr
->regs
[i
]->array
.id
;
1114 unreachable("this was unexpected");
1117 /* does instruction 'prior' need to be scheduled before 'instr'? */
1119 depends_on(struct ir3_instruction
*instr
, struct ir3_instruction
*prior
)
1121 /* TODO for dependencies that are related to a specific object, ie
1122 * a specific SSBO/image/array, we could relax this constraint to
1123 * make accesses to unrelated objects not depend on each other (at
1124 * least as long as not declared coherent)
1126 if (((instr
->barrier_class
& IR3_BARRIER_EVERYTHING
) && prior
->barrier_class
) ||
1127 ((prior
->barrier_class
& IR3_BARRIER_EVERYTHING
) && instr
->barrier_class
))
1130 if (instr
->barrier_class
& prior
->barrier_conflict
) {
1131 if (!(instr
->barrier_class
& ~(IR3_BARRIER_ARRAY_R
| IR3_BARRIER_ARRAY_W
))) {
1132 /* if only array barrier, then we can further limit false-deps
1133 * by considering the array-id, ie reads/writes to different
1134 * arrays do not depend on each other (no aliasing)
1136 if (get_array_id(instr
) != get_array_id(prior
)) {
1148 add_barrier_deps(struct ir3_block
*block
, struct ir3_instruction
*instr
)
1150 struct list_head
*prev
= instr
->node
.prev
;
1151 struct list_head
*next
= instr
->node
.next
;
1153 /* add dependencies on previous instructions that must be scheduled
1154 * prior to the current instruction
1156 while (prev
!= &block
->instr_list
) {
1157 struct ir3_instruction
*pi
=
1158 LIST_ENTRY(struct ir3_instruction
, prev
, node
);
1165 if (instr
->barrier_class
== pi
->barrier_class
) {
1166 ir3_instr_add_dep(instr
, pi
);
1170 if (depends_on(instr
, pi
))
1171 ir3_instr_add_dep(instr
, pi
);
1174 /* add dependencies on this instruction to following instructions
1175 * that must be scheduled after the current instruction:
1177 while (next
!= &block
->instr_list
) {
1178 struct ir3_instruction
*ni
=
1179 LIST_ENTRY(struct ir3_instruction
, next
, node
);
1186 if (instr
->barrier_class
== ni
->barrier_class
) {
1187 ir3_instr_add_dep(ni
, instr
);
1191 if (depends_on(ni
, instr
))
1192 ir3_instr_add_dep(ni
, instr
);
1196 /* before scheduling a block, we need to add any necessary false-dependencies
1199 * (1) barriers are scheduled in the right order wrt instructions related
1202 * (2) reads that come before a write actually get scheduled before the
1206 calculate_deps(struct ir3_block
*block
)
1208 foreach_instr (instr
, &block
->instr_list
) {
1209 if (instr
->barrier_class
) {
1210 add_barrier_deps(block
, instr
);
1216 ir3_sched_add_deps(struct ir3
*ir
)
1218 foreach_block (block
, &ir
->block_list
) {
1219 calculate_deps(block
);