/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"

#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)

#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)

/*
 * Instruction Scheduling:
 *
 * A block-level pre-RA scheduler, which works by creating a DAG of
 * instruction dependencies, and heuristically picking a DAG head
 * (instruction with no unscheduled dependencies).
 *
 * Where possible, it tries to pick instructions that avoid nop delay
 * slots, but it will prefer to pick instructions that reduce (or do
 * not increase) the number of live values.
 *
 * If the only possible choices are instructions that increase the
 * number of live values, it will try to pick the one with the earliest
 * consumer (based on pre-sched program order).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
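 *
 * For example (pseudo-asm sketch, not exact ir3 syntax), given two
 * write/read pairs of the address register:
 *
 *    mova a0.x, r1.x            mova a0.x, r2.x
 *    mov  r3.x, c<a0.x + 0>     mov  r4.x, c<a0.x + 0>
 *
 * the second mova must not be scheduled between the first mova and the
 * mov that consumes it, otherwise the first read would observe the
 * wrong a0.x value.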
 *
 * TODO we can detect too-large live_values here.. would be a good place
 * to "spill" cheap things, like move from uniform/immed.  (Constructing
 * list of ssa def consumers before sched pass would make this easier.
 * Also, in general it might be best not to re-use load_immed across
 * blocks.)
 *
 * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
 * the # of immediates in play (or at least that would help with
 * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
 * do this in a nir pass that inserts fneg/etc?  The cp pass should fold
 * these into src modifiers..
 */

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct dag *dag;

	struct list_head unscheduled_list; /* unscheduled instructions */
	struct ir3_instruction *scheduled; /* last scheduled instr */
	struct ir3_instruction *addr0;     /* current a0.x user, if any */
	struct ir3_instruction *addr1;     /* current a1.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */

	int remaining_kills;

	bool error;
};

struct ir3_sched_node {
	struct dag_node dag;     /* must be first for util_dynarray_foreach */
	struct ir3_instruction *instr;

	unsigned delay;
	unsigned max_delay;

	/* For instructions that are a meta:collect src, once we schedule
	 * the first src of the collect, the entire vecN is live (at least
	 * from the PoV of the first RA pass.. the 2nd scalar pass can fill
	 * in some of the gaps, but often not all).  So we want to help out
	 * RA, and realize that as soon as we schedule the first collect
	 * src, there is no penalty to schedule the remainder (ie. they
	 * don't make additional values live).  In fact we'd prefer to
	 * schedule the rest ASAP to minimize the live range of the vecN.
	 *
	 * For instructions that are the src of a collect, we track the
	 * corresponding collect, and mark them as partially live as soon
	 * as any one of the src's is scheduled.
	 */
	struct ir3_instruction *collect;
	bool partially_live;

	/* Is this instruction a direct or indirect dependency for a kill?
	 * If so, we should prioritize it when possible.
	 */
	bool kill_path;

	/* This node represents a shader output.  A semi-common pattern in
	 * shaders is something along the lines of:
	 *
	 *    fragcolor.w = 1.0
	 *
	 * Which we'd prefer to schedule as late as possible, since it
	 * produces a live value that is never killed/consumed.  So detect
	 * outputs up-front, and avoid scheduling them unless they reduce
	 * register pressure (or are at least neutral).
	 */
	bool output;
};

#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link)

static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i);

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

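/* Mark an instruction as scheduled: move it from the unscheduled list
 * to the tail of the block's (re)growing instruction list, and prune it
 * from the DAG so that its consumers can become ready:
 */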
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr0(instr)) {
		debug_assert(ctx->addr0 == NULL);
		ctx->addr0 = instr;
	}

	if (writes_addr1(instr)) {
		debug_assert(ctx->addr1 == NULL);
		ctx->addr1 = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	if (is_kill(instr))
		ctx->remaining_kills--;

	struct ir3_sched_node *n = instr->data;

	/* If this instruction is a meta:collect src, mark the remaining
	 * collect srcs as partially live.
	 */
	if (n->collect) {
		struct ir3_instruction *src;
		foreach_ssa_src (src, n->collect) {
			if (src->block != instr->block)
				continue;
			struct ir3_sched_node *sn = src->data;
			sn->partially_live = true;
		}
	}

	dag_prune_head(ctx->dag, &n->dag);
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr0_conflict, addr1_conflict, pred_conflict;
};

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src (other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	debug_assert(!is_scheduled(instr));

	if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
		/* avoid texture/memory access if we have unscheduled kills
		 * that could make the expensive operation unnecessary.  By
		 * definition, if there are remaining kills, and this instr
		 * is not a dependency of a kill, there are other instructions
		 * that we can choose from.
		 */
		struct ir3_sched_node *n = instr->data;
		if (!n->kill_path)
			return false;
	}

	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * NOTE if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr0(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->a0_users[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	if (writes_addr1(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->a1_users[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr0(instr) && ctx->addr0) {
		debug_assert(ctx->addr0 != instr);
		notes->addr0_conflict = true;
		return false;
	}

	if (writes_addr1(instr) && ctx->addr1) {
		debug_assert(ctx->addr1 != instr);
		notes->addr1_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO we could handle this by false-deps now, probably.
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the instr->ip of the closest use of an instruction, in
 * pre-sched order.  This isn't going to be the same as post-sched
 * order, but it is a reasonable approximation to limit scheduling
 * instructions *too* early.  This is mostly to prevent bad behavior
 * in cases where we have a large number of possible instructions
 * to choose, to avoid creating too much parallelism (ie. blowing
 * up register pressure)
 *
 * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
 */
static unsigned
nearest_use(struct ir3_instruction *instr)
{
	unsigned nearest = ~0;
	foreach_ssa_use (use, instr)
		if (!is_scheduled(use))
			nearest = MIN2(nearest, use->ip);

	/* slight hack.. this heuristic tends to push bary.f's to later
	 * in the shader, closer to their uses.  But we actually would
	 * prefer to get these scheduled earlier, to unlock varying
	 * storage for more VS jobs:
	 */
	if (is_input(instr))
		nearest /= 2;

	return nearest;
}

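/* Count the uses of an instruction that are not yet scheduled: */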
static int
use_count(struct ir3_instruction *instr)
{
	unsigned cnt = 0;
	foreach_ssa_use (use, instr)
		if (!is_scheduled(use))
			cnt++;
	return cnt;
}

/* find net change to live values if instruction were scheduled: */
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_sched_node *n = instr->data;
	struct ir3_instruction *src;
	int new_live = n->partially_live ? 0 : dest_regs(instr);
	int freed_live = 0;

	/* if we schedule something that causes a vecN to be live,
	 * then count all its other components too:
	 */
	if (n->collect)
		new_live *= n->collect->regs_count - 1;

	foreach_ssa_src_n (src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		if (use_count(src) == 1)
			freed_live += dest_regs(src);
	}

	return new_live - freed_live;
}

static struct ir3_sched_node *choose_instr_inc(struct ir3_sched_ctx *ctx,
		struct ir3_sched_notes *notes, bool avoid_output);

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 *
 * Only handles the case of choosing instructions that reduce register
 * pressure or are neutral.
 */
static struct ir3_sched_node *
choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_sched_node *chosen = NULL;

	/* Find a ready inst with regs freed and pick the one with max cost. */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (live_effect(n->instr) > -1)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		if (!chosen || chosen->max_delay < n->max_delay) {
			chosen = n;
		}
	}

	if (chosen) {
		di(chosen->instr, "dec: chose (freed+ready)");
		return chosen;
	}

	/* Find a leader with regs freed and pick the one with max cost. */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (live_effect(n->instr) > -1)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		if (!chosen || chosen->max_delay < n->max_delay) {
			chosen = n;
		}
	}

	if (chosen) {
		di(chosen->instr, "dec: chose (freed)");
		return chosen;
	}

	/* Contra the paper, pick a leader with no effect on used regs.  This may
	 * open up new opportunities, as otherwise a single-operand instr consuming
	 * a value will tend to block freeing that value.  This had a massive
	 * effect on reducing spilling on V3D.
	 *
	 * XXX: Should this prioritize ready?
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (live_effect(n->instr) > 0)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "dec: chose (neutral+ready)");
		return chosen;
	}

	foreach_sched_node (n, &ctx->dag->heads) {
		if (live_effect(n->instr) > 0)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "dec: chose (neutral)");
		return chosen;
	}

	return choose_instr_inc(ctx, notes, true);
}

/**
 * When we can't choose an instruction that reduces register pressure or
 * is neutral, we end up here to try and pick the least bad option.
 */
static struct ir3_sched_node *
choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool avoid_output)
{
	struct ir3_sched_node *chosen = NULL;

	/*
	 * From here on out, we are picking something that increases
	 * register pressure.  So try to pick something which will
	 * be consumed soon:
	 */
	unsigned chosen_distance = 0;

	/* Among the ready set, pick the one whose result is consumed
	 * soonest (smallest nearest-use distance):
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (avoid_output && n->output)
			continue;

		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		unsigned distance = nearest_use(n->instr);

		if (!chosen || distance < chosen_distance) {
			chosen = n;
			chosen_distance = distance;
		}
	}

	if (chosen) {
		di(chosen->instr, "inc: chose (distance+ready)");
		return chosen;
	}

	/* Failing that, pick by nearest use among the remaining leaders: */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (avoid_output && n->output)
			continue;

		if (!check_instr(ctx, notes, n->instr))
			continue;

		unsigned distance = nearest_use(n->instr);

		if (!chosen || distance < chosen_distance) {
			chosen = n;
			chosen_distance = distance;
		}
	}

	if (chosen) {
		di(chosen->instr, "inc: chose (distance)");
		return chosen;
	}

	return NULL;
}

/* Handles instruction selections for instructions we want to prioritize
 * even if csp/csr would not pick them.
 */
static struct ir3_sched_node *
choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_sched_node *chosen = NULL;

	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_meta(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (meta)");
		return chosen;
	}

	return NULL;
}

static void
dump_state(struct ir3_sched_ctx *ctx)
{
	if (!SCHED_DEBUG)
		return;

	foreach_sched_node (n, &ctx->dag->heads) {
		di(n->instr, "maxdel=%3d le=%d del=%u ",
				n->max_delay, live_effect(n->instr),
				ir3_delay_calc(ctx->block, n->instr, false, false));

		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
			struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;

			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
		}
	}
}

/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_sched_node *chosen;

	dump_state(ctx);

	chosen = choose_instr_prio(ctx, notes);
	if (chosen)
		return chosen->instr;

	chosen = choose_instr_dec(ctx, notes);
	if (chosen)
		return chosen->instr;

	chosen = choose_instr_inc(ctx, notes, false);
	if (chosen)
		return chosen->instr;

	return NULL;
}

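/* Clone an instruction and give the clone its own sched node, so that
 * the copy can be scheduled independently of the original:
 */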
static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
	di(new_instr, "split instruction");
	sched_node_init(ctx, new_instr);
	return new_instr;
}

654 /* "spill" the address registers by remapping any unscheduled
655 * instructions which depend on the current address register
656 * to a clone of the instruction which wrote the address reg.
658 static struct ir3_instruction
*
659 split_addr(struct ir3_sched_ctx
*ctx
, struct ir3_instruction
**addr
,
660 struct ir3_instruction
**users
, unsigned users_count
)
662 struct ir3_instruction
*new_addr
= NULL
;
667 for (i
= 0; i
< users_count
; i
++) {
668 struct ir3_instruction
*indirect
= users
[i
];
673 /* skip instructions already scheduled: */
674 if (is_scheduled(indirect
))
677 /* remap remaining instructions using current addr
680 if (indirect
->address
== *addr
) {
682 new_addr
= split_instr(ctx
, *addr
);
683 /* original addr is scheduled, but new one isn't: */
684 new_addr
->flags
&= ~IR3_INSTR_MARK
;
686 indirect
->address
= new_addr
;
687 /* don't need to remove old dag edge since old addr is
690 sched_node_add_dep(indirect
, new_addr
, 0);
691 di(indirect
, "new address");
695 /* all remaining indirects remapped to new addr: */
701 /* "spill" the predicate register by remapping any unscheduled
702 * instructions which depend on the current predicate register
703 * to a clone of the instruction which wrote the address reg.
705 static struct ir3_instruction
*
706 split_pred(struct ir3_sched_ctx
*ctx
)
709 struct ir3_instruction
*new_pred
= NULL
;
712 debug_assert(ctx
->pred
);
714 ir
= ctx
->pred
->block
->shader
;
716 for (i
= 0; i
< ir
->predicates_count
; i
++) {
717 struct ir3_instruction
*predicated
= ir
->predicates
[i
];
719 /* skip instructions already scheduled: */
720 if (is_scheduled(predicated
))
723 /* remap remaining instructions using current pred
726 * TODO is there ever a case when pred isn't first
729 if (ssa(predicated
->regs
[1]) == ctx
->pred
) {
731 new_pred
= split_instr(ctx
, ctx
->pred
);
732 /* original pred is scheduled, but new one isn't: */
733 new_pred
->flags
&= ~IR3_INSTR_MARK
;
735 predicated
->regs
[1]->instr
= new_pred
;
736 /* don't need to remove old dag edge since old pred is
739 sched_node_add_dep(predicated
, new_pred
, 0);
740 di(predicated
, "new predicate");
744 /* all remaining predicated remapped to new pred: */
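/* Allocate a sched node for the instruction, and link the two together: */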
static void
sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);

	dag_init_node(ctx->dag, &n->dag);

	n->instr = instr;
	instr->data = n;
}

static void
sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i)
{
	/* don't consider dependencies in other blocks: */
	if (src->block != instr->block)
		return;

	/* we could have false-dep's that end up unused: */
	if (src->flags & IR3_INSTR_UNUSED) {
		debug_assert(__is_false_dep(instr, i));
		return;
	}

	struct ir3_sched_node *n = instr->data;
	struct ir3_sched_node *sn = src->data;

	/* If src is consumed by a collect, track that to realize that once
	 * any of the collect srcs are live, we should hurry up and schedule
	 * the rest.
	 */
	if (instr->opc == OPC_META_COLLECT)
		sn->collect = instr;

	dag_add_edge(&sn->dag, &n->dag, NULL);

	unsigned d = ir3_delayslots(src, instr, i, true);
	n->delay = MAX2(n->delay, d);
}

static void
mark_kill_path(struct ir3_instruction *instr)
{
	struct ir3_sched_node *n = instr->data;
	n->kill_path = true;

	struct ir3_instruction *src;
	foreach_ssa_src (src, instr) {
		if (src->block != instr->block)
			continue;

		mark_kill_path(src);
	}
}

/* Is it an output? */
static bool
is_output_collect(struct ir3_instruction *instr)
{
	struct ir3 *ir = instr->block->shader;

	for (unsigned i = 0; i < ir->outputs_count; i++) {
		struct ir3_instruction *collect = ir->outputs[i];
		assert(collect->opc == OPC_META_COLLECT);
		if (instr == collect)
			return true;
	}

	return false;
}

/* Is its only use as an output? */
static bool
is_output_only(struct ir3_instruction *instr)
{
	if (!writes_gpr(instr))
		return false;

	if (!(instr->regs[0]->flags & IR3_REG_SSA))
		return false;

	foreach_ssa_use (use, instr)
		if (!is_output_collect(use))
			return false;

	return true;
}

static void
sched_node_add_deps(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	/* Since foreach_ssa_src() already handles false-dep's we can construct
	 * the DAG easily in a single pass.
	 */
	foreach_ssa_src_n (src, i, instr) {
		sched_node_add_dep(instr, src, i);
	}

	/* NOTE that all inputs must be scheduled before a kill, so
	 * mark these to be prioritized as well:
	 */
	if (is_kill(instr) || is_input(instr)) {
		mark_kill_path(instr);
	}

	if (is_output_only(instr)) {
		struct ir3_sched_node *n = instr->data;
		n->output = true;
	}
}

static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
	struct ir3_sched_node *n = (struct ir3_sched_node *)node;
	uint32_t max_delay = 0;

	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
		struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
		max_delay = MAX2(child->max_delay, max_delay);
	}

	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
sched_dag_init(struct ir3_sched_ctx *ctx)
{
	ctx->dag = dag_create(ctx);

	foreach_instr (instr, &ctx->unscheduled_list) {
		sched_node_init(ctx, instr);
		sched_node_add_deps(instr);
	}

	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_sched_ctx *ctx)
{
	ralloc_free(ctx->dag);
	ctx->dag = NULL;
}

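/* Schedule a single block: move everything onto the unscheduled list,
 * build the DAG, then repeatedly commit the best ready instruction (and
 * any nop's needed to cover delay slots) until the list is empty:
 */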
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr0 = NULL;
	ctx->addr1 = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting):
	 */
	list_replace(&block->instr_list, &ctx->unscheduled_list);
	list_inithead(&block->instr_list);

	sched_dag_init(ctx);

	ctx->remaining_kills = 0;
	foreach_instr_safe (instr, &ctx->unscheduled_list) {
		if (is_kill(instr))
			ctx->remaining_kills++;
	}

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 */
	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	while (!list_is_empty(&ctx->unscheduled_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = choose_instr(ctx, &notes);
		if (instr) {
			unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
			d("delay=%u", delay);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;
			struct ir3 *ir = block->shader;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr0_conflict) {
				new_instr = split_addr(ctx, &ctx->addr0,
						ir->a0_users, ir->a0_users_count);
			} else if (notes.addr1_conflict) {
				new_instr = split_addr(ctx, &ctx->addr1,
						ir->a1_users, ir->a1_users_count);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				d("unscheduled_list:");
				foreach_instr (instr, &ctx->unscheduled_list)
					di(instr, "unscheduled: ");
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				list_delinit(&new_instr->node);
				list_addtail(&new_instr->node, &ctx->unscheduled_list);
			}
		}
	}

	sched_dag_destroy(ctx);
}

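/* Main entrypoint: schedule each block of the shader.  Returns -1 if we
 * hit the (not expected in practice) cannot-make-progress error case:
 */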
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);

	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			instr->data = NULL;
		}
	}

	ir3_count_instructions(ir);
	ir3_clear_mark(ir);
	ir3_find_ssa_uses(ir, ctx, false);

	foreach_block (block, &ir->block_list) {
		sched_block(ctx, block);
	}

	int ret = ctx->error ? -1 : 0;

	ralloc_free(ctx);

	return ret;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction:
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			continue;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			continue;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	foreach_instr (instr, &block->instr_list) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		calculate_deps(block);
	}
}