2 * Copyright (C) 2019 Google, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * Rob Clark <robclark@freedesktop.org>
29 #include "util/u_math.h"
32 #include "ir3_compiler.h"
33 #include "ir3_context.h"
/*
 * Debug helpers: SCHED_DEBUG gates this pass's trace output via the
 * IR3_DBG_SCHEDMSGS flag; d() prints a formatted "PSCHED:" line and
 * di() prints a "PSCHED:" prefix followed by the instruction itself
 * via ir3_print_instr().
 * NOTE(review): the "} } while (0)" terminators of d()/di() were lost
 * in extraction -- surviving macro text kept byte-identical.
 */
36 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
40 #define d(fmt, ...) do { if (SCHED_DEBUG) { \
41 printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
44 #define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
45 printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
46 ir3_print_instr(instr); \
/*
 * Post-RA scheduling context: the compile context, the block currently
 * being scheduled, and the list of instructions not yet placed back
 * into the block.
 * NOTE(review): fields referenced elsewhere in this file (mem_ctx, dag,
 * sfu_delay, tex_delay) were dropped by the extraction and are not
 * visible here.
 */
50 * Post RA Instruction Scheduling
53 struct ir3_postsched_ctx
{
54 struct ir3_context
*ctx
;
57 struct ir3_block
*block
; /* the current block */
60 struct list_head unscheduled_list
; /* unscheduled instructions */
/*
 * Per-instruction scheduler node, embedded in the dependency DAG.
 * The dag_node member must be first so the node can be recovered by a
 * cast from a struct dag_node pointer (see sched_dag_max_delay_cb).
 * NOTE(review): the delay/max_delay fields used elsewhere in this file
 * are missing from this extracted view.
 */
68 struct ir3_postsched_node
{
69 struct dag_node dag
; /* must be first for util_dynarray_foreach */
70 struct ir3_instruction
*instr
;
71 bool partially_evaluated_path
;
/*
 * foreach_sched_node: iterate ir3_postsched_node entries linked through
 * their dag.link list member.
 * foreach_bit: iterate the set-bit positions of 'mask' (asserted to be
 * non-zero), assigning each position to 'b' via u_bit_scan().
 */
77 #define foreach_sched_node(__n, __list) \
78 list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)
80 #define foreach_bit(b, mask) \
81 for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)
/*
 * Commit one chosen instruction: unlink it from ctx->unscheduled_list,
 * append it to the tail of its block's instr_list, and prune its node
 * from the DAG so its successors can become scheduling candidates.
 * The trailing (truncated) branches update the ctx sfu/tex delay
 * bookkeeping depending on whether the instruction is, or consumes the
 * result of, an SFU or tex/prefetch instruction.
 * NOTE(review): several original lines (branch bodies, closing braces,
 * the counter updates themselves) were lost in extraction; the
 * surviving text below is kept byte-identical.
 */
84 schedule(struct ir3_postsched_ctx
*ctx
, struct ir3_instruction
*instr
)
86 debug_assert(ctx
->block
== instr
->block
);
88 /* remove from unscheduled_list:
90 list_delinit(&instr
->node
);
92 di(instr
, "schedule");
94 list_addtail(&instr
->node
, &instr
->block
->instr_list
);
96 struct ir3_postsched_node
*n
= instr
->data
;
97 dag_prune_head(ctx
->dag
, &n
->dag
);
99 if (is_meta(instr
) && (instr
->opc
!= OPC_META_TEX_PREFETCH
))
104 } else if (check_src_cond(instr
, is_sfu
)) {
106 } else if (ctx
->sfu_delay
> 0) {
110 if (is_tex_or_prefetch(instr
)) {
112 } else if (check_src_cond(instr
, is_tex_or_prefetch
)) {
114 } else if (ctx
->tex_delay
> 0) {
/*
 * Debug dump (via di()) of the current DAG heads: prints each ready
 * node with its max_delay, then each outgoing edge's child node with
 * that child's remaining parent_count.
 */
120 dump_state(struct ir3_postsched_ctx
*ctx
)
125 foreach_sched_node (n
, &ctx
->dag
->heads
) {
126 di(n
->instr
, "maxdel=%3d ", n
->max_delay
);
128 util_dynarray_foreach(&n
->dag
.edges
, struct dag_edge
, edge
) {
129 struct ir3_postsched_node
*child
=
130 (struct ir3_postsched_node
*)edge
->child
;
132 di(child
->instr
, " -> (%d parents) ", child
->dag
.parent_count
);
/*
 * NOTE(review): the checks below test whether 'instr' consumes an SFU
 * result while ctx->sfu_delay is still counting down, or a
 * tex/prefetch result while ctx->tex_delay is still counting down --
 * i.e. whether scheduling it now would force an expensive sync.  The
 * 'return' statements and closing braces were lost in extraction.
 */
137 /* Determine if this is an instruction that we'd prefer not to schedule
138 * yet, in order to avoid an (ss) sync. This is limited by the sfu_delay
139 * counter, ie. the more cycles it has been since the last SFU, the less
140 * costly a sync would be.
143 would_sync(struct ir3_postsched_ctx
*ctx
, struct ir3_instruction
*instr
)
145 if (ctx
->sfu_delay
) {
146 if (check_src_cond(instr
, is_sfu
))
150 if (ctx
->tex_delay
) {
151 if (check_src_cond(instr
, is_tex_or_prefetch
))
/*
 * Pick the next instruction to schedule from the current DAG heads,
 * trying a series of priority classes in order: meta instructions,
 * inputs, kills, expensive sfu/tex instructions, then sync-avoiding
 * candidates (would_sync() false, soft-ready within an increasing
 * delay budget), then any soft-ready candidate, then any hard-ready
 * candidate, and finally the leader with the largest max_delay.
 * Within each class the candidate with the largest max_delay wins.
 * Readiness is computed with ir3_delay_calc() (soft=true includes
 * extra delay for sync-able things like tex fetch).
 * NOTE(review): the 'continue' statements, readiness comparisons
 * against 'd', 'chosen = n' assignments, NULL-checks before the
 * returns, and many closing braces were dropped by the extraction;
 * the surviving text is kept byte-identical.
 */
158 /* find instruction to schedule: */
159 static struct ir3_instruction
*
160 choose_instr(struct ir3_postsched_ctx
*ctx
)
162 struct ir3_postsched_node
*chosen
= NULL
;
166 foreach_sched_node (n
, &ctx
->dag
->heads
) {
167 if (!is_meta(n
->instr
))
170 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
175 di(chosen
->instr
, "prio: chose (meta)");
176 return chosen
->instr
;
179 /* Try to schedule inputs with a higher priority, if possible, as
180 * the last bary.f unlocks varying storage to unblock more VS
183 foreach_sched_node (n
, &ctx
->dag
->heads
) {
184 if (!is_input(n
->instr
))
187 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
192 di(chosen
->instr
, "prio: chose (input)");
193 return chosen
->instr
;
196 /* Next prioritize discards: */
197 foreach_sched_node (n
, &ctx
->dag
->heads
) {
198 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, false, false);
203 if (!is_kill(n
->instr
))
206 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
211 di(chosen
->instr
, "csp: chose (kill, hard ready)");
212 return chosen
->instr
;
215 /* Next prioritize expensive instructions: */
216 foreach_sched_node (n
, &ctx
->dag
->heads
) {
217 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, false, false);
222 if (!(is_sfu(n
->instr
) || is_tex(n
->instr
)))
225 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
230 di(chosen
->instr
, "csp: chose (sfu/tex, hard ready)");
231 return chosen
->instr
;
235 * Sometimes be better to take a nop, rather than scheduling an
236 * instruction that would require an (ss) shortly after another
237 * SFU.. ie. if last SFU was just one or two instr ago, and we
238 * could choose between taking a nop and then scheduling
239 * something else, vs scheduling the immed avail instruction that
240 * would require (ss), we are better with the nop.
242 for (unsigned delay
= 0; delay
< 4; delay
++) {
243 foreach_sched_node (n
, &ctx
->dag
->heads
) {
244 if (would_sync(ctx
, n
->instr
))
247 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, true, false);
252 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
257 di(chosen
->instr
, "csp: chose (soft ready, delay=%u)", delay
);
258 return chosen
->instr
;
262 /* Next try to find a ready leader w/ soft delay (ie. including extra
263 * delay for things like tex fetch which can be synchronized w/ sync
264 * bit (but we probably do want to schedule some other instructions
267 foreach_sched_node (n
, &ctx
->dag
->heads
) {
268 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, true, false);
273 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
278 di(chosen
->instr
, "csp: chose (soft ready)");
279 return chosen
->instr
;
282 /* Next try to find a ready leader that can be scheduled without nop's,
283 * which in the case of things that need (sy)/(ss) could result in
284 * stalls.. but we've already decided there is not a better option.
286 foreach_sched_node (n
, &ctx
->dag
->heads
) {
287 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, false, false);
292 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
297 di(chosen
->instr
, "csp: chose (hard ready)");
298 return chosen
->instr
;
301 /* Otherwise choose leader with maximum cost:
303 * TODO should we try to balance cost and delays? I guess it is
304 * a balance between now-nop's and future-nop's?
306 foreach_sched_node (n
, &ctx
->dag
->heads
) {
307 if (!chosen
|| chosen
->max_delay
< n
->max_delay
)
312 di(chosen
->instr
, "csp: chose (leader)");
313 return chosen
->instr
;
/*
 * State carried through one forward (F) or reverse (R) dependency-
 * calculation pass over the unscheduled instructions.  The regs[] table
 * maps a register slot to the scheduler node that last wrote it in the
 * current iteration direction; it is twice the register-file size to
 * accommodate half-precision regs (see the original comment below).
 * NOTE(review): a 'merged' flag is initialized by the callers
 * (calculate_forward/reverse_deps) but its declaration was lost in
 * extraction.
 */
319 struct ir3_postsched_deps_state
{
320 struct ir3_context
*ctx
;
322 enum { F
, R
} direction
;
326 /* Track the mapping between sched node (instruction) that last
327 * wrote a given register (in whichever direction we are iterating
330 * Note, this table is twice as big as the # of regs, to deal with
331 * half-precision regs. The approach differs depending on whether
332 * the half and full precision register files are "merged" (conflict,
333 * ie. a6xx+) in which case we consider each full precision dep
334 * as two half-precision dependencies, vs older separate (non-
335 * conflicting) in which case the first half of the table is used
336 * for full precision and 2nd half for half-precision.
338 struct ir3_postsched_node
*regs
[2 * 256];
/*
 * Bounds-checked lvalue accessor for state->regs[idx]; asserts idx is
 * in range before yielding the slot (usable for both read and write).
 * NOTE(review): the closing "})" of the statement-expression was lost
 * in extraction.
 */
341 /* bounds checking read/write accessors, since OoB access to stuff on
342 * the stack is gonna cause a bad day.
344 #define dep_reg(state, idx) *({ \
345 assert((idx) < ARRAY_SIZE((state)->regs)); \
346 &(state)->regs[(idx)]; \
/*
 * Add a DAG edge between two scheduler nodes, oriented according to
 * the pass direction: before -> after when iterating forward (F),
 * after -> before when iterating in reverse.  Tolerates NULL for
 * either node (no edge added); a self-edge is asserted against.
 */
350 add_dep(struct ir3_postsched_deps_state
*state
,
351 struct ir3_postsched_node
*before
,
352 struct ir3_postsched_node
*after
)
354 if (!before
|| !after
)
357 assert(before
!= after
);
359 if (state
->direction
== F
) {
360 dag_add_edge(&before
->dag
, &after
->dag
, NULL
);
362 dag_add_edge(&after
->dag
, &before
->dag
, NULL
);
/*
 * Record a dependency between 'node' and the node that currently owns
 * register slot 'num' in the deps table, then make 'node' the new
 * owner of that slot.
 * NOTE(review): the 'write' parameter presumably guards the ownership
 * update (reads should not clobber the last writer) -- the guard line
 * appears to have been lost in extraction; confirm against the full
 * source.
 */
367 add_single_reg_dep(struct ir3_postsched_deps_state
*state
,
368 struct ir3_postsched_node
*node
, unsigned num
, bool write
)
370 add_dep(state
, dep_reg(state
, num
), node
);
372 dep_reg(state
, num
) = node
;
/*
 * Register dependency entry point that resolves half- vs full-
 * precision register conflicts (see the original comment below).
 * On "merged" register files a half reg maps to one slot and a full
 * reg to two adjacent slots (2*num, 2*num+1); on separate files half
 * regs are offset into the second half of the table.
 * NOTE(review): the branch selecting between the merged and separate
 * layouts ('if (state->merged)' and associated braces) was lost in
 * extraction.
 */
376 /* This is where we handled full vs half-precision, and potential conflicts
377 * between half and full precision that result in additional dependencies.
378 * The 'reg' arg is really just to know half vs full precision.
381 add_reg_dep(struct ir3_postsched_deps_state
*state
,
382 struct ir3_postsched_node
*node
, const struct ir3_register
*reg
,
383 unsigned num
, bool write
)
386 if (reg
->flags
& IR3_REG_HALF
) {
387 /* single conflict in half-reg space: */
388 add_single_reg_dep(state
, node
, num
, write
);
390 /* two conflicts in half-reg space: */
391 add_single_reg_dep(state
, node
, 2 * num
+ 0, write
);
392 add_single_reg_dep(state
, node
, 2 * num
+ 1, write
);
395 if (reg
->flags
& IR3_REG_HALF
)
396 num
+= ARRAY_SIZE(state
->regs
) / 2;
397 add_single_reg_dep(state
, node
, num
, write
);
/*
 * Compute register dependencies for one instruction, in whichever
 * direction 'state' is iterating.  First walks the sources: const/
 * immed srcs are skipped, relative srcs mark the whole backing array
 * as read, and normal srcs add a read dep per wrmask bit; in the
 * forward direction node->delay additionally accumulates the worst
 * ir3_delayslots() of any producing instruction.  An address-register
 * dep is added when instr->address is set.  Then (unless the
 * instruction writes no dest regs) the destination is recorded: a
 * relative dst marks the whole array written, otherwise each wrmask
 * bit's register slot is marked written.
 * NOTE(review): 'continue'/'return' statements and closing braces were
 * dropped by the extraction; the surviving text is kept byte-identical.
 */
402 calculate_deps(struct ir3_postsched_deps_state
*state
,
403 struct ir3_postsched_node
*node
)
405 struct ir3_register
*reg
;
408 /* Add dependencies on instructions that previously (or next,
409 * in the reverse direction) wrote any of our src registers:
411 foreach_src_n (reg
, i
, node
->instr
) {
412 if (reg
->flags
& (IR3_REG_CONST
| IR3_REG_IMMED
))
415 if (reg
->flags
& IR3_REG_RELATIV
) {
416 /* mark entire array as read: */
417 struct ir3_array
*arr
= ir3_lookup_array(state
->ctx
->ir
, reg
->array
.id
);
418 for (unsigned i
= 0; i
< arr
->length
; i
++) {
419 add_reg_dep(state
, node
, reg
, arr
->reg
+ i
, false);
422 foreach_bit (b
, reg
->wrmask
) {
423 add_reg_dep(state
, node
, reg
, reg
->num
+ b
, false);
425 struct ir3_postsched_node
*dep
= dep_reg(state
, reg
->num
+ b
);
426 if (dep
&& (state
->direction
== F
)) {
427 unsigned d
= ir3_delayslots(dep
->instr
, node
->instr
, i
, true);
428 node
->delay
= MAX2(node
->delay
, d
);
434 if (node
->instr
->address
) {
435 add_reg_dep(state
, node
, node
->instr
->address
->regs
[0],
436 node
->instr
->address
->regs
[0]->num
,
440 if (dest_regs(node
->instr
) == 0)
443 /* And then after we update the state for what this instruction
446 reg
= node
->instr
->regs
[0];
447 if (reg
->flags
& IR3_REG_RELATIV
) {
448 /* mark the entire array as written: */
449 struct ir3_array
*arr
= ir3_lookup_array(state
->ctx
->ir
, reg
->array
.id
);
450 for (unsigned i
= 0; i
< arr
->length
; i
++) {
451 add_reg_dep(state
, node
, reg
, arr
->reg
+ i
, true);
454 foreach_bit (b
, reg
->wrmask
) {
455 add_reg_dep(state
, node
, reg
, reg
->num
+ b
, true);
/*
 * Forward dependency pass: iterates the unscheduled list front-to-back
 * feeding each instruction's node to calculate_deps().  The deps state
 * marks the register files as "merged" on a6xx+ (gpu_id >= 600).
 * NOTE(review): the .ctx and .direction = F designated initializers
 * were lost in extraction.
 */
461 calculate_forward_deps(struct ir3_postsched_ctx
*ctx
)
463 struct ir3_postsched_deps_state state
= {
466 .merged
= ctx
->ctx
->compiler
->gpu_id
>= 600,
469 foreach_instr (instr
, &ctx
->unscheduled_list
) {
470 calculate_deps(&state
, instr
->data
);
/*
 * Reverse dependency pass: same as calculate_forward_deps() but walks
 * the unscheduled list back-to-front (foreach_instr_rev), so "last
 * writer" in the deps table means the next writer in program order.
 * NOTE(review): the .ctx and .direction = R designated initializers
 * were lost in extraction.
 */
475 calculate_reverse_deps(struct ir3_postsched_ctx
*ctx
)
477 struct ir3_postsched_deps_state state
= {
480 .merged
= ctx
->ctx
->compiler
->gpu_id
>= 600,
483 foreach_instr_rev (instr
, &ctx
->unscheduled_list
) {
484 calculate_deps(&state
, instr
->data
);
/*
 * Allocate (zeroed, on ctx->mem_ctx) a scheduler node for 'instr' and
 * register it with the DAG.
 * NOTE(review): the lines linking n->instr and instr->data to each
 * other were lost in extraction (later code reads instr->data as the
 * node).
 */
489 sched_node_init(struct ir3_postsched_ctx
*ctx
, struct ir3_instruction
*instr
)
491 struct ir3_postsched_node
*n
= rzalloc(ctx
->mem_ctx
, struct ir3_postsched_node
);
493 dag_init_node(ctx
->dag
, &n
->dag
);
/*
 * Bottom-up DAG traversal callback: sets n->max_delay to the largest
 * child max_delay plus this node's own delay -- a critical-path style
 * priority consumed by choose_instr().  The cast from dag_node relies
 * on 'dag' being the first member of ir3_postsched_node.
 */
500 sched_dag_max_delay_cb(struct dag_node
*node
, void *state
)
502 struct ir3_postsched_node
*n
= (struct ir3_postsched_node
*)node
;
503 uint32_t max_delay
= 0;
505 util_dynarray_foreach(&n
->dag
.edges
, struct dag_edge
, edge
) {
506 struct ir3_postsched_node
*child
= (struct ir3_postsched_node
*)edge
->child
;
507 max_delay
= MAX2(child
->max_delay
, max_delay
);
510 n
->max_delay
= MAX2(n
->max_delay
, max_delay
+ n
->delay
);
/*
 * Build the dependency DAG for the current block: allocate the ralloc
 * context and DAG, create a node per unscheduled instruction, run the
 * forward and reverse register-dependency passes, then add false-dep
 * edges still available in SSA form, plus extra edges keeping tex/mem
 * instructions after any kill already seen.  Finally computes each
 * node's max_delay via a bottom-up traversal.
 * NOTE(review): the src->block != instr->block check appears twice in
 * the ssa-src loop below -- the first occurrence looks redundant with
 * the second; 'continue' statements and closing braces were lost in
 * extraction.
 */
514 sched_dag_init(struct ir3_postsched_ctx
*ctx
)
516 ctx
->mem_ctx
= ralloc_context(NULL
);
518 ctx
->dag
= dag_create(ctx
->mem_ctx
);
520 foreach_instr (instr
, &ctx
->unscheduled_list
)
521 sched_node_init(ctx
, instr
);
523 calculate_forward_deps(ctx
);
524 calculate_reverse_deps(ctx
);
527 * To avoid expensive texture fetches, etc, from being moved ahead
528 * of kills, track the kills we've seen so far, so we can add an
529 * extra dependency on them for tex/mem instructions
531 struct util_dynarray kills
;
532 util_dynarray_init(&kills
, ctx
->mem_ctx
);
535 * Normal srcs won't be in SSA at this point, those are dealt with in
536 * calculate_forward_deps() and calculate_reverse_deps(). But we still
537 * have the false-dep information in SSA form, so go ahead and add
538 * dependencies for that here:
540 foreach_instr (instr
, &ctx
->unscheduled_list
) {
541 struct ir3_postsched_node
*n
= instr
->data
;
542 struct ir3_instruction
*src
;
544 foreach_ssa_src_n (src
, i
, instr
) {
545 if (src
->block
!= instr
->block
)
548 /* we can end up with unused false-deps.. just skip them: */
549 if (src
->flags
& IR3_INSTR_UNUSED
)
552 struct ir3_postsched_node
*sn
= src
->data
;
554 /* don't consider dependencies in other blocks: */
555 if (src
->block
!= instr
->block
)
558 dag_add_edge(&sn
->dag
, &n
->dag
, NULL
);
561 if (is_kill(instr
)) {
562 util_dynarray_append(&kills
, struct ir3_instruction
*, instr
);
563 } else if (is_tex(instr
) || is_mem(instr
)) {
564 util_dynarray_foreach(&kills
, struct ir3_instruction
*, instrp
) {
565 struct ir3_instruction
*kill
= *instrp
;
566 struct ir3_postsched_node
*kn
= kill
->data
;
567 dag_add_edge(&kn
->dag
, &n
->dag
, NULL
);
572 // TODO do we want to do this after reverse-dependencies?
573 dag_traverse_bottom_up(ctx
->dag
, sched_dag_max_delay_cb
, NULL
);
/*
 * Free everything allocated by sched_dag_init() in one shot by
 * releasing the ralloc context that owns the DAG and all nodes.
 */
577 sched_dag_destroy(struct ir3_postsched_ctx
*ctx
)
579 ralloc_free(ctx
->mem_ctx
);
/*
 * Schedule one block: move its instructions onto ctx->unscheduled_list
 * and empty the block, strip pre-existing NOPs (the switch whose case
 * labels were lost in extraction), schedule all meta:input then
 * tex-prefetch instructions first (see the original comment), and then
 * repeatedly choose_instr() until the unscheduled list drains --
 * emitting NOPs (lost lines) when the chosen instruction still has
 * delay slots to cover (asserted <= 6).  The DAG is built/destroyed
 * around the main loop.
 * NOTE(review): the sched_dag_init() call, ctx->block assignment, NOP
 * insertion, and several closing braces were lost in extraction; the
 * surviving text is kept byte-identical.
 */
585 sched_block(struct ir3_postsched_ctx
*ctx
, struct ir3_block
*block
)
589 /* move all instructions to the unscheduled list, and
590 * empty the block's instruction list (to which we will
593 list_replace(&block
->instr_list
, &ctx
->unscheduled_list
);
594 list_inithead(&block
->instr_list
);
596 // TODO once we are using post-sched for everything we can
597 // just not stick in NOP's prior to post-sched, and drop this.
598 // for now keep this, since it makes post-sched optional:
599 foreach_instr_safe (instr
, &ctx
->unscheduled_list
) {
600 switch (instr
->opc
) {
604 list_delinit(&instr
->node
);
613 /* First schedule all meta:input instructions, followed by
614 * tex-prefetch. We want all of the instructions that load
615 * values into registers before the shader starts to go
616 * before any other instructions. But in particular we
617 * want inputs to come before prefetches. This is because
618 * a FS's bary_ij input may not actually be live in the
619 * shader, but it should not be scheduled on top of any
620 * other input (but can be overwritten by a tex prefetch)
622 foreach_instr_safe (instr
, &ctx
->unscheduled_list
)
623 if (instr
->opc
== OPC_META_INPUT
)
624 schedule(ctx
, instr
);
626 foreach_instr_safe (instr
, &ctx
->unscheduled_list
)
627 if (instr
->opc
== OPC_META_TEX_PREFETCH
)
628 schedule(ctx
, instr
);
630 while (!list_is_empty(&ctx
->unscheduled_list
)) {
631 struct ir3_instruction
*instr
;
633 instr
= choose_instr(ctx
);
635 /* this shouldn't happen: */
641 unsigned delay
= ir3_delay_calc(ctx
->block
, instr
, false, false);
642 d("delay=%u", delay
);
644 /* and if we run out of instructions that can be scheduled,
645 * then it is time for nop's:
647 debug_assert(delay
<= 6);
653 schedule(ctx
, instr
);
656 sched_dag_destroy(ctx
);
/*
 * True when 'instr' is a same-type mov whose destination and source
 * are the same register number, with no relative addressing on the
 * destination and no const/immed/relative/modifier flags on the source
 * -- i.e. a genuine no-op copy that is safe to delete after RA.
 * NOTE(review): the 'return false;' / 'return true;' lines were lost
 * in extraction.
 */
661 is_self_mov(struct ir3_instruction
*instr
)
663 if (!is_same_type_mov(instr
))
666 if (instr
->regs
[0]->num
!= instr
->regs
[1]->num
)
669 if (instr
->regs
[0]->flags
& IR3_REG_RELATIV
)
672 if (instr
->regs
[1]->flags
& (IR3_REG_CONST
| IR3_REG_IMMED
|
673 IR3_REG_RELATIV
| IR3_REG_FNEG
| IR3_REG_FABS
|
674 IR3_REG_SNEG
| IR3_REG_SABS
| IR3_REG_BNOT
|
675 IR3_REG_EVEN
| IR3_REG_POS_INF
))
/*
 * Remove in-place movs (see the original comment below) from every
 * block: when a src's producing instruction is a self-mov, the mov is
 * unlinked and the reference is rewired to the mov's own source; the
 * same treatment is applied to each entry of the false-dep array.
 * NOTE(review): the '®' character on the first list_delinit() line
 * below is extraction mojibake for '&reg' (the '&reg;' HTML entity was
 * rendered as the registered-trademark sign); restore it when fixing
 * up this file.
 */
681 /* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
682 * as a result of places were before RA we are not sure that it is
683 * safe to eliminate. We could eliminate these earlier, but sometimes
684 * they are tangled up in false-dep's, etc, so it is easier just to
685 * let them exist until after RA
688 cleanup_self_movs(struct ir3
*ir
)
690 foreach_block (block
, &ir
->block_list
) {
691 foreach_instr_safe (instr
, &block
->instr_list
) {
692 struct ir3_register
*reg
;
694 foreach_src (reg
, instr
) {
698 if (is_self_mov(reg
->instr
)) {
699 list_delinit(®
->instr
->node
);
700 reg
->instr
= reg
->instr
->regs
[1]->instr
;
704 for (unsigned i
= 0; i
< instr
->deps_count
; i
++) {
705 if (instr
->deps
[i
] && is_self_mov(instr
->deps
[i
])) {
706 list_delinit(&instr
->deps
[i
]->node
);
707 instr
->deps
[i
] = instr
->deps
[i
]->regs
[1]->instr
;
715 ir3_postsched(struct ir3_context
*cctx
)
717 struct ir3_postsched_ctx ctx
= {
721 ir3_remove_nops(cctx
->ir
);
722 cleanup_self_movs(cctx
->ir
);
724 foreach_block (block
, &cctx
->ir
->block_list
) {
725 sched_block(&ctx
, block
);