freedreno/ir3/print: print (r) flag
src/freedreno/ir3/ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
      printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
   } } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
      printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
      ir3_print_instr(instr); \
   } } while (0)

/*
 * Post RA Instruction Scheduling
 *
 * Schedules the instructions within each block after register allocation,
 * and inserts the nop's needed to satisfy any remaining hw delay slots.
 */

struct ir3_postsched_ctx {
   struct ir3 *ir;

   void *mem_ctx;
   struct ir3_block *block;           /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   /* Heuristic counters, counted down from 8/10 as instructions are
    * scheduled after an SFU / tex instruction, used to estimate how
    * costly an (ss) / (sy) sync would be right now (see would_sync()):
    */
   int sfu_delay;
   int tex_delay;
};

struct ir3_postsched_node {
   struct dag_node dag;     /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   unsigned delay;
   unsigned max_delay;
};

#define foreach_sched_node(__n, __list) \
   list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

#define foreach_bit(b, mask) \
   for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)
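
/* For example, with a wrmask of 0x5 (.x and .z components written),
 * foreach_bit visits b=0 and then b=2.
 */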

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   debug_assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   list_addtail(&instr->node, &instr->block->instr_list);

   struct ir3_postsched_node *n = instr->data;
   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_sfu(instr)) {
      ctx->sfu_delay = 8;
   } else if (check_src_cond(instr, is_sfu)) {
      ctx->sfu_delay = 0;
   } else if (ctx->sfu_delay > 0) {
      ctx->sfu_delay--;
   }

   if (is_tex_or_prefetch(instr)) {
      ctx->tex_delay = 10;
   } else if (check_src_cond(instr, is_tex_or_prefetch)) {
      ctx->tex_delay = 0;
   } else if (ctx->tex_delay > 0) {
      ctx->tex_delay--;
   }
}
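
/* Example of the counter lifecycle: scheduling an SFU sets sfu_delay=8;
 * each subsequent instruction that neither is an SFU nor consumes an SFU
 * result decrements it, and scheduling a consumer (which pays the (ss)
 * sync) resets it to zero.  So a non-zero sfu_delay means a consumer
 * scheduled now would still sync "early", which would_sync() penalizes.
 */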

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d ", n->max_delay);

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         struct ir3_postsched_node *child =
            (struct ir3_postsched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss)/(sy) sync.  This is limited by the
 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
 * the last SFU or tex instruction, the less costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->sfu_delay) {
      if (check_src_cond(instr, is_sfu))
         return true;
   }

   if (ctx->tex_delay) {
      if (check_src_cond(instr, is_tex_or_prefetch))
         return true;
   }

   return false;
}

/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

      if (d > 0)
         continue;

      if (!is_kill(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

      if (d > 0)
         continue;

      if (!(is_sfu(n->instr) || is_tex(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
      return chosen->instr;
   }

   /*
    * Sometimes it is better to take a nop, rather than scheduling an
    * instruction that would require an (ss) shortly after another
    * SFU.. ie. if the last SFU was just one or two instrs ago, and we
    * could choose between taking a nop and then scheduling something
    * else, vs scheduling the immediately-available instruction that
    * would require (ss), we are better off with the nop.
    */
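   /* Concretely: with sfu_delay still high (the SFU was scheduled only a
    * couple of instructions ago), a candidate reading its result would
    * stall on (ss) for roughly the remaining latency, so tolerating up to
    * a few nop's worth of delay on a non-syncing candidate is the cheaper
    * choice.
    */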
   for (unsigned delay = 0; delay < 4; delay++) {
      foreach_sched_node (n, &ctx->dag->heads) {
         if (would_sync(ctx, n->instr))
            continue;

         unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

         if (d > delay)
            continue;

         if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
      }

      if (chosen) {
         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
         return chosen->instr;
      }
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ a sync
    * bit, but where we probably do want to schedule some other
    * instructions while we wait):
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nop's,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls.. but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    *
    * TODO should we try to balance cost and delays?  I guess it is
    * a balance between now-nop's and future-nop's?
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Track the mapping between sched node (instruction) that last
    * wrote a given register (in whichever direction we are iterating
    * the block).
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether
    * the half and full precision register files are "merged" (ie.
    * conflicting, a6xx+), in which case we treat each full precision
    * dep as two half-precision dependencies, vs older GPUs with
    * separate (non-conflicting) files, in which case the first half
    * of the table is used for full precision and the 2nd half for
    * half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx) *({ \
   assert((idx) < ARRAY_SIZE((state)->regs)); \
   &(state)->regs[(idx)]; \
   })

static void
add_dep(struct ir3_postsched_deps_state *state,
      struct ir3_postsched_node *before,
      struct ir3_postsched_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge(&before->dag, &after->dag, NULL);
   } else {
      dag_add_edge(&after->dag, &before->dag, NULL);
   }
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
      struct ir3_postsched_node *node, unsigned num, bool write)
{
   add_dep(state, dep_reg(state, num), node);
   if (write) {
      dep_reg(state, num) = node;
   }
}

/* This is where we handle full vs half-precision, and the potential
 * conflicts between half and full precision that result in additional
 * dependencies.  The 'reg' arg is really just to know half vs full
 * precision.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
      struct ir3_postsched_node *node, const struct ir3_register *reg,
      unsigned num, bool write)
{
   if (state->merged) {
      if (reg->flags & IR3_REG_HALF) {
         /* single conflict in half-reg space: */
         add_single_reg_dep(state, node, num, write);
      } else {
         /* two conflicts in half-reg space: */
         add_single_reg_dep(state, node, 2 * num + 0, write);
         add_single_reg_dep(state, node, 2 * num + 1, write);
      }
   } else {
      if (reg->flags & IR3_REG_HALF)
         num += ARRAY_SIZE(state->regs) / 2;
      add_single_reg_dep(state, node, num, write);
   }
}
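
/* For example, in the merged (a6xx+) case, a full precision write to r1.x
 * (num==4, since regs are numbered <reg> * 4 + <component>) occupies
 * half-reg slots 8 and 9, which are the slots used by hr2.x and hr2.y, so
 * any later access to those half-regs picks up a dependency on it.  In the
 * non-merged case the same hr2.x instead lands in the second half of the
 * table and never conflicts with full precision regs.
 */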

static void
calculate_deps(struct ir3_postsched_deps_state *state,
      struct ir3_postsched_node *node)
{
   int b;

   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
         for (unsigned i = 0; i < arr->length; i++) {
            add_reg_dep(state, node, reg, arr->reg + i, false);
         }
      } else {
         foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, false);

            struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
            if (dep && (state->direction == F)) {
               unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
               node->delay = MAX2(node->delay, d);
            }
         }
      }
   }

   if (node->instr->address) {
      add_reg_dep(state, node, node->instr->address->regs[0],
            node->instr->address->regs[0]->num,
            false);
   }

   if (dest_regs(node->instr) == 0)
      return;

   /* And then update the state for what this instruction
    * wrote:
    */
   struct ir3_register *reg = node->instr->regs[0];
   if (reg->flags & IR3_REG_RELATIV) {
      /* mark the entire array as written: */
      struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
      for (unsigned i = 0; i < arr->length; i++) {
         add_reg_dep(state, node, reg, arr->reg + i, true);
      }
   } else {
      foreach_bit (b, reg->wrmask) {
         add_reg_dep(state, node, reg, reg->num + b, true);
      }
   }
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->ir->compiler->gpu_id >= 600,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->ir->compiler->gpu_id >= 600,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}
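
/* Between them, the two passes cover the register hazards: the forward
 * pass adds read-after-write and write-after-write edges (and records the
 * delay a consumer needs after its producer), while the reverse pass, by
 * walking the block backwards, makes each instruction that overwrites a
 * register a child of the earlier readers of that register, ie. the
 * write-after-read edges.
 */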

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
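
/* After the bottom-up traversal, a node's max_delay is roughly the number
 * of delay cycles on the longest dependency chain from it to the end of
 * the block, so choose_instr()'s preference for the largest max_delay
 * keeps the critical path moving.
 */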

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To avoid expensive texture fetches, etc, being moved ahead of
    * kills, track the kills we've seen so far, so we can add an extra
    * dependency on them for tex/mem instructions.
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge(&sn->dag, &n->dag, NULL);
      }

      if (is_kill(instr)) {
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge(&kn->dag, &n->dag, NULL);
         }
      }
   }

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->tex_delay = 0;
   ctx->sfu_delay = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (into which we will
    * be inserting):
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_B:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * an FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch).
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
      d("delay=%u", delay);

      /* and if we run out of instructions that can be scheduled,
       * then it is time for nop's:
       */
      debug_assert(delay <= 6);
      while (delay > 0) {
         ir3_NOP(block);
         delay--;
      }

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}

static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->regs[0]->num != instr->regs[1]->num)
      return false;

   if (instr->regs[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
         IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
         IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
         IR3_REG_EVEN | IR3_REG_POS_INF))
      return false;

   return true;
}
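
/* For example, mov.u32u32 r1.y, r1.y is a self-mov and can be dropped,
 * whereas mov.f32f32 r1.y, (neg)r1.y is not, since the src modifier makes
 * it meaningful.
 */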

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of places where before RA we are not sure that it is
 * safe to eliminate them.  We could eliminate these earlier, but
 * sometimes they are tangled up in false-dep's, etc, so it is easier
 * just to let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            if (!reg->instr)
               continue;

            if (is_self_mov(reg->instr)) {
               list_delinit(&reg->instr->node);
               reg->instr = reg->instr->regs[1]->instr;
            }
         }

         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               list_delinit(&instr->deps[i]->node);
               instr->deps[i] = instr->deps[i]->regs[1]->instr;
            }
         }
      }
   }
}

bool
ir3_postsched(struct ir3 *ir)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
   };

   ir3_remove_nops(ir);
   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}