src/freedreno/ir3/ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)

/*
 * Post RA Instruction Scheduling
 */
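/*
 * Overview: scheduling runs per-block, after register allocation.  For
 * each block we build a DAG whose edges are the register (and false)
 * dependencies between instructions, then greedily emit ready
 * instructions, trying to minimize nop padding and (ss)/(sy) syncs.
 */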

struct ir3_postsched_ctx {
	struct ir3 *ir;

	struct ir3_shader_variant *v;

	void *mem_ctx;
	struct ir3_block *block;           /* the current block */
	struct dag *dag;

	struct list_head unscheduled_list; /* unscheduled instructions */

	int sfu_delay;
	int tex_delay;
};

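/* Per-instruction scheduling state.  The delay fields drive instruction
 * selection: 'delay' is (roughly) the delay-slot cost of this instruction
 * relative to the producers of its sources, and 'max_delay' accumulates
 * that cost along the longest path to the end of the DAG, so it
 * approximates the remaining critical path below this node.
 */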
struct ir3_postsched_node {
	struct dag_node dag;     /* must be first for util_dynarray_foreach */
	struct ir3_instruction *instr;
	bool partially_evaluated_path;

	unsigned delay;
	unsigned max_delay;
};

#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

#define foreach_bit(b, mask) \
	for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* remove from unscheduled_list:
	 */
	list_delinit(&instr->node);

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);

	struct ir3_postsched_node *n = instr->data;
	dag_prune_head(ctx->dag, &n->dag);

	if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
		return;

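	/* Count down, per scheduled instruction, the remaining latency of the
	 * most recently scheduled SFU and tex/prefetch instructions.  The
	 * counter is reset to zero when a consumer is scheduled anyway (since
	 * the sync has then already been paid for).  would_sync() uses these
	 * counters to avoid scheduling a consumer so close behind its producer
	 * that it would need an (ss)/(sy) sync.
	 */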
	if (is_sfu(instr)) {
		ctx->sfu_delay = 8;
	} else if (check_src_cond(instr, is_sfu)) {
		ctx->sfu_delay = 0;
	} else if (ctx->sfu_delay > 0) {
		ctx->sfu_delay--;
	}

	if (is_tex_or_prefetch(instr)) {
		ctx->tex_delay = 10;
	} else if (check_src_cond(instr, is_tex_or_prefetch)) {
		ctx->tex_delay = 0;
	} else if (ctx->tex_delay > 0) {
		ctx->tex_delay--;
	}
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
	if (!SCHED_DEBUG)
		return;

	foreach_sched_node (n, &ctx->dag->heads) {
		di(n->instr, "maxdel=%3d ", n->max_delay);

		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
			struct ir3_postsched_node *child =
				(struct ir3_postsched_node *)edge->child;

			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
		}
	}
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss) or (sy) sync.  This is limited by the
 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
 * the last SFU or tex fetch, the less costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	if (ctx->sfu_delay) {
		if (check_src_cond(instr, is_sfu))
			return true;
	}

	if (ctx->tex_delay) {
		if (check_src_cond(instr, is_tex_or_prefetch))
			return true;
	}

	return false;
}

/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_node *chosen = NULL;

	dump_state(ctx);

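	/* The passes below are tried in priority order: meta instructions,
	 * then inputs, then kills and expensive (sfu/tex) instructions that
	 * are ready without nop's, then sync-avoiding / soft-ready /
	 * hard-ready candidates, and finally whichever leader has the largest
	 * max_delay.  Within each pass, ties are broken by preferring the
	 * larger max_delay (ie. the longer remaining dependency chain).
	 */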
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_meta(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (meta)");
		return chosen->instr;
	}

	/* Try to schedule inputs with a higher priority, if possible, as
	 * the last bary.f unlocks varying storage to unblock more VS
	 * warps.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_input(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (input)");
		return chosen->instr;
	}

	/* Next prioritize discards: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!is_kill(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (kill, hard ready)");
		return chosen->instr;
	}

	/* Next prioritize expensive instructions: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!(is_sfu(n->instr) || is_tex(n->instr)))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
		return chosen->instr;
	}

	/*
	 * Sometimes it is better to take a nop rather than schedule an
	 * instruction that would require an (ss) shortly after another
	 * SFU, ie. if the last SFU was just one or two instructions ago,
	 * and we could choose between taking a nop and then scheduling
	 * something else, vs scheduling the immediately available
	 * instruction that would require an (ss), we are better off with
	 * the nop.
	 */
	for (unsigned delay = 0; delay < 4; delay++) {
		foreach_sched_node (n, &ctx->dag->heads) {
			if (would_sync(ctx, n->instr))
				continue;

			unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

			if (d > delay)
				continue;

			if (!chosen || (chosen->max_delay < n->max_delay))
				chosen = n;
		}

		if (chosen) {
			di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
			return chosen->instr;
		}
	}

	/* Next try to find a ready leader w/ soft delay (ie. including extra
	 * delay for things like tex fetch, which can be synchronized w/ the
	 * sync bit, but where we probably do want to schedule some other
	 * instructions while we wait).
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (soft ready)");
		return chosen->instr;
	}

	/* Next try to find a ready leader that can be scheduled without nop's,
	 * which in the case of things that need (sy)/(ss) could result in
	 * stalls.. but we've already decided there is not a better option.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (hard ready)");
		return chosen->instr;
	}

	/* Otherwise choose leader with maximum cost:
	 *
	 * TODO should we try to balance cost and delays?  I guess it is
	 * a balance between now-nop's and future-nop's?
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (leader)");
		return chosen->instr;
	}

	return NULL;
}

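/* State for the forward/reverse dependency-calculation passes over a
 * block, used to turn register reads/writes into DAG edges.
 */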
struct ir3_postsched_deps_state {
	struct ir3_postsched_ctx *ctx;

	enum { F, R } direction;

	bool merged;

	/* Track the mapping between sched node (instruction) that last
	 * wrote a given register (in whichever direction we are iterating
	 * the block).
	 *
	 * Note, this table is twice as big as the # of regs, to deal with
	 * half-precision regs.  The approach differs depending on whether
	 * the half and full precision register files are "merged" (ie.
	 * conflicting, a6xx+): in the merged case we consider each full
	 * precision dep as two half-precision dependencies, while in the
	 * older separate (non-conflicting) case the first half of the
	 * table is used for full precision and the 2nd half for
	 * half-precision.
	 */
	struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx) *({ \
	assert((idx) < ARRAY_SIZE((state)->regs)); \
	&(state)->regs[(idx)]; \
	})

static void
add_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *before,
		struct ir3_postsched_node *after)
{
	if (!before || !after)
		return;

	assert(before != after);

	if (state->direction == F) {
		dag_add_edge(&before->dag, &after->dag, NULL);
	} else {
		dag_add_edge(&after->dag, &before->dag, NULL);
	}
}

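/* Add a dependency against the node that last wrote a single register
 * slot; if this access is itself a write, record this node as the new
 * last writer so that later accesses of the slot depend on it.
 */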
static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, unsigned num, bool write)
{
	add_dep(state, dep_reg(state, num), node);
	if (write) {
		dep_reg(state, num) = node;
	}
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 */
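/* For example, with merged register files a full-precision write to
 * r0.y (regid 1) lands in half-reg slots 2 and 3 (ie. it conflicts with
 * hr0.z and hr0.w), whereas on a non-merged file it would just occupy
 * slot 1 in the full-precision half of the table.
 */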
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, const struct ir3_register *reg,
		unsigned num, bool write)
{
	if (state->merged) {
		if (reg->flags & IR3_REG_HALF) {
			/* single conflict in half-reg space: */
			add_single_reg_dep(state, node, num, write);
		} else {
			/* two conflicts in half-reg space: */
			add_single_reg_dep(state, node, 2 * num + 0, write);
			add_single_reg_dep(state, node, 2 * num + 1, write);
		}
	} else {
		if (reg->flags & IR3_REG_HALF)
			num += ARRAY_SIZE(state->regs) / 2;
		add_single_reg_dep(state, node, num, write);
	}
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node)
{
	int b;

	/* Add dependencies on instructions that previously (or next,
	 * in the reverse direction) wrote any of our src registers:
	 */
	foreach_src_n (reg, i, node->instr) {
		if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
			continue;

		if (reg->flags & IR3_REG_RELATIV) {
			/* mark entire array as read: */
			struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
			for (unsigned i = 0; i < arr->length; i++) {
				add_reg_dep(state, node, reg, arr->reg + i, false);
			}
		} else {
			foreach_bit (b, reg->wrmask) {
				add_reg_dep(state, node, reg, reg->num + b, false);

				struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
				if (dep && (state->direction == F)) {
					unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
					node->delay = MAX2(node->delay, d);
				}
			}
		}
	}

	if (node->instr->address) {
		add_reg_dep(state, node, node->instr->address->regs[0],
				node->instr->address->regs[0]->num,
				false);
	}

	if (dest_regs(node->instr) == 0)
		return;

	/* And then after we update the state for what this instruction
	 * wrote:
	 */
	struct ir3_register *reg = node->instr->regs[0];
	if (reg->flags & IR3_REG_RELATIV) {
		/* mark the entire array as written: */
		struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
		for (unsigned i = 0; i < arr->length; i++) {
			add_reg_dep(state, node, reg, arr->reg + i, true);
		}
	} else {
		foreach_bit (b, reg->wrmask) {
			add_reg_dep(state, node, reg, reg->num + b, true);
		}
	}
}

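/* The forward pass catches read-after-write and write-after-write
 * ordering, while the reverse pass (with the edge direction flipped in
 * add_dep()) catches write-after-read, so between the two passes all
 * register hazards within the block become DAG edges.
 */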
static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx,
			.direction = F,
			.merged = ctx->v->mergedregs,
	};

	foreach_instr (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx,
			.direction = R,
			.merged = ctx->v->mergedregs,
	};

	foreach_instr_rev (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

	dag_init_node(ctx->dag, &n->dag);

	n->instr = instr;
	instr->data = n;
}

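/* Callback for dag_traverse_bottom_up(): propagate max_delay up from the
 * children, so that each node's max_delay reflects the accumulated delay
 * along its longest downstream chain, which choose_instr() then uses as
 * the scheduling priority.
 */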
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
	struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
	uint32_t max_delay = 0;

	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
		struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
		max_delay = MAX2(child->max_delay, max_delay);
	}

	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
	ctx->mem_ctx = ralloc_context(NULL);

	ctx->dag = dag_create(ctx->mem_ctx);

	foreach_instr (instr, &ctx->unscheduled_list)
		sched_node_init(ctx, instr);

	calculate_forward_deps(ctx);
	calculate_reverse_deps(ctx);

	/*
	 * To avoid expensive texture fetches, etc, from being moved ahead
	 * of kills, track the kills we've seen so far, so we can add an
	 * extra dependency on them for tex/mem instructions
	 */
	struct util_dynarray kills;
	util_dynarray_init(&kills, ctx->mem_ctx);

	/*
	 * Normal srcs won't be in SSA at this point, those are dealt with in
	 * calculate_forward_deps() and calculate_reverse_deps().  But we still
	 * have the false-dep information in SSA form, so go ahead and add
	 * dependencies for that here:
	 */
	foreach_instr (instr, &ctx->unscheduled_list) {
		struct ir3_postsched_node *n = instr->data;

		foreach_ssa_src_n (src, i, instr) {
			/* don't consider dependencies in other blocks: */
			if (src->block != instr->block)
				continue;

			/* we can end up with unused false-deps.. just skip them: */
			if (src->flags & IR3_INSTR_UNUSED)
				continue;

			struct ir3_postsched_node *sn = src->data;

			dag_add_edge(&sn->dag, &n->dag, NULL);
		}

		if (is_kill(instr)) {
			util_dynarray_append(&kills, struct ir3_instruction *, instr);
		} else if (is_tex(instr) || is_mem(instr)) {
			util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
				struct ir3_instruction *kill = *instrp;
				struct ir3_postsched_node *kn = kill->data;
				dag_add_edge(&kn->dag, &n->dag, NULL);
			}
		}
	}

	// TODO do we want to do this after reverse-dependencies?
	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
	ralloc_free(ctx->mem_ctx);
	ctx->mem_ctx = NULL;
	ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
	ctx->block = block;
	ctx->tex_delay = 0;
	ctx->sfu_delay = 0;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &ctx->unscheduled_list);
	list_inithead(&block->instr_list);

	// TODO once we are using post-sched for everything we can
	// just not stick in NOP's prior to post-sched, and drop this.
	// for now keep this, since it makes post-sched optional:
	foreach_instr_safe (instr, &ctx->unscheduled_list) {
		switch (instr->opc) {
		case OPC_NOP:
		case OPC_B:
		case OPC_JUMP:
			list_delinit(&instr->node);
			break;
		default:
			break;
		}
	}

	sched_dag_init(ctx);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 */
	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

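	/* Main scheduling loop: repeatedly pick the best ready instruction,
	 * pad with however many nop's its hard delay requirement still
	 * needs, and emit it into the block.
	 */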
	while (!list_is_empty(&ctx->unscheduled_list)) {
		struct ir3_instruction *instr = choose_instr(ctx);

		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
		d("delay=%u", delay);

		/* and if we run out of instructions that can be scheduled,
		 * then it is time for nop's:
		 */
		debug_assert(delay <= 6);
		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		schedule(ctx, instr);
	}

	sched_dag_destroy(ctx);
}


static bool
is_self_mov(struct ir3_instruction *instr)
{
	if (!is_same_type_mov(instr))
		return false;

	if (instr->regs[0]->num != instr->regs[1]->num)
		return false;

	if (instr->regs[0]->flags & IR3_REG_RELATIV)
		return false;

	if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
			IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
			IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
			IR3_REG_EVEN | IR3_REG_POS_INF))
		return false;

	return true;
}

/* Sometimes we end up with in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of places where before RA we are not sure that it is
 * safe to eliminate.  We could eliminate these earlier, but sometimes
 * they are tangled up in false-dep's, etc, so it is easier just to
 * let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr_safe (instr, &block->instr_list) {

			foreach_src (reg, instr) {
				if (!reg->instr)
					continue;

				if (is_self_mov(reg->instr)) {
					list_delinit(&reg->instr->node);
					reg->instr = reg->instr->regs[1]->instr;
				}
			}

			for (unsigned i = 0; i < instr->deps_count; i++) {
				if (instr->deps[i] && is_self_mov(instr->deps[i])) {
					list_delinit(&instr->deps[i]->node);
					instr->deps[i] = instr->deps[i]->regs[1]->instr;
				}
			}
		}
	}
}

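/* Pass entry point: drop the nop's inserted by earlier passes and the
 * self-mov's left behind by RA, then (re)schedule each block.
 */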
bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
	struct ir3_postsched_ctx ctx = {
			.ir = ir,
			.v = v,
	};

	ir3_remove_nops(ir);
	cleanup_self_movs(ir);

	foreach_block (block, &ir->block_list) {
		sched_block(&ctx, block);
	}

	return true;
}