freedreno/ir3: Sync some new changes from envytools.
[mesa.git] src/freedreno/ir3/ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)

/*
 * Post RA Instruction Scheduling
 */

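/* Overview: scheduling is done one basic block at a time.  For each block
 * we build a DAG whose edges are the post-RA register dependencies (plus
 * false-deps and kill ordering), then repeatedly pick a ready DAG head
 * using the heuristics in choose_instr(), inserting nop's to cover any
 * delay cycles the chosen instruction still needs.
 */
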
struct ir3_postsched_ctx {
	struct ir3_context *ctx;

	void *mem_ctx;
	struct ir3_block *block;           /* the current block */
	struct dag *dag;

	struct list_head unscheduled_list; /* unscheduled instructions */

	bool error;

	int sfu_delay;
};

struct ir3_postsched_node {
	struct dag_node dag;     /* must be first for util_dynarray_foreach */
	struct ir3_instruction *instr;
	bool partially_evaluated_path;

	unsigned delay;
	unsigned max_delay;
};

#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

#define foreach_bit(b, mask) \
	for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)

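/* Commit an instruction: move it from the unscheduled list to the end of
 * the block, update the SFU-distance counter used by would_sync(), and
 * prune its node from the DAG so that its consumers can become ready.
 */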
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* remove from unscheduled_list:
	 */
	list_delinit(&instr->node);

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);

	if (is_sfu(instr)) {
		ctx->sfu_delay = 8;
	} else if (ctx->sfu_delay > 0) {
		ctx->sfu_delay--;
	}

	struct ir3_postsched_node *n = instr->data;
	dag_prune_head(ctx->dag, &n->dag);
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
	if (!SCHED_DEBUG)
		return;

	foreach_sched_node (n, &ctx->dag->heads) {
		di(n->instr, "maxdel=%3d ", n->max_delay);

		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
			struct ir3_postsched_node *child =
				(struct ir3_postsched_node *)edge->child;

			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
		}
	}
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay
 * counter, ie. the more cycles it has been since the last SFU, the less
 * costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	if (ctx->sfu_delay) {
		struct ir3_register *reg;
		foreach_src (reg, instr)
			if (reg->instr && is_sfu(reg->instr))
				return true;
	}

	return false;
}

/* find instruction to schedule: */
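/* Roughly, the priority order implemented below: meta instructions first,
 * then inputs, then kills and sfu/tex that are ready without nop's, then
 * "soft ready" candidates that would not force an (ss) sync, then anything
 * soft/hard ready, and finally whichever leader has the largest max_delay.
 */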
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_node *chosen = NULL;

	dump_state(ctx);

	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_meta(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (meta)");
		return chosen->instr;
	}

	/* Try to schedule inputs with a higher priority, if possible, as
	 * the last bary.f unlocks varying storage to unblock more VS
	 * warps.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_input(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (input)");
		return chosen->instr;
	}

	/* Next prioritize discards: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!is_kill(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (kill, hard ready)");
		return chosen->instr;
	}

	/* Next prioritize expensive instructions: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!(is_sfu(n->instr) || is_tex(n->instr)))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
		return chosen->instr;
	}

	/*
	 * Sometimes it is better to take a nop, rather than scheduling an
	 * instruction that would require an (ss) shortly after another
	 * SFU.  Ie. if the last SFU was just one or two instructions ago,
	 * and we could either take a nop and then schedule something
	 * else, or schedule the immediately available instruction that
	 * would require an (ss), we are better off with the nop.
	 */
	for (unsigned delay = 0; delay < 4; delay++) {
		foreach_sched_node (n, &ctx->dag->heads) {
			if (would_sync(ctx, n->instr))
				continue;

			unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

			if (d > delay)
				continue;

			if (!chosen || (chosen->max_delay < n->max_delay))
				chosen = n;
		}

		if (chosen) {
			di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
			return chosen->instr;
		}
	}

	/* Next try to find a ready leader w/ soft delay (ie. including extra
	 * delay for things like tex fetch, which can be synchronized w/ the
	 * sync bit, but where we probably do want to schedule some other
	 * instructions while we wait).
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (soft ready)");
		return chosen->instr;
	}

	/* Next try to find a ready leader that can be scheduled without nop's,
	 * which in the case of things that need (sy)/(ss) could result in
	 * stalls.. but we've already decided there is not a better option.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (hard ready)");
		return chosen->instr;
	}

	/* Otherwise choose leader with maximum cost:
	 *
	 * TODO should we try to balance cost and delays?  I guess it is
	 * a balance between now-nop's and future-nop's?
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (leader)");
		return chosen->instr;
	}

	return NULL;
}

struct ir3_postsched_deps_state {
	struct ir3_context *ctx;

	enum { F, R } direction;

	bool merged;

	/* Track, for each register, the sched node (instruction) that last
	 * wrote it (in whichever direction we are iterating the block).
	 *
	 * Note, this table is twice as big as the # of regs, to deal with
	 * half-precision regs.  The approach differs depending on whether
	 * the half and full precision register files are "merged" (conflict,
	 * ie. a6xx+): in the merged case we consider each full precision dep
	 * as two half-precision dependencies, whereas on older gens with
	 * separate (non-conflicting) files the first half of the table is
	 * used for full precision and the 2nd half for half-precision.
	 */
	struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx) *({ \
	assert((idx) < ARRAY_SIZE((state)->regs)); \
	&(state)->regs[(idx)]; \
	})

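/* Add a scheduling edge between the node that last wrote a register slot
 * (per dep_reg) and the current access.  In the forward pass this captures
 * read-after-write and write-after-write ordering; in the reverse pass
 * "before" is really the later (in program order) writer, and the flipped
 * edge direction gives us write-after-read ordering.
 */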
static void
add_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *before,
		struct ir3_postsched_node *after)
{
	if (!before || !after)
		return;

	assert(before != after);

	if (state->direction == F) {
		dag_add_edge(&before->dag, &after->dag, NULL);
	} else {
		dag_add_edge(&after->dag, &before->dag, NULL);
	}
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, unsigned num, bool write)
{
	add_dep(state, dep_reg(state, num), node);
	if (write) {
		dep_reg(state, num) = node;
	}
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, const struct ir3_register *reg,
		unsigned num, bool write)
{
	if (state->merged) {
		if (reg->flags & IR3_REG_HALF) {
			/* single conflict in half-reg space: */
			add_single_reg_dep(state, node, num, write);
		} else {
			/* two conflicts in half-reg space: */
			add_single_reg_dep(state, node, 2 * num + 0, write);
			add_single_reg_dep(state, node, 2 * num + 1, write);
		}
	} else {
		if (reg->flags & IR3_REG_HALF)
			num += ARRAY_SIZE(state->regs) / 2;
		add_single_reg_dep(state, node, num, write);
	}
}

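/* Add the register-level DAG edges for one instruction: its sources
 * (including relative/array reads and the address register) and then its
 * destination(s), updating the last-writer table as we go.  In the forward
 * direction we also accumulate the worst-case delay slots needed from the
 * producers of our sources.
 */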
static void
calculate_deps(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node)
{
	struct ir3_register *reg;
	int b;

	/* Add dependencies on instructions that previously (or next,
	 * in the reverse direction) wrote any of our src registers:
	 */
	foreach_src_n (reg, i, node->instr) {
		if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
			continue;

		if (reg->flags & IR3_REG_RELATIV) {
			/* mark entire array as read: */
			struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
			for (unsigned i = 0; i < arr->length; i++) {
				add_reg_dep(state, node, reg, arr->reg + i, false);
			}
		} else {
			foreach_bit (b, reg->wrmask) {
				add_reg_dep(state, node, reg, reg->num + b, false);

				struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
				if (dep && (state->direction == F)) {
					unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
					node->delay = MAX2(node->delay, d);
				}
			}
		}
	}

	if (node->instr->address) {
		add_reg_dep(state, node, node->instr->address->regs[0],
				node->instr->address->regs[0]->num,
				false);
	}

	if (dest_regs(node->instr) == 0)
		return;

	/* And then afterwards, update the state for what this instruction
	 * wrote:
	 */
	reg = node->instr->regs[0];
	if (reg->flags & IR3_REG_RELATIV) {
		/* mark the entire array as written: */
		struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
		for (unsigned i = 0; i < arr->length; i++) {
			add_reg_dep(state, node, reg, arr->reg + i, true);
		}
	} else {
		foreach_bit (b, reg->wrmask) {
			add_reg_dep(state, node, reg, reg->num + b, true);
		}
	}
}

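/* The forward pass walks the block in program order, so dep_reg() tracks
 * the most recent writer of each register; the reverse pass walks it
 * backwards, so dep_reg() tracks the next writer and the resulting edges
 * cover write-after-read hazards.  Both passes share calculate_deps().
 */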
static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx->ctx,
			.direction = F,
			.merged = ctx->ctx->compiler->gpu_id >= 600,
	};

	foreach_instr (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx->ctx,
			.direction = R,
			.merged = ctx->ctx->compiler->gpu_id >= 600,
	};

	foreach_instr_rev (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

	dag_init_node(ctx->dag, &n->dag);

	n->instr = instr;
	instr->data = n;
}

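/* Bottom-up DAG callback: max_delay becomes the number of delay slots on
 * the longest path from this node to the end of the block, which is what
 * choose_instr() uses as its critical-path priority.
 */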
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
	struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
	uint32_t max_delay = 0;

	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
		struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
		max_delay = MAX2(child->max_delay, max_delay);
	}

	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

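/* Build the DAG for the current block: create a node per instruction,
 * add the forward/reverse register dependencies, the leftover false-dep
 * (SSA) edges and the kill -> tex/mem ordering edges, and finally compute
 * each node's max_delay.
 */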
static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
	ctx->mem_ctx = ralloc_context(NULL);

	ctx->dag = dag_create(ctx->mem_ctx);

	foreach_instr (instr, &ctx->unscheduled_list)
		sched_node_init(ctx, instr);

	calculate_forward_deps(ctx);
	calculate_reverse_deps(ctx);

	/*
	 * To avoid expensive texture fetches, etc, from being moved ahead
	 * of kills, track the kills we've seen so far, so we can add an
	 * extra dependency on them for tex/mem instructions
	 */
	struct util_dynarray kills;
	util_dynarray_init(&kills, ctx->mem_ctx);

	/*
	 * Normal srcs won't be in SSA at this point, those are dealt with in
	 * calculate_forward_deps() and calculate_reverse_deps().  But we still
	 * have the false-dep information in SSA form, so go ahead and add
	 * dependencies for that here:
	 */
	foreach_instr (instr, &ctx->unscheduled_list) {
		struct ir3_postsched_node *n = instr->data;
		struct ir3_instruction *src;

		foreach_ssa_src_n (src, i, instr) {
			/* don't consider dependencies in other blocks: */
			if (src->block != instr->block)
				continue;

			/* we can end up with unused false-deps.. just skip them: */
			if (src->flags & IR3_INSTR_UNUSED)
				continue;

			struct ir3_postsched_node *sn = src->data;

			dag_add_edge(&sn->dag, &n->dag, NULL);
		}

		if (is_kill(instr)) {
			util_dynarray_append(&kills, struct ir3_instruction *, instr);
		} else if (is_tex(instr) || is_mem(instr)) {
			util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
				struct ir3_instruction *kill = *instrp;
				struct ir3_postsched_node *kn = kill->data;
				dag_add_edge(&kn->dag, &n->dag, NULL);
			}
		}
	}

	// TODO do we want to do this after reverse-dependencies?
	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
	ralloc_free(ctx->mem_ctx);
	ctx->mem_ctx = NULL;
	ctx->dag = NULL;
}

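/* Schedule a single block: move its instructions onto the unscheduled
 * list (dropping any existing nop's and branches), build the DAG, give
 * inputs and tex-prefetches priority, and then drain the DAG with
 * choose_instr(), inserting nop's for any remaining delay cycles.
 */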
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
	ctx->block = block;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &ctx->unscheduled_list);
	list_inithead(&block->instr_list);

	// TODO once we are using post-sched for everything we can
	// just not stick in NOP's prior to post-sched, and drop this.
	// for now keep this, since it makes post-sched optional:
	foreach_instr_safe (instr, &ctx->unscheduled_list) {
		switch (instr->opc) {
		case OPC_NOP:
		case OPC_B:
		case OPC_JUMP:
			list_delinit(&instr->node);
			break;
		default:
			break;
		}
	}

	sched_dag_init(ctx);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 */
	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	while (!list_is_empty(&ctx->unscheduled_list)) {
		struct ir3_instruction *instr;

		instr = choose_instr(ctx);

		/* this shouldn't happen: */
		if (!instr) {
			ctx->error = true;
			break;
		}

		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
		d("delay=%u", delay);

		/* and if we run out of instructions that can be scheduled,
		 * then it is time for nop's:
		 */
		debug_assert(delay <= 6);
		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		schedule(ctx, instr);
	}

	sched_dag_destroy(ctx);
}

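/* A "self mov" is a plain same-type mov whose src and dst are the same
 * (non-relative, unmodified) register, ie. a copy that does nothing.
 */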
static bool
is_self_mov(struct ir3_instruction *instr)
{
	if (!is_same_type_mov(instr))
		return false;

	if (instr->regs[0]->num != instr->regs[1]->num)
		return false;

	if (instr->regs[0]->flags & IR3_REG_RELATIV)
		return false;

	if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
			IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
			IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
			IR3_REG_EVEN | IR3_REG_POS_INF))
		return false;

	return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of cases where, before RA, we were not sure that it was
 * safe to eliminate them.  We could eliminate these earlier, but sometimes
 * they are tangled up in false-dep's, etc, so it is easier just to
 * let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr_safe (instr, &block->instr_list) {
			struct ir3_register *reg;

			foreach_src (reg, instr) {
				if (!reg->instr)
					continue;

				if (is_self_mov(reg->instr)) {
					list_delinit(&reg->instr->node);
					reg->instr = reg->instr->regs[1]->instr;
				}
			}

			for (unsigned i = 0; i < instr->deps_count; i++) {
				if (instr->deps[i] && is_self_mov(instr->deps[i])) {
					list_delinit(&instr->deps[i]->node);
					instr->deps[i] = instr->deps[i]->regs[1]->instr;
				}
			}
		}
	}
}

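/* Pass entry point: clean up nop's and self-mov's left over from earlier
 * stages, then schedule each block.  Returns -1 if scheduling got stuck
 * (which shouldn't happen), 0 otherwise.
 */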
int
ir3_postsched(struct ir3_context *cctx)
{
	struct ir3_postsched_ctx ctx = {
			.ctx = cctx,
	};

	ir3_remove_nops(cctx->ir);
	cleanup_self_movs(cctx->ir);

	foreach_block (block, &cctx->ir->block_list) {
		sched_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}