freedreno/ir3: try to avoid syncs
src/freedreno/ir3/ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
      printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
   } } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
      printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
      ir3_print_instr(instr); \
   } } while (0)

/*
 * Post RA Instruction Scheduling
 */

struct ir3_postsched_ctx {
   struct ir3_context *ctx;

   void *mem_ctx;
   struct ir3_block *block;           /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */
   struct ir3_instruction *scheduled; /* last scheduled instr */
   struct ir3_instruction *pred;      /* current p0.x user, if any */

   bool error;

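   /* Counts down from 8 after an SFU instruction is scheduled: while it is
    * non-zero, an instruction consuming an SFU result would still need a
    * relatively costly (ss) sync, so would_sync() tries to defer it.
    */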
   int sfu_delay;
};

struct ir3_postsched_node {
   struct dag_node dag;     /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   unsigned delay;
   unsigned max_delay;
};

#define foreach_sched_node(__n, __list) \
   list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

#define foreach_bit(b, mask) \
   for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   debug_assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   if (writes_pred(instr)) {
      ctx->pred = instr;
   }

   di(instr, "schedule");

   list_addtail(&instr->node, &instr->block->instr_list);
   ctx->scheduled = instr;

   if (is_sfu(instr)) {
      ctx->sfu_delay = 8;
   } else if (ctx->sfu_delay > 0) {
      ctx->sfu_delay--;
   }

   struct ir3_postsched_node *n = instr->data;
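   /* Removing the scheduled node from the DAG promotes any of its children
    * whose remaining dependencies are satisfied to the heads (ready) list:
    */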
   dag_prune_head(ctx->dag, &n->dag);
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d ", n->max_delay);

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         struct ir3_postsched_node *child =
            (struct ir3_postsched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay
 * counter, ie. the more cycles it has been since the last SFU, the less
 * costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->sfu_delay) {
      struct ir3_register *reg;
      foreach_src (reg, instr)
         if (reg->instr && is_sfu(reg->instr))
            return true;
   }

   return false;
}

/* find instruction to schedule: */
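/* Candidates are tried in decreasing priority order: meta instructions,
 * then inputs, then kills and expensive (sfu/tex) instructions that are
 * hard-ready, then instructions that can go without forcing an (ss) sync,
 * then soft-ready, then hard-ready, and finally whichever remaining leader
 * has the largest max_delay (longest accumulated delay to the end of the
 * block).
 */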
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

      if (d > 0)
         continue;

      if (!is_kill(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

      if (d > 0)
         continue;

      if (!(is_sfu(n->instr) || is_tex(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
      return chosen->instr;
   }

   /*
    * Sometimes it is better to take a nop rather than schedule an
    * instruction that would require an (ss) shortly after another
    * SFU, ie. if the last SFU was just one or two instructions ago,
    * and we could either take a nop and then schedule something
    * else, or schedule the immediately-available instruction that
    * would require an (ss), we are better off with the nop.
    */
   for (unsigned delay = 0; delay < 4; delay++) {
      foreach_sched_node (n, &ctx->dag->heads) {
         if (would_sync(ctx, n->instr))
            continue;

         unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

         if (d > delay)
            continue;

         if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
      }

      if (chosen) {
         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
         return chosen->instr;
      }
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch, which can be synchronized w/ the
    * sync bit, but we probably do want to schedule some other
    * instructions while we wait):
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nop's,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls.. but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    *
    * TODO should we try to balance cost and delays?  I guess it is
    * a balance between now-nop's and future-nop's?
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

struct ir3_postsched_deps_state {
   struct ir3_context *ctx;

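   /* Whether we are walking the block forward (F) or in reverse (R): */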
   enum { F, R } direction;

   bool merged;

   /* Track the sched node (instruction) that last wrote a given register
    * (in whichever direction we are iterating the block).
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether the
    * half and full precision register files are "merged" (ie. conflict,
    * a6xx+): in that case we treat each full precision dep as two
    * half-precision dependencies, whereas on older gens with separate
    * (non-conflicting) files the first half of the table is used for
    * full precision and the 2nd half for half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx) *({ \
   assert((idx) < ARRAY_SIZE((state)->regs)); \
   &(state)->regs[(idx)]; \
   })

static void
add_dep(struct ir3_postsched_deps_state *state,
      struct ir3_postsched_node *before,
      struct ir3_postsched_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge(&before->dag, &after->dag, NULL);
   } else {
      dag_add_edge(&after->dag, &before->dag, NULL);
   }
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
      struct ir3_postsched_node *node, unsigned num, bool write)
{
   add_dep(state, dep_reg(state, num), node);
   if (write) {
      dep_reg(state, num) = node;
   }
}

/* This is where we handle full vs half-precision, and the potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
      struct ir3_postsched_node *node, const struct ir3_register *reg,
      unsigned num, bool write)
{
   if (state->merged) {
      if (reg->flags & IR3_REG_HALF) {
         /* single conflict in half-reg space: */
         add_single_reg_dep(state, node, num, write);
      } else {
         /* two conflicts in half-reg space: */
         add_single_reg_dep(state, node, 2 * num + 0, write);
         add_single_reg_dep(state, node, 2 * num + 1, write);
      }
   } else {
      if (reg->flags & IR3_REG_HALF)
         num += ARRAY_SIZE(state->regs) / 2;
      add_single_reg_dep(state, node, num, write);
   }
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
      struct ir3_postsched_node *node)
{
   static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
   struct ir3_register *reg;
   int b;

   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      /* NOTE: relative access for a src can be either const or gpr: */
      if (reg->flags & IR3_REG_RELATIV) {
         /* also reads a0.x: */
         add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
      }

      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
         for (unsigned i = 0; i < arr->length; i++) {
            add_reg_dep(state, node, reg, arr->reg + i, false);
         }
      } else {
         foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, false);

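            /* In the forward direction, also record how many delay slots
             * the producer needs before this consumer, so max_delay can
             * weight nodes by critical-path length:
             */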
            struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
            if (dep && (state->direction == F)) {
               unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
               node->delay = MAX2(node->delay, d);
            }
         }
      }
   }

   if (dest_regs(node->instr) == 0)
      return;

   /* And then update the state for what this instruction wrote: */
   reg = node->instr->regs[0];
   if (reg->flags & IR3_REG_RELATIV) {
      /* mark the entire array as written: */
      struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
      for (unsigned i = 0; i < arr->length; i++) {
         add_reg_dep(state, node, reg, arr->reg + i, true);
      }

      /* also reads a0.x: */
      add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
   } else {
      foreach_bit (b, reg->wrmask) {
         add_reg_dep(state, node, reg, reg->num + b, true);
      }
   }
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx->ctx,
      .direction = F,
      .merged = ctx->ctx->compiler->gpu_id >= 600,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx->ctx,
      .direction = R,
      .merged = ctx->ctx->compiler->gpu_id >= 600,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

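/* Called bottom-up over the DAG, so each node's max_delay becomes the
 * accumulated delay along the longest path from this node to the end of
 * the block (its own delay plus the largest max_delay of its children):
 */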
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;
      struct ir3_instruction *src;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge(&sn->dag, &n->dag, NULL);
      }
   }

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->scheduled = NULL;
   ctx->pred = NULL;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_BR:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch)
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr;

      instr = choose_instr(ctx);

      /* this shouldn't happen: */
      if (!instr) {
         ctx->error = true;
         break;
      }

      unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
      d("delay=%u", delay);

      /* and if we run out of instructions that can be scheduled,
       * then it is time for nop's:
       */
      debug_assert(delay <= 6);
      while (delay > 0) {
         ir3_NOP(block);
         delay--;
      }

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}

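/* Detect a same-type mov whose src and dst are the same register (with no
 * relative addressing or src modifiers), ie. a copy that does nothing:
 */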
static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->regs[0]->num != instr->regs[1]->num)
      return false;

   if (instr->regs[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
         IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
         IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
         IR3_REG_EVEN | IR3_REG_POS_INF))
      return false;

   return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * in places where before RA we were not sure it was safe to eliminate
 * them.  We could eliminate these earlier, but sometimes they are
 * tangled up in false-dep's, etc, so it is easier just to let them
 * exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         struct ir3_register *reg;

         foreach_src (reg, instr) {
            if (!reg->instr)
               continue;

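            /* The mov is a no-op, so bypass it: point our src at the
             * instruction feeding the mov, and drop the mov from the
             * block's instruction list:
             */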
            if (is_self_mov(reg->instr)) {
               list_delinit(&reg->instr->node);
               reg->instr = reg->instr->regs[1]->instr;
            }
         }

         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (is_self_mov(instr->deps[i])) {
               list_delinit(&instr->deps[i]->node);
               instr->deps[i] = instr->deps[i]->regs[1]->instr;
            }
         }
      }
   }
}

int
ir3_postsched(struct ir3_context *cctx)
{
   struct ir3_postsched_ctx ctx = {
      .ctx = cctx,
   };

   ir3_remove_nops(cctx->ir);
   cleanup_self_movs(cctx->ir);

   foreach_block (block, &cctx->ir->block_list) {
      sched_block(&ctx, block);
   }

   if (ctx.error)
      return -1;

   return 0;
}