freedreno/ir3: update SFU delay
[mesa.git] / src / freedreno / ir3 / ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)

/*
 * Post RA Instruction Scheduling
 */
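
/* Overview: after register allocation the SSA graph is gone, so each block
 * is re-ordered purely on physical register dependencies.  The block's
 * instructions are moved to an unscheduled list, a DAG of register (and
 * false) dependencies is built, and instructions are picked back out in an
 * order that tries to hide delay slots, inserting nop's only where
 * unavoidable.
 */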

struct ir3_postsched_ctx {
	struct ir3_context *ctx;

	void *mem_ctx;
	struct ir3_block *block;           /* the current block */
	struct dag *dag;

	struct list_head unscheduled_list; /* unscheduled instructions */
	struct ir3_instruction *scheduled; /* last scheduled instr */
	struct ir3_instruction *pred;      /* last instruction to write p0.x, if any */

	bool error;
};

struct ir3_postsched_node {
	struct dag_node dag;               /* must be first for util_dynarray_foreach */
	struct ir3_instruction *instr;
	bool partially_evaluated_path;

	unsigned delay;
	unsigned max_delay;
};

#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

#define foreach_bit(b, mask) \
	for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)

// TODO deduplicate
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

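/* Commit one instruction: drop it from the unscheduled list, append it to
 * the block, remember it as the last-scheduled instruction (and as the last
 * p0.x writer, if applicable), and prune it from the DAG so its consumers
 * can become ready.  A nop is stuffed between back-to-back SFU/mem
 * instructions, per the constraint described below.
 */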
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from unscheduled_list:
	 */
	list_delinit(&instr->node);

	if (writes_pred(instr)) {
		ctx->pred = instr;
	}

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	struct ir3_postsched_node *n = instr->data;
	dag_prune_head(ctx->dag, &n->dag);
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
	if (!SCHED_DEBUG)
		return;

	foreach_sched_node (n, &ctx->dag->heads) {
		di(n->instr, "maxdel=%3d ", n->max_delay);

		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
			struct ir3_postsched_node *child =
				(struct ir3_postsched_node *)edge->child;

			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
		}
	}
}

/* find instruction to schedule: */
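/* Priority order, highest first: meta instructions, then inputs (so varying
 * storage is released as early as possible), then kill instructions and
 * then sfu/tex instructions that are ready without nop's, then anything
 * whose delay is zero even counting "soft" (sy/ss) delays, then anything
 * ready ignoring soft delays, and finally the DAG head with the largest
 * max_delay.
 */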
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_node *chosen = NULL;

	dump_state(ctx);

	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_meta(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (meta)");
		return chosen->instr;
	}

	/* Try to schedule inputs with a higher priority, if possible, as
	 * the last bary.f unlocks varying storage to unblock more VS
	 * warps.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_input(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (input)");
		return chosen->instr;
	}

	/* Next prioritize discards: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!is_kill(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (kill, hard ready)");
		return chosen->instr;
	}

	/* Next prioritize expensive instructions: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!(is_sfu(n->instr) || is_tex(n->instr)))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
		return chosen->instr;
	}

	/* Next try to find a ready leader w/ soft delay (ie. including extra
	 * delay for things like tex fetch, which can be synchronized w/ the
	 * sync bit, but where we probably do want to schedule some other
	 * instructions while we wait):
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (soft ready)");
		return chosen->instr;
	}

	/* Next try to find a ready leader that can be scheduled without nop's,
	 * which in the case of things that need (sy)/(ss) could result in
	 * stalls.. but we've already decided there is not a better option.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (hard ready)");
		return chosen->instr;
	}

	/* Otherwise choose leader with maximum cost:
	 *
	 * TODO should we try to balance cost and delays?  I guess it is
	 * a balance between now-nop's and future-nop's?
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (leader)");
		return chosen->instr;
	}

	return NULL;
}

struct ir3_postsched_deps_state {
	struct ir3_context *ctx;

	enum { F, R } direction;

	bool merged;

	/* Track, for each register, the sched node (instruction) that last
	 * wrote it (in whichever direction we are iterating the block).
	 *
	 * Note, this table is twice as big as the # of regs, to deal with
	 * half-precision regs.  The approach differs depending on whether
	 * the half and full precision register files are "merged" (conflicting,
	 * ie. a6xx+), in which case we consider each full precision dep as
	 * two half-precision dependencies, vs older separate (non-conflicting)
	 * files, in which case the first half of the table is used for full
	 * precision and the 2nd half for half-precision.
	 */
	struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx) *({ \
	assert((idx) < ARRAY_SIZE((state)->regs)); \
	&(state)->regs[(idx)]; \
	})

static void
add_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *before,
		struct ir3_postsched_node *after)
{
	if (!before || !after)
		return;

	assert(before != after);

	if (state->direction == F) {
		dag_add_edge(&before->dag, &after->dag, NULL);
	} else {
		dag_add_edge(&after->dag, &before->dag, NULL);
	}
}

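/* Add a dependency against whatever last wrote a single reg-file slot:
 * both reads and writes depend on the previous writer, and a write then
 * becomes the new "last writer" for that slot.
 */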
static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, unsigned num, bool write)
{
	add_dep(state, dep_reg(state, num), node);
	if (write) {
		dep_reg(state, num) = node;
	}
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, const struct ir3_register *reg,
		unsigned num, bool write)
{
	if (state->merged) {
		if (reg->flags & IR3_REG_HALF) {
			/* single conflict in half-reg space: */
			add_single_reg_dep(state, node, num, write);
		} else {
			/* two conflicts in half-reg space: */
			add_single_reg_dep(state, node, 2 * num + 0, write);
			add_single_reg_dep(state, node, 2 * num + 1, write);
		}
	} else {
		if (reg->flags & IR3_REG_HALF)
			num += ARRAY_SIZE(state->regs) / 2;
		add_single_reg_dep(state, node, num, write);
	}
}

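/* Add DAG edges for one instruction against the last writer of each
 * register it reads or writes (in whichever direction we are walking the
 * block), and in the forward direction accumulate the worst-case delay-slot
 * count from its producers into node->delay.
 */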
static void
calculate_deps(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node)
{
	static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
	struct ir3_register *reg;
	int b;

	/* Add dependencies on instructions that previously (or next,
	 * in the reverse direction) wrote any of our src registers:
	 */
	foreach_src_n (reg, i, node->instr) {
		/* NOTE: relative access for a src can be either const or gpr: */
		if (reg->flags & IR3_REG_RELATIV) {
			/* also reads a0.x: */
			add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
		}

		if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
			continue;

		if (reg->flags & IR3_REG_RELATIV) {
			/* mark entire array as read: */
			struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
			for (unsigned i = 0; i < arr->length; i++) {
				add_reg_dep(state, node, reg, arr->reg + i, false);
			}
		} else {
			foreach_bit (b, reg->wrmask) {
				add_reg_dep(state, node, reg, reg->num + b, false);

				struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
				if (dep && (state->direction == F)) {
					unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
					node->delay = MAX2(node->delay, d);
				}
			}
		}
	}

	if (dest_regs(node->instr) == 0)
		return;

	/* And then update the state for what this instruction
	 * wrote:
	 */
	reg = node->instr->regs[0];
	if (reg->flags & IR3_REG_RELATIV) {
		/* mark the entire array as written: */
		struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
		for (unsigned i = 0; i < arr->length; i++) {
			add_reg_dep(state, node, reg, arr->reg + i, true);
		}

		/* also reads a0.x: */
		add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
	} else {
		foreach_bit (b, reg->wrmask) {
			add_reg_dep(state, node, reg, reg->num + b, true);
		}
	}
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx->ctx,
			.direction = F,
			.merged = ctx->ctx->compiler->gpu_id >= 600,
	};

	foreach_instr (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx->ctx,
			.direction = R,
			.merged = ctx->ctx->compiler->gpu_id >= 600,
	};

	foreach_instr_rev (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

	dag_init_node(ctx->dag, &n->dag);

	n->instr = instr;
	instr->data = n;
}

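/* Bottom-up DAG callback: a node's max_delay becomes its own delay plus the
 * largest max_delay among its children, and is used as the primary
 * scheduling priority in choose_instr().
 */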
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
	struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
	uint32_t max_delay = 0;

	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
		struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
		max_delay = MAX2(child->max_delay, max_delay);
	}

	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

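/* Build the scheduling DAG for the current block: one node per unscheduled
 * instruction, register dependencies in both the forward and reverse
 * directions, plus edges for any false-deps, followed by a bottom-up pass
 * to compute each node's max_delay priority.
 */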
static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
	ctx->mem_ctx = ralloc_context(NULL);

	ctx->dag = dag_create(ctx->mem_ctx);

	foreach_instr (instr, &ctx->unscheduled_list)
		sched_node_init(ctx, instr);

	calculate_forward_deps(ctx);
	calculate_reverse_deps(ctx);

	/*
	 * Normal srcs won't be in SSA at this point, those are dealt with in
	 * calculate_forward_deps() and calculate_reverse_deps().  But we still
	 * have the false-dep information in SSA form, so go ahead and add
	 * dependencies for that here:
	 */
	foreach_instr (instr, &ctx->unscheduled_list) {
		struct ir3_postsched_node *n = instr->data;
		struct ir3_instruction *src;

		foreach_ssa_src_n (src, i, instr) {
			/* we can end up with unused false-deps.. just skip them: */
			if (src->flags & IR3_INSTR_UNUSED)
				continue;

			/* don't consider dependencies in other blocks: */
			if (src->block != instr->block)
				continue;

			struct ir3_postsched_node *sn = src->data;

			dag_add_edge(&sn->dag, &n->dag, NULL);
		}
	}

	// TODO do we want to do this after reverse-dependencies?
	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
	ralloc_free(ctx->mem_ctx);
	ctx->mem_ctx = NULL;
	ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
	ctx->block = block;
	ctx->scheduled = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &ctx->unscheduled_list);
	list_inithead(&block->instr_list);

	// TODO once we are using post-sched for everything we can
	// just not stick in NOP's prior to post-sched, and drop this.
	// for now keep this, since it makes post-sched optional:
	foreach_instr_safe (instr, &ctx->unscheduled_list) {
		switch (instr->opc) {
		case OPC_NOP:
		case OPC_BR:
		case OPC_JUMP:
			list_delinit(&instr->node);
			break;
		default:
			break;
		}
	}

	sched_dag_init(ctx);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 */
	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	while (!list_is_empty(&ctx->unscheduled_list)) {
		struct ir3_instruction *instr;

		instr = choose_instr(ctx);

		/* this shouldn't happen: */
		if (!instr) {
			ctx->error = true;
			break;
		}

		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
		d("delay=%u", delay);

		/* and if we run out of instructions that can be scheduled,
		 * then it is time for nop's:
		 */
		debug_assert(delay <= 6);
		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		schedule(ctx, instr);
	}

	sched_dag_destroy(ctx);
}


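/* Detect a same-type mov whose src and dst are the same (non-relative)
 * GPR with no modifiers, ie. a copy that does nothing after RA.
 */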
static bool
is_self_mov(struct ir3_instruction *instr)
{
	if (!is_same_type_mov(instr))
		return false;

	if (instr->regs[0]->num != instr->regs[1]->num)
		return false;

	if (instr->regs[0]->flags & IR3_REG_RELATIV)
		return false;

	if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
			IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
			IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
			IR3_REG_EVEN | IR3_REG_POS_INF))
		return false;

	return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of places where before RA we are not sure that it is
 * safe to eliminate them.  We could eliminate these earlier, but
 * sometimes they are tangled up in false-dep's, etc, so it is easier
 * just to let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr_safe (instr, &block->instr_list) {
			struct ir3_register *reg;

			foreach_src (reg, instr) {
				if (!reg->instr)
					continue;

				if (is_self_mov(reg->instr)) {
					list_delinit(&reg->instr->node);
					reg->instr = reg->instr->regs[1]->instr;
				}
			}

			for (unsigned i = 0; i < instr->deps_count; i++) {
				if (is_self_mov(instr->deps[i])) {
					list_delinit(&instr->deps[i]->node);
					instr->deps[i] = instr->deps[i]->regs[1]->instr;
				}
			}
		}
	}
}

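/* Pass entry point: strip any existing nop's, clean up self-movs that RA
 * made redundant, and then re-schedule each block.  Returns 0 on success,
 * -1 if scheduling failed.
 */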
int
ir3_postsched(struct ir3_context *cctx)
{
	struct ir3_postsched_ctx ctx = {
			.ctx = cctx,
	};

	ir3_remove_nops(cctx->ir);
	cleanup_self_movs(cctx->ir);

	foreach_block (block, &cctx->ir->block_list) {
		sched_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}