freedreno/ir3: fix addr/pred spilling
[mesa.git] src/freedreno/ir3/ir3_sched.c
1 /*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27
28 #include "util/u_math.h"
29
30 #include "ir3.h"
31
32 /*
33 * Instruction Scheduling:
34 *
  35 * A recursive depth-based scheduling algo. Recursively find an eligible
  36 * instruction to schedule from the deepest instruction (recursing through
  37 * its unscheduled src instructions). Normally this would result in a
38 * lot of re-traversal of the same instructions, so we cache results in
39 * instr->data (and clear cached results that would be no longer valid
40 * after scheduling an instruction).
41 *
42 * There are a few special cases that need to be handled, since sched
43 * is currently independent of register allocation. Usages of address
44 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
45 * if you have two pairs of instructions that write the same special
46 * register and then read it, then those pairs cannot be interleaved.
47 * To solve this, when we are in such a scheduling "critical section",
48 * and we encounter a conflicting write to a special register, we try
49 * to schedule any remaining instructions that use that value first.
50 */
51
52 struct ir3_sched_ctx {
53 struct ir3_block *block; /* the current block */
54 struct list_head depth_list; /* depth sorted unscheduled instrs */
55 struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
56 struct ir3_instruction *addr; /* current a0.x user, if any */
57 struct ir3_instruction *pred; /* current p0.x user, if any */
58 int live_values; /* estimate of current live values */
59 bool error;
60 };
61
62 static bool is_scheduled(struct ir3_instruction *instr)
63 {
64 return !!(instr->flags & IR3_INSTR_MARK);
65 }
66
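/* sfu and mem instructions are lumped together here so that schedule()
 * can avoid emitting two of them back-to-back (it inserts a nop between
 * them instead):
 */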
67 static bool is_sfu_or_mem(struct ir3_instruction *instr)
68 {
69 return is_sfu(instr) || is_mem(instr);
70 }
71
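/* Drop one use from each ssa src of a just-scheduled instruction (looking
 * through meta fanin/fanout, and ignoring false deps and srcs from other
 * blocks). When a src's use-count hits zero its value is dead, so the
 * live-value estimate is reduced by the number of regs it wrote:
 */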
72 static void
73 unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
74 {
75 struct ir3_instruction *src;
76
77 foreach_ssa_src_n(src, n, instr) {
78 if (__is_false_dep(instr, n))
79 continue;
80 if (instr->block != src->block)
81 continue;
82 if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
83 unuse_each_src(ctx, src);
84 } else {
85 debug_assert(src->use_count > 0);
86
87 if (--src->use_count == 0) {
88 ctx->live_values -= dest_regs(src);
89 debug_assert(ctx->live_values >= 0);
90 }
91 }
92 }
93 }
94
95 static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
96 static void use_instr(struct ir3_instruction *instr);
97
  98 /* transfers a use-count to the new instruction, for cases where we
99 * "spill" address or predicate. Note this might cause the
100 * previous instruction that loaded a0.x/p0.x to become live
101 * again, when we previously thought it was dead.
102 */
103 static void
104 transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
105 struct ir3_instruction *new_instr)
106 {
107 struct ir3_instruction *src;
108
109 debug_assert(is_scheduled(orig_instr));
110
111 foreach_ssa_src_n(src, n, new_instr) {
112 if (__is_false_dep(new_instr, n))
113 continue;
114 ctx->live_values += dest_regs(src);
115 use_instr(src);
116 }
117
118 clear_cache(ctx, orig_instr);
119 }
120
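/* use_instr()/use_each_src() add uses, again looking through meta
 * fanin/fanout so that the counts land on the real value producers:
 */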
121 static void
122 use_each_src(struct ir3_instruction *instr)
123 {
124 struct ir3_instruction *src;
125
126 foreach_ssa_src_n(src, n, instr) {
127 if (__is_false_dep(instr, n))
128 continue;
129 use_instr(src);
130 }
131 }
132
133 static void
134 use_instr(struct ir3_instruction *instr)
135 {
136 if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
137 use_each_src(instr);
138 } else {
139 instr->use_count++;
140 }
141 }
142
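/* Account for a newly scheduled instruction: its dst(s) become live and
 * each of its srcs loses a use (possibly becoming dead). Meta
 * fanin/fanout are transparent here and get skipped:
 */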
143 static void
144 update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
145 {
146 if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
147 return;
148
149 ctx->live_values += dest_regs(instr);
150 unuse_each_src(ctx, instr);
151 }
152
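/* (Re)compute use-counts for the whole shader: each instruction's ssa
 * srcs count as uses (with meta fanin/fanout looked through), and shader
 * outputs count as uses too:
 */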
153 static void
154 update_use_count(struct ir3 *ir)
155 {
156 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
157 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
158 instr->use_count = 0;
159 }
160 }
161
162 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
163 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
164 if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
165 continue;
166
167 use_each_src(instr);
168 }
169 }
170
171 /* Shader outputs are also used:
172 */
173 for (unsigned i = 0; i < ir->noutputs; i++) {
174 struct ir3_instruction *out = ir->outputs[i];
175
176 if (!out)
177 continue;
178
179 use_instr(out);
180 }
181 }
182
183 #define NULL_INSTR ((void *)~0)
184
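/* Invalidate cached find_instr_recursive() results stored in instr->data:
 * passing instr==NULL clears everything, otherwise only entries whose
 * cached result is instr (or the NULL_INSTR 'nothing found' marker) are
 * cleared:
 */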
185 static void
186 clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
187 {
188 list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
189 if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
190 instr2->data = NULL;
191 }
192 }
193
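/* Commit an instruction to the block: insert a nop if it would follow
 * another sfu/mem instruction, pull it off the depth list, note a0.x/p0.x
 * writers, mark it scheduled, append it to the block, update the
 * live-value estimate, and invalidate stale cached decisions:
 */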
194 static void
195 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
196 {
197 debug_assert(ctx->block == instr->block);
198
199 /* maybe there is a better way to handle this than just stuffing
200 * a nop.. ideally we'd know about this constraint in the
201 * scheduling and depth calculation..
202 */
203 if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
204 ir3_NOP(ctx->block);
205
206 /* remove from depth list:
207 */
208 list_delinit(&instr->node);
209
210 if (writes_addr(instr)) {
211 debug_assert(ctx->addr == NULL);
212 ctx->addr = instr;
213 }
214
215 if (writes_pred(instr)) {
216 debug_assert(ctx->pred == NULL);
217 ctx->pred = instr;
218 }
219
220 instr->flags |= IR3_INSTR_MARK;
221
222 list_addtail(&instr->node, &instr->block->instr_list);
223 ctx->scheduled = instr;
224
225 update_live_values(ctx, instr);
226
227 if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
228 clear_cache(ctx, NULL);
229 } else {
230 /* invalidate only the necessary entries.. */
231 clear_cache(ctx, instr);
232 }
233 }
234
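/* Pick the deepest instruction remaining in srcs[], clearing its slot so
 * repeated calls walk the array in decreasing depth order (returns NULL
 * once the array is exhausted):
 */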
235 static struct ir3_instruction *
236 deepest(struct ir3_instruction **srcs, unsigned nsrcs)
237 {
238 struct ir3_instruction *d = NULL;
239 unsigned i = 0, id = 0;
240
241 while ((i < nsrcs) && !(d = srcs[id = i]))
242 i++;
243
244 if (!d)
245 return NULL;
246
247 for (; i < nsrcs; i++)
248 if (srcs[i] && (srcs[i]->depth > d->depth))
249 d = srcs[id = i];
250
251 srcs[id] = NULL;
252
253 return d;
254 }
255
256 /**
257 * @block: the block to search in, starting from end; in first pass,
258 * this will be the block the instruction would be inserted into
259 * (but has not yet, ie. it only contains already scheduled
260 * instructions). For intra-block scheduling (second pass), this
261 * would be one of the predecessor blocks.
262 * @instr: the instruction to search for
263 * @maxd: max distance, bail after searching this # of instruction
264 * slots, since it means the instruction we are looking for is
265 * far enough away
266 * @pred: if true, recursively search into predecessor blocks to
267 * find the worst case (shortest) distance (only possible after
 268 * individual blocks are all scheduled)
269 */
270 static unsigned
271 distance(struct ir3_block *block, struct ir3_instruction *instr,
272 unsigned maxd, bool pred)
273 {
274 unsigned d = 0;
275
276 list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
277 if ((n == instr) || (d >= maxd))
278 return d;
279 /* NOTE: don't count branch/jump since we don't know yet if they will
280 * be eliminated later in resolve_jumps().. really should do that
281 * earlier so we don't have this constraint.
282 */
283 if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
284 d++;
285 }
286
287 /* if coming from a predecessor block, assume it is assigned far
288 * enough away.. we'll fix up later.
289 */
290 if (!pred)
291 return maxd;
292
293 if (pred && (block->data != block)) {
294 /* Search into predecessor blocks, finding the one with the
295 * shortest distance, since that will be the worst case
296 */
297 unsigned min = maxd - d;
298
299 /* (ab)use block->data to prevent recursion: */
300 block->data = block;
301
302 set_foreach(block->predecessors, entry) {
303 struct ir3_block *pred = (struct ir3_block *)entry->key;
304 unsigned n;
305
306 n = distance(pred, instr, min, pred);
307
308 min = MIN2(min, n);
309 }
310
311 block->data = NULL;
312 d += min;
313 }
314
315 return d;
316 }
317
318 /* calculate delay for specified src: */
319 static unsigned
320 delay_calc_srcn(struct ir3_block *block,
321 struct ir3_instruction *assigner,
322 struct ir3_instruction *consumer,
323 unsigned srcn, bool soft, bool pred)
324 {
325 unsigned delay = 0;
326
327 if (is_meta(assigner)) {
328 struct ir3_instruction *src;
329 foreach_ssa_src(src, assigner) {
330 unsigned d;
331 d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
332 delay = MAX2(delay, d);
333 }
334 } else {
335 if (soft) {
336 if (is_sfu(assigner)) {
337 delay = 4;
338 } else {
339 delay = ir3_delayslots(assigner, consumer, srcn);
340 }
341 } else {
342 delay = ir3_delayslots(assigner, consumer, srcn);
343 }
344 delay -= distance(block, assigner, delay, pred);
345 }
346
347 return delay;
348 }
349
350 /* calculate delay for instruction (maximum of delay for all srcs): */
351 static unsigned
352 delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
353 bool soft, bool pred)
354 {
355 unsigned delay = 0;
356 struct ir3_instruction *src;
357
358 foreach_ssa_src_n(src, i, instr) {
359 unsigned d;
360 d = delay_calc_srcn(block, src, instr, i, soft, pred);
361 delay = MAX2(delay, d);
362 }
363
364 return delay;
365 }
366
367 struct ir3_sched_notes {
368 /* there is at least one kill which could be scheduled, except
369 * for unscheduled bary.f's:
370 */
371 bool blocked_kill;
372 /* there is at least one instruction that could be scheduled,
373 * except for conflicting address/predicate register usage:
374 */
375 bool addr_conflict, pred_conflict;
376 };
377
378 /* could an instruction be scheduled if specified ssa src was scheduled? */
379 static bool
380 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
381 {
382 struct ir3_instruction *other_src;
383 foreach_ssa_src(other_src, instr) {
384 /* if dependency not scheduled, we aren't ready yet: */
385 if ((src != other_src) && !is_scheduled(other_src)) {
386 return false;
387 }
388 }
389 return true;
390 }
391
392 /* Check if instruction is ok to schedule. Make sure it is not blocked
393 * by use of addr/predicate register, etc.
394 */
395 static bool
396 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
397 struct ir3_instruction *instr)
398 {
399 debug_assert(!is_scheduled(instr));
400
401 /* For instructions that write address register we need to
402 * make sure there is at least one instruction that uses the
403 * addr value which is otherwise ready.
404 *
405 * TODO if any instructions use pred register and have other
406 * src args, we would need to do the same for writes_pred()..
407 */
408 if (writes_addr(instr)) {
409 struct ir3 *ir = instr->block->shader;
410 bool ready = false;
411 for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
412 struct ir3_instruction *indirect = ir->indirects[i];
413 if (!indirect)
414 continue;
415 if (indirect->address != instr)
416 continue;
417 ready = could_sched(indirect, instr);
418 }
419
420 /* nothing could be scheduled, so keep looking: */
421 if (!ready)
422 return false;
423 }
424
425 /* if this is a write to address/predicate register, and that
426 * register is currently in use, we need to defer until it is
427 * free:
428 */
429 if (writes_addr(instr) && ctx->addr) {
430 debug_assert(ctx->addr != instr);
431 notes->addr_conflict = true;
432 return false;
433 }
434
435 if (writes_pred(instr) && ctx->pred) {
436 debug_assert(ctx->pred != instr);
437 notes->pred_conflict = true;
438 return false;
439 }
440
441 /* if the instruction is a kill, we need to ensure *every*
442 * bary.f is scheduled. The hw seems unhappy if the thread
443 * gets killed before the end-input (ei) flag is hit.
444 *
445 * We could do this by adding each bary.f instruction as
446 * virtual ssa src for the kill instruction. But we have
447 * fixed length instr->regs[].
448 *
449 * TODO this wouldn't be quite right if we had multiple
450 * basic blocks, if any block was conditional. We'd need
451 * to schedule the bary.f's outside of any block which
452 * was conditional that contained a kill.. I think..
453 */
454 if (is_kill(instr)) {
455 struct ir3 *ir = instr->block->shader;
456
457 for (unsigned i = 0; i < ir->baryfs_count; i++) {
458 struct ir3_instruction *baryf = ir->baryfs[i];
459 if (baryf->flags & IR3_INSTR_UNUSED)
460 continue;
461 if (!is_scheduled(baryf)) {
462 notes->blocked_kill = true;
463 return false;
464 }
465 }
466 }
467
468 return true;
469 }
470
471 /* Find the best instruction to schedule from specified instruction or
 472 * recursively its ssa sources.
473 */
474 static struct ir3_instruction *
475 find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
476 struct ir3_instruction *instr)
477 {
478 struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
479 struct ir3_instruction *src;
480 unsigned nsrcs = 0;
481
482 if (is_scheduled(instr))
483 return NULL;
484
485 /* use instr->data to cache the results of recursing up the
486 * instr src's. Otherwise the recursive algo can scale quite
487 * badly w/ shader size. But this takes some care to clear
488 * the cache appropriately when instructions are scheduled.
489 */
490 if (instr->data) {
491 if (instr->data == NULL_INSTR)
492 return NULL;
493 return instr->data;
494 }
495
496 /* find unscheduled srcs: */
497 foreach_ssa_src(src, instr) {
498 if (!is_scheduled(src) && (src->block == instr->block)) {
499 debug_assert(nsrcs < ARRAY_SIZE(srcs));
500 srcs[nsrcs++] = src;
501 }
502 }
503
504 /* if all our src's are already scheduled: */
505 if (nsrcs == 0) {
506 if (check_instr(ctx, notes, instr)) {
507 instr->data = instr;
508 return instr;
509 }
510 return NULL;
511 }
512
513 while ((src = deepest(srcs, nsrcs))) {
514 struct ir3_instruction *candidate;
515
516 candidate = find_instr_recursive(ctx, notes, src);
517 if (!candidate)
518 continue;
519
520 if (check_instr(ctx, notes, candidate)) {
521 instr->data = candidate;
522 return candidate;
523 }
524 }
525
526 instr->data = NULL_INSTR;
527 return NULL;
528 }
529
530 /* find net change to live values if instruction were scheduled: */
531 static int
532 live_effect(struct ir3_instruction *instr)
533 {
534 struct ir3_instruction *src;
535 int new_live = dest_regs(instr);
536 int old_live = 0;
537
538 foreach_ssa_src_n(src, n, instr) {
539 if (__is_false_dep(instr, n))
540 continue;
541
542 if (instr->block != src->block)
543 continue;
544
545 /* for fanout/split, just pass things along to the real src: */
546 if (src->opc == OPC_META_FO)
547 src = ssa(src->regs[1]);
548
549 /* for fanin/collect, if this is the last use of *each* src,
550 * then it will decrease the live values, since RA treats
551 * them as a whole:
552 */
553 if (src->opc == OPC_META_FI) {
554 struct ir3_instruction *src2;
555 bool last_use = true;
556
557 foreach_ssa_src(src2, src) {
558 if (src2->use_count > 1) {
559 last_use = false;
560 break;
561 }
562 }
563
564 if (last_use)
565 old_live += dest_regs(src);
566
567 } else {
568 debug_assert(src->use_count > 0);
569
570 if (src->use_count == 1) {
571 old_live += dest_regs(src);
572 }
573 }
574 }
575
576 return new_live - old_live;
577 }
578
579 /* find instruction to schedule: */
580 static struct ir3_instruction *
581 find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
582 bool soft)
583 {
584 struct ir3_instruction *best_instr = NULL;
585 int best_rank = INT_MAX; /* lower is better */
586 unsigned deepest = 0;
587
588 /* TODO we'd really rather use the list/array of block outputs. But we
589 * don't have such a thing. Recursing *every* instruction in the list
590 * will result in a lot of repeated traversal, since instructions will
591 * get traversed both when they appear as ssa src to a later instruction
592 * as well as where they appear in the depth_list.
593 */
594 list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
595 struct ir3_instruction *candidate;
596
597 candidate = find_instr_recursive(ctx, notes, instr);
598 if (!candidate)
599 continue;
600
601 if (is_meta(candidate))
602 return candidate;
603
604 deepest = MAX2(deepest, candidate->depth);
605 }
606
607 /* traverse the list a second time.. but since we cache the result of
608 * find_instr_recursive() it isn't as bad as it looks.
609 */
610 list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
611 struct ir3_instruction *candidate;
612
613 candidate = find_instr_recursive(ctx, notes, instr);
614 if (!candidate)
615 continue;
616
617 /* determine net change to # of live values: */
618 int le = live_effect(candidate);
619
620 /* if there is a net increase in # of live values, then apply some
621 * threshold to avoid instructions getting scheduled *too* early
622 * and increasing register pressure.
623 */
624 if (le >= 1) {
625 unsigned threshold;
626
627 if (ctx->live_values > 4*4) {
628 threshold = 4;
629 } else {
630 threshold = 6;
631 }
632
633 /* Filter out any "shallow" instructions which would otherwise
634 * tend to get scheduled too early to fill delay slots even
635 * when they are not needed for a while. There will probably
636 * be later delay slots that they could just as easily fill.
637 *
638 * A classic case where this comes up is frag shaders that
639 * write a constant value (like 1.0f) to one of the channels
640 * of the output color(s). Since the mov from immed has no
641 * dependencies, it would otherwise get scheduled early to
642 * fill delay slots, occupying a register until the end of
643 * the program.
644 */
645 if ((deepest - candidate->depth) > threshold)
646 continue;
647 }
648
649 int rank = delay_calc(ctx->block, candidate, soft, false);
650
651 /* if too many live values, prioritize instructions that reduce the
652 * number of live values:
653 */
654 if (ctx->live_values > 16*4) {
655 rank = le;
656 } else if (ctx->live_values > 4*4) {
657 rank += le;
658 }
659
660 if (rank < best_rank) {
661 best_instr = candidate;
662 best_rank = rank;
663 }
664 }
665
666 return best_instr;
667 }
668
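/* Clone an already-scheduled instruction so that its remaining users can
 * be pointed at the clone instead (used below to "spill" a0.x/p0.x):
 */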
669 static struct ir3_instruction *
670 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
671 {
672 struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
673 ir3_insert_by_depth(new_instr, &ctx->depth_list);
674 transfer_use(ctx, orig_instr, new_instr);
675 return new_instr;
676 }
677
678 /* "spill" the address register by remapping any unscheduled
679 * instructions which depend on the current address register
680 * to a clone of the instruction which wrote the address reg.
681 */
682 static struct ir3_instruction *
683 split_addr(struct ir3_sched_ctx *ctx)
684 {
685 struct ir3 *ir;
686 struct ir3_instruction *new_addr = NULL;
687 unsigned i;
688
689 debug_assert(ctx->addr);
690
691 ir = ctx->addr->block->shader;
692
693 for (i = 0; i < ir->indirects_count; i++) {
694 struct ir3_instruction *indirect = ir->indirects[i];
695
696 if (!indirect)
697 continue;
698
699 /* skip instructions already scheduled: */
700 if (is_scheduled(indirect))
701 continue;
702
703 /* remap remaining instructions using current addr
704 * to new addr:
705 */
706 if (indirect->address == ctx->addr) {
707 if (!new_addr) {
708 new_addr = split_instr(ctx, ctx->addr);
709 /* original addr is scheduled, but new one isn't: */
710 new_addr->flags &= ~IR3_INSTR_MARK;
711 }
712 ir3_instr_set_address(indirect, new_addr);
713 }
714 }
715
716 /* all remaining indirects remapped to new addr: */
717 ctx->addr = NULL;
718
719 return new_addr;
720 }
721
722 /* "spill" the predicate register by remapping any unscheduled
723 * instructions which depend on the current predicate register
 724 * to a clone of the instruction which wrote the predicate reg.
725 */
726 static struct ir3_instruction *
727 split_pred(struct ir3_sched_ctx *ctx)
728 {
729 struct ir3 *ir;
730 struct ir3_instruction *new_pred = NULL;
731 unsigned i;
732
733 debug_assert(ctx->pred);
734
735 ir = ctx->pred->block->shader;
736
737 for (i = 0; i < ir->predicates_count; i++) {
738 struct ir3_instruction *predicated = ir->predicates[i];
739
740 /* skip instructions already scheduled: */
741 if (is_scheduled(predicated))
742 continue;
743
744 /* remap remaining instructions using current pred
745 * to new pred:
746 *
747 * TODO is there ever a case when pred isn't first
748 * (and only) src?
749 */
750 if (ssa(predicated->regs[1]) == ctx->pred) {
751 if (!new_pred) {
752 new_pred = split_instr(ctx, ctx->pred);
753 /* original pred is scheduled, but new one isn't: */
754 new_pred->flags &= ~IR3_INSTR_MARK;
755 }
756 predicated->regs[1]->instr = new_pred;
757 }
758 }
759
760 /* all remaining predicated remapped to new pred: */
761 ctx->pred = NULL;
762
763 return new_pred;
764 }
765
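/* Schedule a single block: meta:input instructions go first, then we
 * repeatedly pick the best eligible instruction (padding with nop's for
 * delay slots), splitting the a0.x/p0.x writer if we hit a conflict, and
 * finally emit the branch/jump to the successor block(s):
 */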
766 static void
767 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
768 {
769 struct list_head unscheduled_list;
770
771 ctx->block = block;
772
773 /* addr/pred writes are per-block: */
774 ctx->addr = NULL;
775 ctx->pred = NULL;
776
777 /* move all instructions to the unscheduled list, and
778 * empty the block's instruction list (to which we will
779 * be inserting).
780 */
781 list_replace(&block->instr_list, &unscheduled_list);
782 list_inithead(&block->instr_list);
783 list_inithead(&ctx->depth_list);
784
785 /* first a pre-pass to schedule all meta:input instructions
786 * (which need to appear first so that RA knows the register is
787 * occupied), and move remaining to depth sorted list:
788 */
789 list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
790 if (instr->opc == OPC_META_INPUT) {
791 schedule(ctx, instr);
792 } else {
793 ir3_insert_by_depth(instr, &ctx->depth_list);
794 }
795 }
796
797 while (!list_empty(&ctx->depth_list)) {
798 struct ir3_sched_notes notes = {0};
799 struct ir3_instruction *instr;
800
801 instr = find_eligible_instr(ctx, &notes, true);
802 if (!instr)
803 instr = find_eligible_instr(ctx, &notes, false);
804
805 if (instr) {
806 unsigned delay = delay_calc(ctx->block, instr, false, false);
807
808 /* and if we run out of instructions that can be scheduled,
809 * then it is time for nop's:
810 */
811 debug_assert(delay <= 6);
812 while (delay > 0) {
813 ir3_NOP(block);
814 delay--;
815 }
816
817 schedule(ctx, instr);
818 } else {
819 struct ir3_instruction *new_instr = NULL;
820
821 /* nothing available to schedule.. if we are blocked on
822 * address/predicate register conflict, then break the
823 * deadlock by cloning the instruction that wrote that
824 * reg:
825 */
826 if (notes.addr_conflict) {
827 new_instr = split_addr(ctx);
828 } else if (notes.pred_conflict) {
829 new_instr = split_pred(ctx);
830 } else {
831 debug_assert(0);
832 ctx->error = true;
833 return;
834 }
835
836 if (new_instr) {
837 /* clearing current addr/pred can change what is
838 * available to schedule, so clear cache..
839 */
840 clear_cache(ctx, NULL);
841
842 ir3_insert_by_depth(new_instr, &ctx->depth_list);
843 /* the original instr that wrote addr/pred may have
844 * originated from a different block:
845 */
846 new_instr->block = block;
847 }
848 }
849 }
850
851 /* And lastly, insert branch/jump instructions to take us to
852 * the next block. Later we'll strip back out the branches
853 * that simply jump to next instruction.
854 */
855 if (block->successors[1]) {
856 /* if/else, conditional branches to "then" or "else": */
857 struct ir3_instruction *br;
858 unsigned delay = 6;
859
860 debug_assert(ctx->pred);
861 debug_assert(block->condition);
862
863 delay -= distance(ctx->block, ctx->pred, delay, false);
864
865 while (delay > 0) {
866 ir3_NOP(block);
867 delay--;
868 }
869
870 /* create "else" branch first (since "then" block should
871 * frequently/always end up being a fall-thru):
872 */
873 br = ir3_BR(block);
874 br->cat0.inv = true;
875 br->cat0.target = block->successors[1];
876
877 /* NOTE: we have to hard code delay of 6 above, since
878 * we want to insert the nop's before constructing the
879 * branch. Throw in an assert so we notice if this
 880 * ever breaks on future generations:
881 */
882 debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
883
884 br = ir3_BR(block);
885 br->cat0.target = block->successors[0];
886
887 } else if (block->successors[0]) {
888 /* otherwise unconditional jump to next block: */
889 struct ir3_instruction *jmp;
890
891 jmp = ir3_JUMP(block);
892 jmp->cat0.target = block->successors[0];
893 }
894
895 /* NOTE: if we kept track of the predecessors, we could do a better
 896 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
897 * Note that as we eliminate blocks which contain only an unconditional
898 * jump we probably need to propagate (jp) flag..
899 */
900 }
901
 902 /* After scheduling individual blocks, we still could have cases where,
 903 * on one (or more) paths into a block, a value produced by a previous
 904 * block has too few delay slots to be legal. We can't deal with this in
 905 * the first pass because of loops (ie. we can't ensure all predecessor
 906 * blocks are already scheduled in the first pass). All we can really do at
907 * this point is stuff in extra nop's until things are legal.
908 */
909 static void
910 sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
911 {
912 unsigned n = 0;
913
914 ctx->block = block;
915
916 list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
917 unsigned delay = 0;
918
919 set_foreach(block->predecessors, entry) {
920 struct ir3_block *pred = (struct ir3_block *)entry->key;
921 unsigned d = delay_calc(pred, instr, false, true);
922 delay = MAX2(d, delay);
923 }
924
925 while (delay > n) {
926 struct ir3_instruction *nop = ir3_NOP(block);
927
928 /* move to before instr: */
929 list_delinit(&nop->node);
930 list_addtail(&nop->node, &instr->node);
931
932 n++;
933 }
934
935 /* we can bail once we hit worst case delay: */
936 if (++n > 6)
937 break;
938 }
939 }
940
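/* Main entry point: schedule each block individually, then make a second
 * pass (sched_intra_block) to pad out delay slots for values coming from
 * predecessor blocks:
 */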
941 int ir3_sched(struct ir3 *ir)
942 {
943 struct ir3_sched_ctx ctx = {0};
944
945 ir3_clear_mark(ir);
946 update_use_count(ir);
947
948 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
949 ctx.live_values = 0;
950 sched_block(&ctx, block);
951 }
952
953 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
954 sched_intra_block(&ctx, block);
955 }
956
957 if (ctx.error)
958 return -1;
959
960 return 0;
961 }
962
963 static unsigned
964 get_array_id(struct ir3_instruction *instr)
965 {
966 /* The expectation is that there is only a single array
 967 * src or dst; ir3_cp should enforce this.
968 */
969
970 for (unsigned i = 0; i < instr->regs_count; i++)
971 if (instr->regs[i]->flags & IR3_REG_ARRAY)
972 return instr->regs[i]->array.id;
973
974 unreachable("this was unexpected");
975 }
976
977 /* does instruction 'prior' need to be scheduled before 'instr'? */
978 static bool
979 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
980 {
981 /* TODO for dependencies that are related to a specific object, ie
982 * a specific SSBO/image/array, we could relax this constraint to
983 * make accesses to unrelated objects not depend on each other (at
984 * least as long as not declared coherent)
985 */
986 if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
987 ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
988 return true;
989
990 if (instr->barrier_class & prior->barrier_conflict) {
991 if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
992 /* if only array barrier, then we can further limit false-deps
993 * by considering the array-id, ie reads/writes to different
994 * arrays do not depend on each other (no aliasing)
995 */
996 if (get_array_id(instr) != get_array_id(prior)) {
997 return false;
998 }
999 }
1000
1001 return true;
1002 }
1003
1004 return false;
1005 }
1006
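/* Add false-deps between this instruction and the non-meta instructions
 * before/after it that it must stay ordered with. Scanning in each
 * direction stops at the first instruction with an identical barrier
 * class, since that one transitively orders anything beyond it:
 */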
1007 static void
1008 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
1009 {
1010 struct list_head *prev = instr->node.prev;
1011 struct list_head *next = instr->node.next;
1012
1013 /* add dependencies on previous instructions that must be scheduled
1014 * prior to the current instruction
1015 */
1016 while (prev != &block->instr_list) {
1017 struct ir3_instruction *pi =
1018 LIST_ENTRY(struct ir3_instruction, prev, node);
1019
1020 prev = prev->prev;
1021
1022 if (is_meta(pi))
1023 continue;
1024
1025 if (instr->barrier_class == pi->barrier_class) {
1026 ir3_instr_add_dep(instr, pi);
1027 break;
1028 }
1029
1030 if (depends_on(instr, pi))
1031 ir3_instr_add_dep(instr, pi);
1032 }
1033
1034 /* add dependencies on this instruction to following instructions
1035 * that must be scheduled after the current instruction:
1036 */
1037 while (next != &block->instr_list) {
1038 struct ir3_instruction *ni =
1039 LIST_ENTRY(struct ir3_instruction, next, node);
1040
1041 next = next->next;
1042
1043 if (is_meta(ni))
1044 continue;
1045
1046 if (instr->barrier_class == ni->barrier_class) {
1047 ir3_instr_add_dep(ni, instr);
1048 break;
1049 }
1050
1051 if (depends_on(ni, instr))
1052 ir3_instr_add_dep(ni, instr);
1053 }
1054 }
1055
1056 /* before scheduling a block, we need to add any necessary false-dependencies
1057 * to ensure that:
1058 *
1059 * (1) barriers are scheduled in the right order wrt instructions related
1060 * to the barrier
1061 *
1062 * (2) reads that come before a write actually get scheduled before the
1063 * write
1064 */
1065 static void
1066 calculate_deps(struct ir3_block *block)
1067 {
1068 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
1069 if (instr->barrier_class) {
1070 add_barrier_deps(block, instr);
1071 }
1072 }
1073 }
1074
1075 void
1076 ir3_sched_add_deps(struct ir3 *ir)
1077 {
1078 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
1079 calculate_deps(block);
1080 }
1081 }