freedreno/ir3: convert block->predecessors to set
/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth based scheduling algo. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
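/*
 * The pass runs in two phases: sched_block() schedules each block on its
 * own (inserting nop's to satisfy delay slots within the block), and
 * sched_intra_block() then adds any extra nop's needed for values produced
 * in predecessor blocks. ir3_sched_add_deps() adds the false dependencies
 * that must be in place before scheduling (barrier ordering, and reads
 * ordered before later writes).
 */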

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove */
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
	bool error;
};

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

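/* Live value bookkeeping: instr->use_count counts the unscheduled uses of
 * each value (see update_use_count()). When an instruction is scheduled,
 * update_live_values() adds its destination register count to
 * ctx->live_values and, via unuse_each_src(), subtracts the register count
 * of any source whose last use this was. The fanin/fanout meta instructions
 * (OPC_META_FI/FO) are skipped and their sources handled directly, since RA
 * treats the collected values as a whole.
 */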
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}

static void use_instr(struct ir3_instruction *instr);

static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}

static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}

static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}

static void
update_use_count(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			instr->use_count = 0;
		}
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *out = ir->outputs[i];

		if (!out)
			continue;

		use_instr(out);
	}
}

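/* Sentinel stored in instr->data by find_instr_recursive() to remember that
 * no eligible instruction is reachable through this instruction, as opposed
 * to NULL which means "not visited yet". clear_cache() resets these entries
 * when they may have become stale.
 */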
#define NULL_INSTR ((void *)~0)

static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions). For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		set_foreach(block->predecessors, entry) {
			struct ir3_block *pred = (struct ir3_block *)entry->key;
			unsigned n;

			n = distance(pred, instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}

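/* Two flavours of delay are computed below: with soft==true an SFU producer
 * is charged a delay of 4 cycles, while with soft==false only the hard
 * delay-slot requirements from ir3_delayslots() are counted. sched_block()
 * uses the soft variant first when picking the next instruction, and the
 * hard variant when deciding how many nop's actually need to be inserted.
 */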
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule. Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled. The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction. But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional. We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's. Otherwise the recursive algo can scale quite
	 * badly w/ shader size. But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find net change to live values if instruction were scheduled: */
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int new_live = dest_regs(instr);
	int old_live = 0;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		/* for fanout/split, just pass things along to the real src: */
		if (src->opc == OPC_META_FO)
			src = ssa(src->regs[1]);

		/* for fanin/collect, if this is the last use of *each* src,
		 * then it will decrease the live values, since RA treats
		 * them as a whole:
		 */
		if (src->opc == OPC_META_FI) {
			struct ir3_instruction *src2;
			bool last_use = true;

			foreach_ssa_src(src2, src) {
				if (src2->use_count > 1) {
					last_use = false;
					break;
				}
			}

			if (last_use)
				old_live += dest_regs(src);

		} else {
			debug_assert(src->use_count > 0);

			if (src->use_count == 1) {
				old_live += dest_regs(src);
			}
		}
	}

	return new_live - old_live;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	int best_rank = INT_MAX;      /* lower is better */
	unsigned deepest = 0;

	/* TODO we'd really rather use the list/array of block outputs. But we
	 * don't have such a thing. Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (is_meta(candidate))
			return candidate;

		deepest = MAX2(deepest, candidate->depth);
	}

	/* traverse the list a second time.. but since we cache the result of
	 * find_instr_recursive() it isn't as bad as it looks.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		/* determine net change to # of live values: */
		int le = live_effect(candidate);

		/* if there is a net increase in # of live values, then apply some
		 * threshold to avoid instructions getting scheduled *too* early
		 * and increasing register pressure.
		 */
		if (le >= 1) {
			unsigned threshold;

			if (ctx->live_values > 4*4) {
				threshold = 4;
			} else {
				threshold = 6;
			}

			/* Filter out any "shallow" instructions which would otherwise
			 * tend to get scheduled too early to fill delay slots even
			 * when they are not needed for a while. There will probably
			 * be later delay slots that they could just as easily fill.
			 *
			 * A classic case where this comes up is frag shaders that
			 * write a constant value (like 1.0f) to one of the channels
			 * of the output color(s). Since the mov from immed has no
			 * dependencies, it would otherwise get scheduled early to
			 * fill delay slots, occupying a register until the end of
			 * the program.
			 */
			if ((deepest - candidate->depth) > threshold)
				continue;
		}

		int rank = delay_calc(ctx->block, candidate, soft, false);

		/* if too many live values, prioritize instructions that reduce the
		 * number of live values:
		 */
		if (ctx->live_values > 16*4) {
			rank = le;
		} else if (ctx->live_values > 4*4) {
			rank += le;
		}

		if (rank < best_rank) {
			best_instr = candidate;
			best_rank = rank;
		}
	}

	return best_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (instr->opc == OPC_META_INPUT) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block. Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch. Throw in an assert so we notice if this
		 * ever breaks on future generations:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* After scheduling individual blocks, we still could have cases where,
 * on one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal. We can't deal with this in
 * the first pass, because of loops (ie. we can't ensure all predecessor
 * blocks are already scheduled in the first pass). All we can really do
 * at this point is stuff in extra nop's until things are legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		unsigned delay = 0;

		set_foreach(block->predecessors, entry) {
			struct ir3_block *pred = (struct ir3_block *)entry->key;
			unsigned d = delay_calc(pred, instr, false, true);
			delay = MAX2(d, delay);
		}

		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}

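/* Entry point: schedules each block independently, then makes a second
 * pass over each block (sched_intra_block()) to add nop's where a value
 * produced in a predecessor block is still too close. Returns -1 if
 * scheduling got stuck (ctx.error), 0 otherwise.
 */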
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ctx.live_values = 0;
		sched_block(&ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_intra_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

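/* Add the false dependencies (see calculate_deps()) for every block; this
 * needs to happen before the blocks are scheduled.
 */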
void
ir3_sched_add_deps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		calculate_deps(block);
	}
}