/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth based scheduling algo. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
	bool error;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

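/* helper for schedule() below, which inserts a nop between two
 * back-to-back sfu/mem instructions:
 */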
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

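/* when an instruction is scheduled, decrement the use-count of each of its
 * ssa srcs; once a src's use-count drops to zero it is no longer live, so
 * reduce the live-value estimate accordingly.  False deps and cross-block
 * srcs are skipped, and fanin/fanout are looked through to their real srcs:
 */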
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}

static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void use_instr(struct ir3_instruction *instr);

/* transfers a use-count to new instruction, for cases where we
 * "spill" address or predicate. Note this might cause the
 * previous instruction that loaded a0.x/p0.x to become live
 * again, when we previously thought it was dead.
 */
static void
transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
		struct ir3_instruction *new_instr)
{
	struct ir3_instruction *src;

	debug_assert(is_scheduled(orig_instr));

	foreach_ssa_src_n(src, n, new_instr) {
		if (__is_false_dep(new_instr, n))
			continue;
		ctx->live_values += dest_regs(src);
		use_instr(src);
	}

	clear_cache(ctx, orig_instr);
}

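/* increment the use-count of each ssa src of an instruction; fanin/fanout
 * meta instructions don't count as real uses, so use_instr() recurses
 * through them to the instructions that actually produce the values:
 */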
static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}

static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}

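/* update the live-value estimate when an instruction is scheduled: its
 * dst(s) become live, and srcs for which this was the last use stop
 * being live.  Fanin/fanout meta instructions are skipped:
 */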
static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}

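/* initialize use-counts for the whole shader before scheduling starts;
 * shader outputs count as uses as well:
 */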
static void
update_use_count(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			instr->use_count = 0;
		}
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *out = ir->outputs[i];

		if (!out)
			continue;

		use_instr(out);
	}
}

#define NULL_INSTR ((void *)~0)

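/* flush cached find_instr_recursive() results (stored in instr->data);
 * NULL_INSTR is the cached "nothing schedulable below this instruction"
 * marker.  Passing instr==NULL clears every entry, otherwise only entries
 * that resolved to 'instr' (plus the NULL_INSTR markers) are cleared:
 */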
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

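/* remove the instruction from the depth list and append it to the block's
 * instruction list, updating addr/pred tracking, the live-value estimate,
 * and the candidate cache:
 */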
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

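/* return (and remove from the array) the unscheduled src with the greatest
 * depth, or NULL once all entries have been consumed:
 */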
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions). For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd: max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred: if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		set_foreach(block->predecessors, entry) {
			struct ir3_block *pred = (struct ir3_block *)entry->key;
			unsigned n;

			n = distance(pred, instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule. Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	debug_assert(!is_scheduled(instr));

	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled. The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction. But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional. We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's. Otherwise the recursive algo can scale quite
	 * badly w/ shader size. But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find net change to live values if instruction were scheduled: */
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int new_live = dest_regs(instr);
	int old_live = 0;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		/* for fanout/split, just pass things along to the real src: */
		if (src->opc == OPC_META_FO)
			src = ssa(src->regs[1]);

		/* for fanin/collect, if this is the last use of *each* src,
		 * then it will decrease the live values, since RA treats
		 * them as a whole:
		 */
		if (src->opc == OPC_META_FI) {
			struct ir3_instruction *src2;
			bool last_use = true;

			foreach_ssa_src(src2, src) {
				if (src2->use_count > 1) {
					last_use = false;
					break;
				}
			}

			if (last_use)
				old_live += dest_regs(src);

		} else {
			debug_assert(src->use_count > 0);

			if (src->use_count == 1) {
				old_live += dest_regs(src);
			}
		}
	}

	return new_live - old_live;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	int best_rank = INT_MAX; /* lower is better */
	unsigned deepest = 0;

	/* TODO we'd really rather use the list/array of block outputs. But we
	 * don't have such a thing. Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (is_meta(candidate))
			return candidate;

		deepest = MAX2(deepest, candidate->depth);
	}

	/* traverse the list a second time.. but since we cache the result of
	 * find_instr_recursive() it isn't as bad as it looks.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		/* determine net change to # of live values: */
		int le = live_effect(candidate);

		/* if there is a net increase in # of live values, then apply some
		 * threshold to avoid instructions getting scheduled *too* early
		 * and increasing register pressure.
		 */
		if (le >= 1) {
			unsigned threshold;

			if (ctx->live_values > 4*4) {
				threshold = 4;
			} else {
				threshold = 6;
			}

			/* Filter out any "shallow" instructions which would otherwise
			 * tend to get scheduled too early to fill delay slots even
			 * when they are not needed for a while. There will probably
			 * be later delay slots that they could just as easily fill.
			 *
			 * A classic case where this comes up is frag shaders that
			 * write a constant value (like 1.0f) to one of the channels
			 * of the output color(s). Since the mov from immed has no
			 * dependencies, it would otherwise get scheduled early to
			 * fill delay slots, occupying a register until the end of
			 * the program.
			 */
			if ((deepest - candidate->depth) > threshold)
				continue;
		}

		int rank = delay_calc(ctx->block, candidate, soft, false);

		/* if too many live values, prioritize instructions that reduce the
		 * number of live values:
		 */
		if (ctx->live_values > 16*4) {
			rank = le;
		} else if (ctx->live_values > 4*4) {
			rank += le;
		}

		if (rank < best_rank) {
			best_instr = candidate;
			best_rank = rank;
		}
	}

	return best_instr;
}

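/* clone the (already scheduled) instruction that wrote a0.x/p0.x, add the
 * clone to the depth list, and transfer the remaining use-counts to it:
 */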
static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
	ir3_insert_by_depth(new_instr, &ctx->depth_list);
	transfer_use(ctx, orig_instr, new_instr);
	return new_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = split_instr(ctx, ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			indirect->address = NULL;
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = split_instr(ctx, ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

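/* main per-block scheduling loop: schedule meta:input/tex-prefetch
 * instructions first, then repeatedly pick an eligible instruction (first
 * trying with "soft" delay ranking, then without), inserting nop's for any
 * remaining delay, and finally emit the branch/jump to the next block:
 */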
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if ((instr->opc == OPC_META_INPUT) ||
				(instr->opc == OPC_META_TEX_PREFETCH)) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_is_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block. Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch. Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* After scheduling individual blocks, we still could have cases where,
 * on one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal. We can't deal with this in
 * the first pass, because of loops (ie. we can't ensure all predecessor
 * blocks are already scheduled in the first pass). All we can really do
 * at this point is stuff in extra nop's until things are legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		unsigned delay = 0;

		set_foreach(block->predecessors, entry) {
			struct ir3_block *pred = (struct ir3_block *)entry->key;
			unsigned d = delay_calc(pred, instr, false, true);
			delay = MAX2(d, delay);
		}

		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}

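/* entry point: schedule each block independently, then run a second pass
 * that fixes up cross-block delay slots.  Returns -1 on failure:
 */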
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ctx.live_values = 0;
		sched_block(&ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_intra_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}

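/* walk backwards and forwards from 'instr', adding false dependencies to
 * any non-meta instruction it conflicts with, stopping in each direction
 * once an instruction with the same barrier_class is found:
 */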
static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		calculate_deps(block);
	}
}