freedreno/ir3: fix gpu hang with pre-fs-tex-fetch
[mesa.git] src/freedreno/ir3/ir3_sched.c
1 /*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27
28 #include "util/u_math.h"
29
30 #include "ir3.h"
31
32 /*
33 * Instruction Scheduling:
34 *
35 * A recursive depth-based scheduling algo. Recursively find an eligible
36 * instruction to schedule from the deepest instruction (recursing through
37 * its unscheduled src instructions). Normally this would result in a
38 * lot of re-traversal of the same instructions, so we cache results in
39 * instr->data (and clear cached results that would be no longer valid
40 * after scheduling an instruction).
41 *
42 * There are a few special cases that need to be handled, since sched
43 * is currently independent of register allocation. Uses of the address
44 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
45 * if you have two pairs of instructions that write the same special
46 * register and then read it, then those pairs cannot be interleaved.
47 * To solve this, when we are in such a scheduling "critical section",
48 * and we encounter a conflicting write to a special register, we try
49 * to schedule any remaining instructions that use that value first.
50 */
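/* For example (illustrative only, not actual ir3 syntax):
 *
 *    A1: write a0.x        B1: write a0.x
 *    A2: use   a0.x        B2: use   a0.x
 *
 * B1 must not land between A1 and A2, otherwise A2 would consume B1's
 * address value. So once A1 is scheduled, we prefer to schedule A2 (and
 * any other remaining users of A1's value) before allowing B1.
 */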
51
52 struct ir3_sched_ctx {
53 struct ir3_block *block; /* the current block */
54 struct list_head depth_list; /* depth sorted unscheduled instrs */
55 struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
56 struct ir3_instruction *addr; /* current a0.x user, if any */
57 struct ir3_instruction *pred; /* current p0.x user, if any */
58 int live_values; /* estimate of current live values */
59 bool error;
60 };
61
62 static bool is_scheduled(struct ir3_instruction *instr)
63 {
64 return !!(instr->flags & IR3_INSTR_MARK);
65 }
66
67 static bool is_sfu_or_mem(struct ir3_instruction *instr)
68 {
69 return is_sfu(instr) || is_mem(instr);
70 }
71
72 static void
73 unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
74 {
75 struct ir3_instruction *src;
76
77 foreach_ssa_src_n(src, n, instr) {
78 if (__is_false_dep(instr, n))
79 continue;
80 if (instr->block != src->block)
81 continue;
82 if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
83 unuse_each_src(ctx, src);
84 } else {
85 debug_assert(src->use_count > 0);
86
87 if (--src->use_count == 0) {
88 ctx->live_values -= dest_regs(src);
89 debug_assert(ctx->live_values >= 0);
90 }
91 }
92 }
93 }
94
95 static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
96 static void use_instr(struct ir3_instruction *instr);
97
98 /* transfers a use-count to new instruction, for cases where we
99 * "spill" address or predicate. Note this might cause the
100 * previous instruction that loaded a0.x/p0.x to become live
101 * again, when we previously thought it was dead.
102 */
103 static void
104 transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
105 struct ir3_instruction *new_instr)
106 {
107 struct ir3_instruction *src;
108
109 debug_assert(is_scheduled(orig_instr));
110
111 foreach_ssa_src_n(src, n, new_instr) {
112 if (__is_false_dep(new_instr, n))
113 continue;
114 ctx->live_values += dest_regs(src);
115 use_instr(src);
116 }
117
118 clear_cache(ctx, orig_instr);
119 }
120
121 static void
122 use_each_src(struct ir3_instruction *instr)
123 {
124 struct ir3_instruction *src;
125
126 foreach_ssa_src_n(src, n, instr) {
127 if (__is_false_dep(instr, n))
128 continue;
129 use_instr(src);
130 }
131 }
132
133 static void
134 use_instr(struct ir3_instruction *instr)
135 {
136 if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
137 use_each_src(instr);
138 } else {
139 instr->use_count++;
140 }
141 }
142
143 static void
144 update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
145 {
146 if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
147 return;
148
149 ctx->live_values += dest_regs(instr);
150 unuse_each_src(ctx, instr);
151 }
152
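/* (Re)compute use counts for the whole shader. A use of a collect/split
 * meta instruction is attributed to that meta instruction's srcs rather
 * than to the meta instruction itself, and shader outputs count as uses.
 */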
153 static void
154 update_use_count(struct ir3 *ir)
155 {
156 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
157 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
158 instr->use_count = 0;
159 }
160 }
161
162 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
163 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
164 if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
165 continue;
166
167 use_each_src(instr);
168 }
169 }
170
171 /* Shader outputs are also used:
172 */
173 struct ir3_instruction *out;
174 foreach_output(out, ir)
175 use_instr(out);
176 }
177
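/* Sentinel for the instr->data scheduling cache: NULL means "not yet
 * computed", NULL_INSTR means "nothing schedulable was found below this
 * instruction", anything else is the cached candidate returned by
 * find_instr_recursive().
 */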
178 #define NULL_INSTR ((void *)~0)
179
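/* Invalidate cached candidates: entries that point at 'instr', entries
 * that cached a negative (NULL_INSTR) result, or (if 'instr' is NULL)
 * every cached entry.
 */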
180 static void
181 clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
182 {
183 list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
184 if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
185 instr2->data = NULL;
186 }
187 }
188
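/* Commit 'instr': remove it from the depth list, append it to the
 * block's instruction list, record any a0.x/p0.x write, update the
 * live-value estimate, and invalidate cache entries that may have
 * become stale.
 */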
189 static void
190 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
191 {
192 debug_assert(ctx->block == instr->block);
193
194 /* maybe there is a better way to handle this than just stuffing
195 * a nop.. ideally we'd know about this constraint in the
196 * scheduling and depth calculation..
197 */
198 if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
199 ir3_NOP(ctx->block);
200
201 /* remove from depth list:
202 */
203 list_delinit(&instr->node);
204
205 if (writes_addr(instr)) {
206 debug_assert(ctx->addr == NULL);
207 ctx->addr = instr;
208 }
209
210 if (writes_pred(instr)) {
211 debug_assert(ctx->pred == NULL);
212 ctx->pred = instr;
213 }
214
215 instr->flags |= IR3_INSTR_MARK;
216
217 list_addtail(&instr->node, &instr->block->instr_list);
218 ctx->scheduled = instr;
219
220 update_live_values(ctx, instr);
221
222 if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
223 clear_cache(ctx, NULL);
224 } else {
225 /* invalidate only the necessary entries.. */
226 clear_cache(ctx, instr);
227 }
228 }
229
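/* Return the unvisited src with the greatest depth and remove it from
 * the array, so that repeated calls walk the srcs deepest-first
 * (returns NULL once all srcs have been consumed).
 */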
230 static struct ir3_instruction *
231 deepest(struct ir3_instruction **srcs, unsigned nsrcs)
232 {
233 struct ir3_instruction *d = NULL;
234 unsigned i = 0, id = 0;
235
236 while ((i < nsrcs) && !(d = srcs[id = i]))
237 i++;
238
239 if (!d)
240 return NULL;
241
242 for (; i < nsrcs; i++)
243 if (srcs[i] && (srcs[i]->depth > d->depth))
244 d = srcs[id = i];
245
246 srcs[id] = NULL;
247
248 return d;
249 }
250
251 /**
252 * @block: the block to search in, starting from end; in first pass,
253 * this will be the block the instruction would be inserted into
254 * (but has not yet been inserted, ie. it only contains already scheduled
255 * instructions). For intra-block scheduling (second pass), this
256 * would be one of the predecessor blocks.
257 * @instr: the instruction to search for
258 * @maxd: max distance, bail after searching this # of instruction
259 * slots, since it means the instruction we are looking for is
260 * far enough away
261 * @pred: if true, recursively search into predecessor blocks to
262 * find the worst case (shortest) distance (only possible after
263 * individual blocks are all scheduled)
264 */
265 static unsigned
266 distance(struct ir3_block *block, struct ir3_instruction *instr,
267 unsigned maxd, bool pred)
268 {
269 unsigned d = 0;
270
271 list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
272 if ((n == instr) || (d >= maxd))
273 return d;
274 /* NOTE: don't count branch/jump since we don't know yet if they will
275 * be eliminated later in resolve_jumps().. really should do that
276 * earlier so we don't have this constraint.
277 */
278 if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
279 d++;
280 }
281
282 /* if coming from a predecessor block, assume it is assigned far
283 * enough away.. we'll fix up later.
284 */
285 if (!pred)
286 return maxd;
287
288 if (pred && (block->data != block)) {
289 /* Search into predecessor blocks, finding the one with the
290 * shortest distance, since that will be the worst case
291 */
292 unsigned min = maxd - d;
293
294 /* (ab)use block->data to prevent recursion: */
295 block->data = block;
296
297 set_foreach(block->predecessors, entry) {
298 struct ir3_block *pred = (struct ir3_block *)entry->key;
299 unsigned n;
300
301 n = distance(pred, instr, min, pred);
302
303 min = MIN2(min, n);
304 }
305
306 block->data = NULL;
307 d += min;
308 }
309
310 return d;
311 }
312
313 /* calculate delay for specified src: */
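/* (When 'soft', an sfu producer is charged a fixed heuristic cost of 4
 * slots rather than the ir3_delayslots() value; the soft variant is used
 * by find_eligible_instr() when ranking candidates.)
 */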
314 static unsigned
315 delay_calc_srcn(struct ir3_block *block,
316 struct ir3_instruction *assigner,
317 struct ir3_instruction *consumer,
318 unsigned srcn, bool soft, bool pred)
319 {
320 unsigned delay = 0;
321
322 if (is_meta(assigner)) {
323 struct ir3_instruction *src;
324 foreach_ssa_src(src, assigner) {
325 unsigned d;
326 d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
327 delay = MAX2(delay, d);
328 }
329 } else {
330 if (soft) {
331 if (is_sfu(assigner)) {
332 delay = 4;
333 } else {
334 delay = ir3_delayslots(assigner, consumer, srcn);
335 }
336 } else {
337 delay = ir3_delayslots(assigner, consumer, srcn);
338 }
339 delay -= distance(block, assigner, delay, pred);
340 }
341
342 return delay;
343 }
344
345 /* calculate delay for instruction (maximum of delay for all srcs): */
346 static unsigned
347 delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
348 bool soft, bool pred)
349 {
350 unsigned delay = 0;
351 struct ir3_instruction *src;
352
353 foreach_ssa_src_n(src, i, instr) {
354 unsigned d;
355 d = delay_calc_srcn(block, src, instr, i, soft, pred);
356 delay = MAX2(delay, d);
357 }
358
359 return delay;
360 }
361
362 struct ir3_sched_notes {
363 /* there is at least one kill which could be scheduled, except
364 * for unscheduled bary.f's:
365 */
366 bool blocked_kill;
367 /* there is at least one instruction that could be scheduled,
368 * except for conflicting address/predicate register usage:
369 */
370 bool addr_conflict, pred_conflict;
371 };
372
373 /* could an instruction be scheduled if specified ssa src was scheduled? */
374 static bool
375 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
376 {
377 struct ir3_instruction *other_src;
378 foreach_ssa_src(other_src, instr) {
379 /* if dependency not scheduled, we aren't ready yet: */
380 if ((src != other_src) && !is_scheduled(other_src)) {
381 return false;
382 }
383 }
384 return true;
385 }
386
387 /* Check if instruction is ok to schedule. Make sure it is not blocked
388 * by use of addr/predicate register, etc.
389 */
390 static bool
391 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
392 struct ir3_instruction *instr)
393 {
394 debug_assert(!is_scheduled(instr));
395
396 /* For instructions that write address register we need to
397 * make sure there is at least one instruction that uses the
398 * addr value which is otherwise ready.
399 *
400 * TODO if any instructions use pred register and have other
401 * src args, we would need to do the same for writes_pred()..
402 */
403 if (writes_addr(instr)) {
404 struct ir3 *ir = instr->block->shader;
405 bool ready = false;
406 for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
407 struct ir3_instruction *indirect = ir->indirects[i];
408 if (!indirect)
409 continue;
410 if (indirect->address != instr)
411 continue;
412 ready = could_sched(indirect, instr);
413 }
414
415 /* nothing could be scheduled, so keep looking: */
416 if (!ready)
417 return false;
418 }
419
420 /* if this is a write to address/predicate register, and that
421 * register is currently in use, we need to defer until it is
422 * free:
423 */
424 if (writes_addr(instr) && ctx->addr) {
425 debug_assert(ctx->addr != instr);
426 notes->addr_conflict = true;
427 return false;
428 }
429
430 if (writes_pred(instr) && ctx->pred) {
431 debug_assert(ctx->pred != instr);
432 notes->pred_conflict = true;
433 return false;
434 }
435
436 /* if the instruction is a kill, we need to ensure *every*
437 * bary.f is scheduled. The hw seems unhappy if the thread
438 * gets killed before the end-input (ei) flag is hit.
439 *
440 * We could do this by adding each bary.f instruction as
441 * virtual ssa src for the kill instruction. But we have
442 * fixed length instr->regs[].
443 *
444 * TODO this wouldn't be quite right if we had multiple
445 * basic blocks, if any block was conditional. We'd need
446 * to schedule the bary.f's outside of any block which
447 * was conditional that contained a kill.. I think..
448 */
449 if (is_kill(instr)) {
450 struct ir3 *ir = instr->block->shader;
451
452 for (unsigned i = 0; i < ir->baryfs_count; i++) {
453 struct ir3_instruction *baryf = ir->baryfs[i];
454 if (baryf->flags & IR3_INSTR_UNUSED)
455 continue;
456 if (!is_scheduled(baryf)) {
457 notes->blocked_kill = true;
458 return false;
459 }
460 }
461 }
462
463 return true;
464 }
465
466 /* Find the best instruction to schedule from specified instruction or
467 * recursively its ssa sources.
468 */
469 static struct ir3_instruction *
470 find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
471 struct ir3_instruction *instr)
472 {
473 struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
474 struct ir3_instruction *src;
475 unsigned nsrcs = 0;
476
477 if (is_scheduled(instr))
478 return NULL;
479
480 /* use instr->data to cache the results of recursing up the
481 * instr src's. Otherwise the recursive algo can scale quite
482 * badly w/ shader size. But this takes some care to clear
483 * the cache appropriately when instructions are scheduled.
484 */
485 if (instr->data) {
486 if (instr->data == NULL_INSTR)
487 return NULL;
488 return instr->data;
489 }
490
491 /* find unscheduled srcs: */
492 foreach_ssa_src(src, instr) {
493 if (!is_scheduled(src) && (src->block == instr->block)) {
494 debug_assert(nsrcs < ARRAY_SIZE(srcs));
495 srcs[nsrcs++] = src;
496 }
497 }
498
499 /* if all our src's are already scheduled: */
500 if (nsrcs == 0) {
501 if (check_instr(ctx, notes, instr)) {
502 instr->data = instr;
503 return instr;
504 }
505 return NULL;
506 }
507
508 while ((src = deepest(srcs, nsrcs))) {
509 struct ir3_instruction *candidate;
510
511 candidate = find_instr_recursive(ctx, notes, src);
512 if (!candidate)
513 continue;
514
515 if (check_instr(ctx, notes, candidate)) {
516 instr->data = candidate;
517 return candidate;
518 }
519 }
520
521 instr->data = NULL_INSTR;
522 return NULL;
523 }
524
525 /* find net change to live values if instruction were scheduled: */
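/* (ie. the dst registers the instruction would make live, minus the
 * registers of any srcs for which this would be the last use; split and
 * collect srcs are handled specially since RA treats a collect'd group
 * as a whole)
 */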
526 static int
527 live_effect(struct ir3_instruction *instr)
528 {
529 struct ir3_instruction *src;
530 int new_live = dest_regs(instr);
531 int old_live = 0;
532
533 foreach_ssa_src_n(src, n, instr) {
534 if (__is_false_dep(instr, n))
535 continue;
536
537 if (instr->block != src->block)
538 continue;
539
540 /* for split, just pass things along to the real src: */
541 if (src->opc == OPC_META_SPLIT)
542 src = ssa(src->regs[1]);
543
544 /* for collect, if this is the last use of *each* src,
545 * then it will decrease the live values, since RA treats
546 * them as a whole:
547 */
548 if (src->opc == OPC_META_COLLECT) {
549 struct ir3_instruction *src2;
550 bool last_use = true;
551
552 foreach_ssa_src(src2, src) {
553 if (src2->use_count > 1) {
554 last_use = false;
555 break;
556 }
557 }
558
559 if (last_use)
560 old_live += dest_regs(src);
561
562 } else {
563 debug_assert(src->use_count > 0);
564
565 if (src->use_count == 1) {
566 old_live += dest_regs(src);
567 }
568 }
569 }
570
571 return new_live - old_live;
572 }
573
574 /* find instruction to schedule: */
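/* (two passes over the depth list: the first returns immediately if a
 * meta instruction is ready and otherwise records the max depth among
 * the candidates; the second picks the candidate with the lowest rank,
 * ie. shortest delay, filtered and biased by depth and by the current
 * register pressure estimate)
 */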
575 static struct ir3_instruction *
576 find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
577 bool soft)
578 {
579 struct ir3_instruction *best_instr = NULL;
580 int best_rank = INT_MAX; /* lower is better */
581 unsigned deepest = 0;
582
583 /* TODO we'd really rather use the list/array of block outputs. But we
584 * don't have such a thing. Recursing *every* instruction in the list
585 * will result in a lot of repeated traversal, since instructions will
586 * get traversed both when they appear as ssa src to a later instruction
587 * as well as where they appear in the depth_list.
588 */
589 list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
590 struct ir3_instruction *candidate;
591
592 candidate = find_instr_recursive(ctx, notes, instr);
593 if (!candidate)
594 continue;
595
596 if (is_meta(candidate))
597 return candidate;
598
599 deepest = MAX2(deepest, candidate->depth);
600 }
601
602 /* traverse the list a second time.. but since we cache the result of
603 * find_instr_recursive() it isn't as bad as it looks.
604 */
605 list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
606 struct ir3_instruction *candidate;
607
608 candidate = find_instr_recursive(ctx, notes, instr);
609 if (!candidate)
610 continue;
611
612 /* determine net change to # of live values: */
613 int le = live_effect(candidate);
614
615 /* if there is a net increase in # of live values, then apply some
616 * threshold to avoid instructions getting scheduled *too* early
617 * and increasing register pressure.
618 */
619 if (le >= 1) {
620 unsigned threshold;
621
622 if (ctx->live_values > 4*4) {
623 threshold = 4;
624 } else {
625 threshold = 6;
626 }
627
628 /* Filter out any "shallow" instructions which would otherwise
629 * tend to get scheduled too early to fill delay slots even
630 * when they are not needed for a while. There will probably
631 * be later delay slots that they could just as easily fill.
632 *
633 * A classic case where this comes up is frag shaders that
634 * write a constant value (like 1.0f) to one of the channels
635 * of the output color(s). Since the mov from immed has no
636 * dependencies, it would otherwise get scheduled early to
637 * fill delay slots, occupying a register until the end of
638 * the program.
639 */
640 if ((deepest - candidate->depth) > threshold)
641 continue;
642 }
643
644 int rank = delay_calc(ctx->block, candidate, soft, false);
645
646 /* if too many live values, prioritize instructions that reduce the
647 * number of live values:
648 */
649 if (ctx->live_values > 16*4) {
650 rank = le;
651 } else if (ctx->live_values > 4*4) {
652 rank += le;
653 }
654
655 if (rank < best_rank) {
656 best_instr = candidate;
657 best_rank = rank;
658 }
659 }
660
661 return best_instr;
662 }
663
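/* Clone an already-scheduled instruction (the current a0.x/p0.x writer),
 * add the clone to the depth list, and account for its srcs becoming
 * live again; the callers then remap any remaining unscheduled users to
 * the clone.
 */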
664 static struct ir3_instruction *
665 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
666 {
667 struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
668 ir3_insert_by_depth(new_instr, &ctx->depth_list);
669 transfer_use(ctx, orig_instr, new_instr);
670 return new_instr;
671 }
672
673 /* "spill" the address register by remapping any unscheduled
674 * instructions which depend on the current address register
675 * to a clone of the instruction which wrote the address reg.
676 */
677 static struct ir3_instruction *
678 split_addr(struct ir3_sched_ctx *ctx)
679 {
680 struct ir3 *ir;
681 struct ir3_instruction *new_addr = NULL;
682 unsigned i;
683
684 debug_assert(ctx->addr);
685
686 ir = ctx->addr->block->shader;
687
688 for (i = 0; i < ir->indirects_count; i++) {
689 struct ir3_instruction *indirect = ir->indirects[i];
690
691 if (!indirect)
692 continue;
693
694 /* skip instructions already scheduled: */
695 if (is_scheduled(indirect))
696 continue;
697
698 /* remap remaining instructions using current addr
699 * to new addr:
700 */
701 if (indirect->address == ctx->addr) {
702 if (!new_addr) {
703 new_addr = split_instr(ctx, ctx->addr);
704 /* original addr is scheduled, but new one isn't: */
705 new_addr->flags &= ~IR3_INSTR_MARK;
706 }
707 indirect->address = NULL;
708 ir3_instr_set_address(indirect, new_addr);
709 }
710 }
711
712 /* all remaining indirects remapped to new addr: */
713 ctx->addr = NULL;
714
715 return new_addr;
716 }
717
718 /* "spill" the predicate register by remapping any unscheduled
719 * instructions which depend on the current predicate register
720 * to a clone of the instruction which wrote the predicate reg.
721 */
722 static struct ir3_instruction *
723 split_pred(struct ir3_sched_ctx *ctx)
724 {
725 struct ir3 *ir;
726 struct ir3_instruction *new_pred = NULL;
727 unsigned i;
728
729 debug_assert(ctx->pred);
730
731 ir = ctx->pred->block->shader;
732
733 for (i = 0; i < ir->predicates_count; i++) {
734 struct ir3_instruction *predicated = ir->predicates[i];
735
736 /* skip instructions already scheduled: */
737 if (is_scheduled(predicated))
738 continue;
739
740 /* remap remaining instructions using current pred
741 * to new pred:
742 *
743 * TODO is there ever a case when pred isn't first
744 * (and only) src?
745 */
746 if (ssa(predicated->regs[1]) == ctx->pred) {
747 if (!new_pred) {
748 new_pred = split_instr(ctx, ctx->pred);
749 /* original pred is scheduled, but new one isn't: */
750 new_pred->flags &= ~IR3_INSTR_MARK;
751 }
752 predicated->regs[1]->instr = new_pred;
753 }
754 }
755
756 /* all remaining predicated remapped to new pred: */
757 ctx->pred = NULL;
758
759 return new_pred;
760 }
761
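/* Schedule a single block: meta:input instructions first, then
 * tex-prefetches, then repeatedly pick the best eligible instruction
 * (padding with nops to cover delay slots). If we deadlock on an
 * a0.x/p0.x conflict, "spill" by cloning the register's writer. Finally
 * emit the branch/jump instructions to the successor block(s).
 */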
762 static void
763 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
764 {
765 struct list_head unscheduled_list;
766
767 ctx->block = block;
768
769 /* addr/pred writes are per-block: */
770 ctx->addr = NULL;
771 ctx->pred = NULL;
772
773 /* move all instructions to the unscheduled list, and
774 * empty the block's instruction list (to which we will
775 * be inserting).
776 */
777 list_replace(&block->instr_list, &unscheduled_list);
778 list_inithead(&block->instr_list);
779 list_inithead(&ctx->depth_list);
780
781 /* First schedule all meta:input instructions, followed by
782 * tex-prefetch. We want all of the instructions that load
783 * values into registers before the shader starts to go
784 * before any other instructions. But in particular we
785 * want inputs to come before prefetches. This is because
786 * a FS's bary_ij input may not actually be live in the
787 * shader, but it should not be scheduled on top of any
788 * other input (but it can be overwritten by a tex prefetch).
789 *
790 * Finally, move all the remaining instructions to the depth-
791 * list
792 */
793 list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node)
794 if (instr->opc == OPC_META_INPUT)
795 schedule(ctx, instr);
796
797 list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node)
798 if (instr->opc == OPC_META_TEX_PREFETCH)
799 schedule(ctx, instr);
800
801 list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node)
802 ir3_insert_by_depth(instr, &ctx->depth_list);
803
804 while (!list_is_empty(&ctx->depth_list)) {
805 struct ir3_sched_notes notes = {0};
806 struct ir3_instruction *instr;
807
808 instr = find_eligible_instr(ctx, &notes, true);
809 if (!instr)
810 instr = find_eligible_instr(ctx, &notes, false);
811
812 if (instr) {
813 unsigned delay = delay_calc(ctx->block, instr, false, false);
814
815 /* and if we run out of instructions that can be scheduled,
816 * then it is time for nop's:
817 */
818 debug_assert(delay <= 6);
819 while (delay > 0) {
820 ir3_NOP(block);
821 delay--;
822 }
823
824 schedule(ctx, instr);
825 } else {
826 struct ir3_instruction *new_instr = NULL;
827
828 /* nothing available to schedule.. if we are blocked on
829 * address/predicate register conflict, then break the
830 * deadlock by cloning the instruction that wrote that
831 * reg:
832 */
833 if (notes.addr_conflict) {
834 new_instr = split_addr(ctx);
835 } else if (notes.pred_conflict) {
836 new_instr = split_pred(ctx);
837 } else {
838 debug_assert(0);
839 ctx->error = true;
840 return;
841 }
842
843 if (new_instr) {
844 /* clearing current addr/pred can change what is
845 * available to schedule, so clear cache..
846 */
847 clear_cache(ctx, NULL);
848
849 ir3_insert_by_depth(new_instr, &ctx->depth_list);
850 /* the original instr that wrote addr/pred may have
851 * originated from a different block:
852 */
853 new_instr->block = block;
854 }
855 }
856 }
857
858 /* And lastly, insert branch/jump instructions to take us to
859 * the next block. Later we'll strip back out the branches
860 * that simply jump to next instruction.
861 */
862 if (block->successors[1]) {
863 /* if/else, conditional branches to "then" or "else": */
864 struct ir3_instruction *br;
865 unsigned delay = 6;
866
867 debug_assert(ctx->pred);
868 debug_assert(block->condition);
869
870 delay -= distance(ctx->block, ctx->pred, delay, false);
871
872 while (delay > 0) {
873 ir3_NOP(block);
874 delay--;
875 }
876
877 /* create "else" branch first (since "then" block should
878 * frequently/always end up being a fall-thru):
879 */
880 br = ir3_BR(block);
881 br->cat0.inv = true;
882 br->cat0.target = block->successors[1];
883
884 /* NOTE: we have to hard code delay of 6 above, since
885 * we want to insert the nop's before constructing the
886 * branch. Throw in an assert so we notice if this
887 * ever breaks on a future generation:
888 */
889 debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
890
891 br = ir3_BR(block);
892 br->cat0.target = block->successors[0];
893
894 } else if (block->successors[0]) {
895 /* otherwise unconditional jump to next block: */
896 struct ir3_instruction *jmp;
897
898 jmp = ir3_JUMP(block);
899 jmp->cat0.target = block->successors[0];
900 }
901
902 /* NOTE: if we kept track of the predecessors, we could do a better
903 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
904 * Note that as we eliminate blocks which contain only an unconditional
905 * jump we probably need to propagate the (jp) flag..
906 */
907 }
908
909 /* After scheduling individual blocks, we still could have cases where
910 * on one (or more) paths into a block, a value produced by a previous
911 * block has too few delay slots to be legal. We can't deal with this
912 * in the first pass because of loops (ie. we can't ensure all predecessor
913 * blocks are already scheduled in the first pass). All we can really do at
914 * this point is stuff in extra nop's until things are legal.
915 */
916 static void
917 sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
918 {
919 unsigned n = 0;
920
921 ctx->block = block;
922
923 list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
924 unsigned delay = 0;
925
926 set_foreach(block->predecessors, entry) {
927 struct ir3_block *pred = (struct ir3_block *)entry->key;
928 unsigned d = delay_calc(pred, instr, false, true);
929 delay = MAX2(d, delay);
930 }
931
932 while (delay > n) {
933 struct ir3_instruction *nop = ir3_NOP(block);
934
935 /* move to before instr: */
936 list_delinit(&nop->node);
937 list_addtail(&nop->node, &instr->node);
938
939 n++;
940 }
941
942 /* we can bail once we hit worst case delay: */
943 if (++n > 6)
944 break;
945 }
946 }
947
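/* Main entry point: schedule each block individually, then make a
 * second pass (sched_intra_block()) inserting nops where a value
 * produced in a predecessor block would otherwise be consumed too soon.
 */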
948 int ir3_sched(struct ir3 *ir)
949 {
950 struct ir3_sched_ctx ctx = {0};
951
952 ir3_clear_mark(ir);
953 update_use_count(ir);
954
955 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
956 ctx.live_values = 0;
957 sched_block(&ctx, block);
958 }
959
960 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
961 sched_intra_block(&ctx, block);
962 }
963
964 if (ctx.error)
965 return -1;
966
967 return 0;
968 }
969
970 static unsigned
971 get_array_id(struct ir3_instruction *instr)
972 {
973 /* The expectation is that there is only a single array
974 * src or dst; ir3_cp should enforce this.
975 */
976
977 for (unsigned i = 0; i < instr->regs_count; i++)
978 if (instr->regs[i]->flags & IR3_REG_ARRAY)
979 return instr->regs[i]->array.id;
980
981 unreachable("this was unexpected");
982 }
983
984 /* does instruction 'prior' need to be scheduled before 'instr'? */
985 static bool
986 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
987 {
988 /* TODO for dependencies that are related to a specific object, ie
989 * a specific SSBO/image/array, we could relax this constraint to
990 * make accesses to unrelated objects not depend on each other (at
991 * least as long as not declared coherent)
992 */
993 if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
994 ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
995 return true;
996
997 if (instr->barrier_class & prior->barrier_conflict) {
998 if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
999 /* if only array barrier, then we can further limit false-deps
1000 * by considering the array-id, ie reads/writes to different
1001 * arrays do not depend on each other (no aliasing)
1002 */
1003 if (get_array_id(instr) != get_array_id(prior)) {
1004 return false;
1005 }
1006 }
1007
1008 return true;
1009 }
1010
1011 return false;
1012 }
1013
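/* Walk backwards and forwards from 'instr', adding false dependencies
 * against instructions whose barrier classes conflict with it, stopping
 * in each direction at the first instruction with an identical barrier
 * class.
 */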
1014 static void
1015 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
1016 {
1017 struct list_head *prev = instr->node.prev;
1018 struct list_head *next = instr->node.next;
1019
1020 /* add dependencies on previous instructions that must be scheduled
1021 * prior to the current instruction
1022 */
1023 while (prev != &block->instr_list) {
1024 struct ir3_instruction *pi =
1025 LIST_ENTRY(struct ir3_instruction, prev, node);
1026
1027 prev = prev->prev;
1028
1029 if (is_meta(pi))
1030 continue;
1031
1032 if (instr->barrier_class == pi->barrier_class) {
1033 ir3_instr_add_dep(instr, pi);
1034 break;
1035 }
1036
1037 if (depends_on(instr, pi))
1038 ir3_instr_add_dep(instr, pi);
1039 }
1040
1041 /* add dependencies on this instruction to following instructions
1042 * that must be scheduled after the current instruction:
1043 */
1044 while (next != &block->instr_list) {
1045 struct ir3_instruction *ni =
1046 LIST_ENTRY(struct ir3_instruction, next, node);
1047
1048 next = next->next;
1049
1050 if (is_meta(ni))
1051 continue;
1052
1053 if (instr->barrier_class == ni->barrier_class) {
1054 ir3_instr_add_dep(ni, instr);
1055 break;
1056 }
1057
1058 if (depends_on(ni, instr))
1059 ir3_instr_add_dep(ni, instr);
1060 }
1061 }
1062
1063 /* before scheduling a block, we need to add any necessary false-dependencies
1064 * to ensure that:
1065 *
1066 * (1) barriers are scheduled in the right order wrt instructions related
1067 * to the barrier
1068 *
1069 * (2) reads that come before a write actually get scheduled before the
1070 * write
1071 */
1072 static void
1073 calculate_deps(struct ir3_block *block)
1074 {
1075 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
1076 if (instr->barrier_class) {
1077 add_barrier_deps(block, instr);
1078 }
1079 }
1080 }
1081
1082 void
1083 ir3_sched_add_deps(struct ir3 *ir)
1084 {
1085 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
1086 calculate_deps(block);
1087 }
1088 }