[mesa.git] / src / freedreno / ir3 / ir3_sched.c
1 /*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27
28 #include "util/u_math.h"
29
30 #include "ir3.h"
31 #include "ir3_compiler.h"
32
33 #ifdef DEBUG
34 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
35 #else
36 #define SCHED_DEBUG 0
37 #endif
38 #define d(fmt, ...) do { if (SCHED_DEBUG) { \
39 printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
40 } } while (0)
41
42 #define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
43 printf("SCHED: "fmt": ", ##__VA_ARGS__); \
44 ir3_print_instr(instr); \
45 } } while (0)
46
47 /*
48 * Instruction Scheduling:
49 *
50 * A recursive depth-based scheduling algo. Recursively find an eligible
51 * instruction to schedule, starting from the deepest instruction (recursing
52 * through its unscheduled src instructions). Normally this would result in a
53 * lot of re-traversal of the same instructions, so we cache results in
54 * instr->data (and clear cached results that would no longer be valid
55 * after scheduling an instruction).
56 *
57 * There are a few special cases that need to be handled, since sched
58 * is currently independent of register allocation. Usages of the address
59 * register (a0.x) or the predicate register (p0.x) must be serialized. Ie.
60 * if you have two pairs of instructions that write the same special
61 * register and then read it, then those pairs cannot be interleaved.
62 * To solve this, when we are in such a scheduling "critical section",
63 * and we encounter a conflicting write to a special register, we try
64 * to schedule any remaining instructions that use that value first.
65 */
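/* Illustrative example (not taken from any particular shader): if
 * instruction A writes a0.x and B consumes that value, while C also
 * writes a0.x and D consumes C's value, then A and B (plus any other
 * users of A's a0.x) must be scheduled before C can be.  When that
 * would deadlock the scheduler, split_addr()/split_pred() below clone
 * the writer so that remaining users can be remapped to the clone.
 */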
66
67 struct ir3_sched_ctx {
68 struct ir3_block *block; /* the current block */
69 struct list_head depth_list; /* depth sorted unscheduled instrs */
70 struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
71 struct ir3_instruction *addr; /* current a0.x user, if any */
72 struct ir3_instruction *pred; /* current p0.x user, if any */
73 int live_values; /* estimate of current live values */
74 bool error;
75 };
76
77 static bool is_scheduled(struct ir3_instruction *instr)
78 {
79 return !!(instr->flags & IR3_INSTR_MARK);
80 }
81
82 static bool is_sfu_or_mem(struct ir3_instruction *instr)
83 {
84 return is_sfu(instr) || is_mem(instr);
85 }
86
87 static void
88 unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
89 {
90 struct ir3_instruction *src;
91
92 foreach_ssa_src_n(src, n, instr) {
93 if (__is_false_dep(instr, n))
94 continue;
95 if (instr->block != src->block)
96 continue;
97 if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
98 unuse_each_src(ctx, src);
99 } else {
100 debug_assert(src->use_count > 0);
101
102 if (--src->use_count == 0) {
103 ctx->live_values -= dest_regs(src);
104 debug_assert(ctx->live_values >= 0);
105 }
106 }
107 }
108 }
109
110 static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
111 static void use_instr(struct ir3_instruction *instr);
112
113 /* transfers a use-count to a new instruction, for cases where we
114 * "spill" address or predicate. Note this might cause the
115 * previous instruction that loaded a0.x/p0.x to become live
116 * again, when we previously thought it was dead.
117 */
118 static void
119 transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
120 struct ir3_instruction *new_instr)
121 {
122 struct ir3_instruction *src;
123
124 debug_assert(is_scheduled(orig_instr));
125
126 foreach_ssa_src_n(src, n, new_instr) {
127 if (__is_false_dep(new_instr, n))
128 continue;
129 ctx->live_values += dest_regs(src);
130 use_instr(src);
131 }
132
133 clear_cache(ctx, orig_instr);
134 }
135
136 static void
137 use_each_src(struct ir3_instruction *instr)
138 {
139 struct ir3_instruction *src;
140
141 foreach_ssa_src_n(src, n, instr) {
142 if (__is_false_dep(instr, n))
143 continue;
144 use_instr(src);
145 }
146 }
147
148 static void
149 use_instr(struct ir3_instruction *instr)
150 {
151 if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
152 use_each_src(instr);
153 } else {
154 instr->use_count++;
155 }
156 }
157
158 static void
159 update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
160 {
161 if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
162 return;
163
164 ctx->live_values += dest_regs(instr);
165 unuse_each_src(ctx, instr);
166 }
167
168 static void
169 update_use_count(struct ir3 *ir)
170 {
171 foreach_block (block, &ir->block_list) {
172 foreach_instr (instr, &block->instr_list) {
173 instr->use_count = 0;
174 }
175 }
176
177 foreach_block (block, &ir->block_list) {
178 foreach_instr (instr, &block->instr_list) {
179 if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
180 continue;
181
182 use_each_src(instr);
183 }
184 }
185
186 /* Shader outputs are also used:
187 */
188 struct ir3_instruction *out;
189 foreach_output(out, ir)
190 use_instr(out);
191 }
192
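/* Sentinel for the instr->data cache used by find_instr_recursive():
 * NULL means "not computed yet", NULL_INSTR means "known to have no
 * schedulable candidate", anything else is the cached candidate
 * itself.  clear_cache() resets entries back to NULL.
 */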
193 #define NULL_INSTR ((void *)~0)
194
195 static void
196 clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
197 {
198 foreach_instr (instr2, &ctx->depth_list) {
199 if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
200 instr2->data = NULL;
201 }
202 }
203
204 static void
205 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
206 {
207 debug_assert(ctx->block == instr->block);
208
209 /* maybe there is a better way to handle this than just stuffing
210 * a nop.. ideally we'd know about this constraint in the
211 * scheduling and depth calculation..
212 */
213 if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
214 ir3_NOP(ctx->block);
215
216 /* remove from depth list:
217 */
218 list_delinit(&instr->node);
219
220 if (writes_addr(instr)) {
221 debug_assert(ctx->addr == NULL);
222 ctx->addr = instr;
223 }
224
225 if (writes_pred(instr)) {
226 debug_assert(ctx->pred == NULL);
227 ctx->pred = instr;
228 }
229
230 instr->flags |= IR3_INSTR_MARK;
231
232 di(instr, "schedule");
233
234 list_addtail(&instr->node, &instr->block->instr_list);
235 ctx->scheduled = instr;
236
237 update_live_values(ctx, instr);
238
239 if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
240 clear_cache(ctx, NULL);
241 } else {
242 /* invalidate only the necessary entries.. */
243 clear_cache(ctx, instr);
244 }
245 }
246
247 static struct ir3_instruction *
248 deepest(struct ir3_instruction **srcs, unsigned nsrcs)
249 {
250 struct ir3_instruction *d = NULL;
251 unsigned i = 0, id = 0;
252
253 while ((i < nsrcs) && !(d = srcs[id = i]))
254 i++;
255
256 if (!d)
257 return NULL;
258
259 for (; i < nsrcs; i++)
260 if (srcs[i] && (srcs[i]->depth > d->depth))
261 d = srcs[id = i];
262
263 srcs[id] = NULL;
264
265 return d;
266 }
267
268 /**
269 * @block: the block to search in, starting from the end; in the first pass,
270 * this will be the block the instruction would be inserted into
271 * (but has not yet, ie. it only contains already scheduled
272 * instructions). For intra-block scheduling (second pass), this
273 * would be one of the predecessor blocks.
274 * @instr: the instruction to search for
275 * @maxd: max distance, bail after searching this # of instruction
276 * slots, since it means the instruction we are looking for is
277 * far enough away
278 * @pred: if true, recursively search into predecessor blocks to
279 * find the worst case (shortest) distance (only possible after
280 * individual blocks are all scheduled)
281 */
282 static unsigned
283 distance(struct ir3_block *block, struct ir3_instruction *instr,
284 unsigned maxd, bool pred)
285 {
286 unsigned d = 0;
287
288 foreach_instr_rev (n, &block->instr_list) {
289 if ((n == instr) || (d >= maxd))
290 return d;
291 /* NOTE: don't count branch/jump since we don't know yet if they will
292 * be eliminated later in resolve_jumps().. really should do that
293 * earlier so we don't have this constraint.
294 */
295 if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
296 d++;
297 }
298
299 /* if coming from a predecessor block, assume it is assigned far
300 * enough away.. we'll fix up later.
301 */
302 if (!pred)
303 return maxd;
304
305 if (pred && (block->data != block)) {
306 /* Search into predecessor blocks, finding the one with the
307 * shortest distance, since that will be the worst case
308 */
309 unsigned min = maxd - d;
310
311 /* (ab)use block->data to prevent recursion: */
312 block->data = block;
313
314 set_foreach(block->predecessors, entry) {
315 struct ir3_block *pred = (struct ir3_block *)entry->key;
316 unsigned n;
317
318 n = distance(pred, instr, min, pred);
319
320 min = MIN2(min, n);
321 }
322
323 block->data = NULL;
324 d += min;
325 }
326
327 return d;
328 }
329
330 /* calculate delay for specified src: */
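/* For example, if ir3_delayslots() says the consumer needs three slots
 * after the assigner, and distance() finds two already-scheduled
 * instructions between the assigner and the insertion point, then only
 * one more slot (a nop or another useful instruction) is required.
 */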
331 static unsigned
332 delay_calc_srcn(struct ir3_block *block,
333 struct ir3_instruction *assigner,
334 struct ir3_instruction *consumer,
335 unsigned srcn, bool soft, bool pred)
336 {
337 unsigned delay = 0;
338
339 if (is_meta(assigner)) {
340 struct ir3_instruction *src;
341 foreach_ssa_src(src, assigner) {
342 unsigned d;
343 d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
344 delay = MAX2(delay, d);
345 }
346 } else {
347 if (soft) {
348 if (is_sfu(assigner)) {
349 delay = 4;
350 } else {
351 delay = ir3_delayslots(assigner, consumer, srcn);
352 }
353 } else {
354 delay = ir3_delayslots(assigner, consumer, srcn);
355 }
356 delay -= distance(block, assigner, delay, pred);
357 }
358
359 return delay;
360 }
361
362 /* calculate delay for instruction (maximum of delay for all srcs): */
363 static unsigned
364 delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
365 bool soft, bool pred)
366 {
367 unsigned delay = 0;
368 struct ir3_instruction *src;
369
370 foreach_ssa_src_n(src, i, instr) {
371 unsigned d;
372 d = delay_calc_srcn(block, src, instr, i, soft, pred);
373 delay = MAX2(delay, d);
374 }
375
376 return delay;
377 }
378
379 struct ir3_sched_notes {
380 /* there is at least one kill which could be scheduled, except
381 * for unscheduled bary.f's:
382 */
383 bool blocked_kill;
384 /* there is at least one instruction that could be scheduled,
385 * except for conflicting address/predicate register usage:
386 */
387 bool addr_conflict, pred_conflict;
388 };
389
390 /* could an instruction be scheduled if specified ssa src was scheduled? */
391 static bool
392 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
393 {
394 struct ir3_instruction *other_src;
395 foreach_ssa_src(other_src, instr) {
396 /* if dependency not scheduled, we aren't ready yet: */
397 if ((src != other_src) && !is_scheduled(other_src)) {
398 return false;
399 }
400 }
401 return true;
402 }
403
404 /* Check if instruction is ok to schedule. Make sure it is not blocked
405 * by use of addr/predicate register, etc.
406 */
407 static bool
408 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
409 struct ir3_instruction *instr)
410 {
411 debug_assert(!is_scheduled(instr));
412
413 /* For instructions that write the address register we need to
414 * make sure there is at least one instruction that uses the
415 * addr value and is otherwise ready to schedule.
416 *
417 * TODO if any instructions use pred register and have other
418 * src args, we would need to do the same for writes_pred()..
419 */
420 if (writes_addr(instr)) {
421 struct ir3 *ir = instr->block->shader;
422 bool ready = false;
423 for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
424 struct ir3_instruction *indirect = ir->indirects[i];
425 if (!indirect)
426 continue;
427 if (indirect->address != instr)
428 continue;
429 ready = could_sched(indirect, instr);
430 }
431
432 /* nothing could be scheduled, so keep looking: */
433 if (!ready)
434 return false;
435 }
436
437 /* if this is a write to address/predicate register, and that
438 * register is currently in use, we need to defer until it is
439 * free:
440 */
441 if (writes_addr(instr) && ctx->addr) {
442 debug_assert(ctx->addr != instr);
443 notes->addr_conflict = true;
444 return false;
445 }
446
447 if (writes_pred(instr) && ctx->pred) {
448 debug_assert(ctx->pred != instr);
449 notes->pred_conflict = true;
450 return false;
451 }
452
453 /* if the instruction is a kill, we need to ensure *every*
454 * bary.f is scheduled. The hw seems unhappy if the thread
455 * gets killed before the end-input (ei) flag is hit.
456 *
457 * We could do this by adding each bary.f instruction as a
458 * virtual ssa src for the kill instruction. But we have
459 * fixed-length instr->regs[].
460 *
461 * TODO this wouldn't be quite right if we had multiple
462 * basic blocks, if any block was conditional. We'd need
463 * to schedule the bary.f's outside of any conditional
464 * block that contained a kill.. I think..
465 */
466 if (is_kill(instr)) {
467 struct ir3 *ir = instr->block->shader;
468
469 for (unsigned i = 0; i < ir->baryfs_count; i++) {
470 struct ir3_instruction *baryf = ir->baryfs[i];
471 if (baryf->flags & IR3_INSTR_UNUSED)
472 continue;
473 if (!is_scheduled(baryf)) {
474 notes->blocked_kill = true;
475 return false;
476 }
477 }
478 }
479
480 return true;
481 }
482
483 /* Find the best instruction to schedule from the specified instruction or,
484 * recursively, its ssa sources.
485 */
486 static struct ir3_instruction *
487 find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
488 struct ir3_instruction *instr)
489 {
490 struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
491 struct ir3_instruction *src;
492 unsigned nsrcs = 0;
493
494 if (is_scheduled(instr))
495 return NULL;
496
497 /* use instr->data to cache the results of recursing up the
498 * instr src's. Otherwise the recursive algo can scale quite
499 * badly w/ shader size. But this takes some care to clear
500 * the cache appropriately when instructions are scheduled.
501 */
502 if (instr->data) {
503 if (instr->data == NULL_INSTR)
504 return NULL;
505 return instr->data;
506 }
507
508 /* find unscheduled srcs: */
509 foreach_ssa_src(src, instr) {
510 if (!is_scheduled(src) && (src->block == instr->block)) {
511 debug_assert(nsrcs < ARRAY_SIZE(srcs));
512 srcs[nsrcs++] = src;
513 }
514 }
515
516 /* if all our src's are already scheduled: */
517 if (nsrcs == 0) {
518 if (check_instr(ctx, notes, instr)) {
519 instr->data = instr;
520 return instr;
521 }
522 return NULL;
523 }
524
525 while ((src = deepest(srcs, nsrcs))) {
526 struct ir3_instruction *candidate;
527
528 candidate = find_instr_recursive(ctx, notes, src);
529 if (!candidate)
530 continue;
531
532 if (check_instr(ctx, notes, candidate)) {
533 instr->data = candidate;
534 return candidate;
535 }
536 }
537
538 instr->data = NULL_INSTR;
539 return NULL;
540 }
541
542 /* find net change to live values if instruction were scheduled: */
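/* For example (illustrative numbers): an instruction that writes a
 * single register and is the last user of two single-register srcs
 * gives new_live = 1 and old_live = 2, ie. a net effect of -1.
 */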
543 static int
544 live_effect(struct ir3_instruction *instr)
545 {
546 struct ir3_instruction *src;
547 int new_live = dest_regs(instr);
548 int old_live = 0;
549
550 foreach_ssa_src_n(src, n, instr) {
551 if (__is_false_dep(instr, n))
552 continue;
553
554 if (instr->block != src->block)
555 continue;
556
557 /* for split, just pass things along to the real src: */
558 if (src->opc == OPC_META_SPLIT)
559 src = ssa(src->regs[1]);
560
561 /* for collect, if this is the last use of *each* src,
562 * then it will decrease the live values, since RA treats
563 * them as a whole:
564 */
565 if (src->opc == OPC_META_COLLECT) {
566 struct ir3_instruction *src2;
567 bool last_use = true;
568
569 foreach_ssa_src(src2, src) {
570 if (src2->use_count > 1) {
571 last_use = false;
572 break;
573 }
574 }
575
576 if (last_use)
577 old_live += dest_regs(src);
578
579 } else {
580 debug_assert(src->use_count > 0);
581
582 if (src->use_count == 1) {
583 old_live += dest_regs(src);
584 }
585 }
586 }
587
588 return new_live - old_live;
589 }
590
591 /* find instruction to schedule: */
592 static struct ir3_instruction *
593 find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
594 bool soft)
595 {
596 struct ir3_instruction *best_instr = NULL;
597 int best_rank = INT_MAX; /* lower is better */
598 unsigned deepest = 0;
599
600 /* TODO we'd really rather use the list/array of block outputs. But we
601 * don't have such a thing. Recursing *every* instruction in the list
602 * will result in a lot of repeated traversal, since instructions will
603 * get traversed both when they appear as ssa src to a later instruction
604 * as well as where they appear in the depth_list.
605 */
606 foreach_instr_rev (instr, &ctx->depth_list) {
607 struct ir3_instruction *candidate;
608
609 candidate = find_instr_recursive(ctx, notes, instr);
610 if (!candidate)
611 continue;
612
613 if (is_meta(candidate))
614 return candidate;
615
616 deepest = MAX2(deepest, candidate->depth);
617 }
618
619 /* traverse the list a second time.. but since we cache the result of
620 * find_instr_recursive() it isn't as bad as it looks.
621 */
622 foreach_instr_rev (instr, &ctx->depth_list) {
623 struct ir3_instruction *candidate;
624
625 candidate = find_instr_recursive(ctx, notes, instr);
626 if (!candidate)
627 continue;
628
629 /* determine net change to # of live values: */
630 int le = live_effect(candidate);
631
632 /* if there is a net increase in # of live values, then apply some
633 * threshold to avoid instructions getting scheduled *too* early
634 * and increasing register pressure.
635 */
636 if (le >= 1) {
637 unsigned threshold;
638
639 if (ctx->live_values > 4*4) {
640 threshold = 4;
641 } else {
642 threshold = 6;
643 }
644
645 /* Filter out any "shallow" instructions which would otherwise
646 * tend to get scheduled too early to fill delay slots even
647 * when they are not needed for a while. There will probably
648 * be later delay slots that they could just as easily fill.
649 *
650 * A classic case where this comes up is frag shaders that
651 * write a constant value (like 1.0f) to one of the channels
652 * of the output color(s). Since the mov from immed has no
653 * dependencies, it would otherwise get scheduled early to
654 * fill delay slots, occupying a register until the end of
655 * the program.
656 */
657 if ((deepest - candidate->depth) > threshold)
658 continue;
659 }
660
661 int rank = delay_calc(ctx->block, candidate, soft, false);
662
663 /* if too many live values, prioritize instructions that reduce the
664 * number of live values:
665 */
666 if (ctx->live_values > 16*4) {
667 rank = le;
668 } else if (ctx->live_values > 4*4) {
669 rank += le;
670 }
671
672 if (rank < best_rank) {
673 best_instr = candidate;
674 best_rank = rank;
675 }
676 }
677
678 return best_instr;
679 }
680
681 static struct ir3_instruction *
682 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
683 {
684 struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
685 ir3_insert_by_depth(new_instr, &ctx->depth_list);
686 transfer_use(ctx, orig_instr, new_instr);
687 return new_instr;
688 }
689
690 /* "spill" the address register by remapping any unscheduled
691 * instructions which depend on the current address register
692 * to a clone of the instruction which wrote the address reg.
693 */
694 static struct ir3_instruction *
695 split_addr(struct ir3_sched_ctx *ctx)
696 {
697 struct ir3 *ir;
698 struct ir3_instruction *new_addr = NULL;
699 unsigned i;
700
701 debug_assert(ctx->addr);
702
703 ir = ctx->addr->block->shader;
704
705 for (i = 0; i < ir->indirects_count; i++) {
706 struct ir3_instruction *indirect = ir->indirects[i];
707
708 if (!indirect)
709 continue;
710
711 /* skip instructions already scheduled: */
712 if (is_scheduled(indirect))
713 continue;
714
715 /* remap remaining instructions using current addr
716 * to new addr:
717 */
718 if (indirect->address == ctx->addr) {
719 if (!new_addr) {
720 new_addr = split_instr(ctx, ctx->addr);
721 /* original addr is scheduled, but new one isn't: */
722 new_addr->flags &= ~IR3_INSTR_MARK;
723 }
724 indirect->address = NULL;
725 ir3_instr_set_address(indirect, new_addr);
726 }
727 }
728
729 /* all remaining indirects remapped to new addr: */
730 ctx->addr = NULL;
731
732 return new_addr;
733 }
734
735 /* "spill" the predicate register by remapping any unscheduled
736 * instructions which depend on the current predicate register
737 * to a clone of the instruction which wrote the predicate reg.
738 */
739 static struct ir3_instruction *
740 split_pred(struct ir3_sched_ctx *ctx)
741 {
742 struct ir3 *ir;
743 struct ir3_instruction *new_pred = NULL;
744 unsigned i;
745
746 debug_assert(ctx->pred);
747
748 ir = ctx->pred->block->shader;
749
750 for (i = 0; i < ir->predicates_count; i++) {
751 struct ir3_instruction *predicated = ir->predicates[i];
752
753 /* skip instructions already scheduled: */
754 if (is_scheduled(predicated))
755 continue;
756
757 /* remap remaining instructions using current pred
758 * to new pred:
759 *
760 * TODO is there ever a case when pred isn't first
761 * (and only) src?
762 */
763 if (ssa(predicated->regs[1]) == ctx->pred) {
764 if (!new_pred) {
765 new_pred = split_instr(ctx, ctx->pred);
766 /* original pred is scheduled, but new one isn't: */
767 new_pred->flags &= ~IR3_INSTR_MARK;
768 }
769 predicated->regs[1]->instr = new_pred;
770 }
771 }
772
773 /* all remaining predicated remapped to new pred: */
774 ctx->pred = NULL;
775
776 return new_pred;
777 }
778
779 static void
780 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
781 {
782 struct list_head unscheduled_list;
783
784 ctx->block = block;
785
786 /* addr/pred writes are per-block: */
787 ctx->addr = NULL;
788 ctx->pred = NULL;
789
790 /* move all instructions to the unscheduled list, and
791 * empty the block's instruction list (to which we will
792 * be inserting).
793 */
794 list_replace(&block->instr_list, &unscheduled_list);
795 list_inithead(&block->instr_list);
796 list_inithead(&ctx->depth_list);
797
798 /* First schedule all meta:input instructions, followed by
799 * tex-prefetch. We want all of the instructions that load
800 * values into registers before the shader starts to be
801 * scheduled before any other instructions. But in particular
802 * we want inputs to come before prefetches. This is because
803 * a FS's bary_ij input may not actually be live in the
804 * shader, but it should not be scheduled on top of any
805 * other input (but it can be overwritten by a tex prefetch).
806 *
807 * Finally, move all the remaining instructions to the depth-
808 * list
809 */
810 foreach_instr_safe (instr, &unscheduled_list)
811 if (instr->opc == OPC_META_INPUT)
812 schedule(ctx, instr);
813
814 foreach_instr_safe (instr, &unscheduled_list)
815 if (instr->opc == OPC_META_TEX_PREFETCH)
816 schedule(ctx, instr);
817
818 foreach_instr_safe (instr, &unscheduled_list)
819 ir3_insert_by_depth(instr, &ctx->depth_list);
820
821 while (!list_is_empty(&ctx->depth_list)) {
822 struct ir3_sched_notes notes = {0};
823 struct ir3_instruction *instr;
824
825 instr = find_eligible_instr(ctx, &notes, true);
826 if (!instr)
827 instr = find_eligible_instr(ctx, &notes, false);
828
829 if (instr) {
830 unsigned delay = delay_calc(ctx->block, instr, false, false);
831
832 d("delay=%u", delay);
833
834 /* and if we run out of instructions that can be scheduled,
835 * then it is time for nop's:
836 */
837 debug_assert(delay <= 6);
838 while (delay > 0) {
839 ir3_NOP(block);
840 delay--;
841 }
842
843 schedule(ctx, instr);
844 } else {
845 struct ir3_instruction *new_instr = NULL;
846
847 /* nothing available to schedule.. if we are blocked on
848 * address/predicate register conflict, then break the
849 * deadlock by cloning the instruction that wrote that
850 * reg:
851 */
852 if (notes.addr_conflict) {
853 new_instr = split_addr(ctx);
854 } else if (notes.pred_conflict) {
855 new_instr = split_pred(ctx);
856 } else {
857 debug_assert(0);
858 ctx->error = true;
859 return;
860 }
861
862 if (new_instr) {
863 /* clearing current addr/pred can change what is
864 * available to schedule, so clear cache..
865 */
866 clear_cache(ctx, NULL);
867
868 ir3_insert_by_depth(new_instr, &ctx->depth_list);
869 /* the original instr that wrote addr/pred may have
870 * originated from a different block:
871 */
872 new_instr->block = block;
873 }
874 }
875 }
876
877 /* And lastly, insert branch/jump instructions to take us to
878 * the next block. Later we'll strip back out the branches
879 * that simply jump to the next instruction.
880 */
881 if (block->successors[1]) {
882 /* if/else, conditional branches to "then" or "else": */
883 struct ir3_instruction *br;
884 unsigned delay = 6;
885
886 debug_assert(ctx->pred);
887 debug_assert(block->condition);
888
889 delay -= distance(ctx->block, ctx->pred, delay, false);
890
891 while (delay > 0) {
892 ir3_NOP(block);
893 delay--;
894 }
895
896 /* create "else" branch first (since "then" block should
897 * frequently/always end up being a fall-thru):
898 */
899 br = ir3_BR(block);
900 br->cat0.inv = true;
901 br->cat0.target = block->successors[1];
902
903 /* NOTE: we have to hard-code a delay of 6 above, since
904 * we want to insert the nop's before constructing the
905 * branch. Throw in an assert so we notice if this
906 * ever breaks on a future generation:
907 */
908 debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
909
910 br = ir3_BR(block);
911 br->cat0.target = block->successors[0];
912
913 } else if (block->successors[0]) {
914 /* otherwise unconditional jump to next block: */
915 struct ir3_instruction *jmp;
916
917 jmp = ir3_JUMP(block);
918 jmp->cat0.target = block->successors[0];
919 }
920
921 /* NOTE: if we kept track of the predecessors, we could do a better
922 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
923 * Note that as we eliminate blocks which contain only an unconditional
924 * jump we probably need to propagate the (jp) flag..
925 */
926 }
927
928 /* After scheduling individual blocks, we could still have cases where,
929 * on one (or more) paths into a block, a value produced by a previous
930 * block has too few delay slots to be legal. We can't deal with this in
931 * the first pass, because of loops (ie. we can't ensure all predecessor
932 * blocks are already scheduled in the first pass). All we can really do
933 * at this point is stuff in extra nop's until things are legal.
934 */
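/* For example, a value computed by the last real instruction of a
 * predecessor block and consumed by the first instruction of this
 * block may still need nop's inserted at the top of this block.
 */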
935 static void
936 sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
937 {
938 unsigned n = 0;
939
940 ctx->block = block;
941
942 foreach_instr_safe (instr, &block->instr_list) {
943 unsigned delay = 0;
944
945 set_foreach(block->predecessors, entry) {
946 struct ir3_block *pred = (struct ir3_block *)entry->key;
947 unsigned d = delay_calc(pred, instr, false, true);
948 delay = MAX2(d, delay);
949 }
950
951 while (delay > n) {
952 struct ir3_instruction *nop = ir3_NOP(block);
953
954 /* move to before instr: */
955 list_delinit(&nop->node);
956 list_addtail(&nop->node, &instr->node);
957
958 n++;
959 }
960
961 /* we can bail once we hit worst case delay: */
962 if (++n > 6)
963 break;
964 }
965 }
966
967 int ir3_sched(struct ir3 *ir)
968 {
969 struct ir3_sched_ctx ctx = {0};
970
971 ir3_clear_mark(ir);
972 update_use_count(ir);
973
974 foreach_block (block, &ir->block_list) {
975 ctx.live_values = 0;
976 sched_block(&ctx, block);
977 }
978
979 foreach_block (block, &ir->block_list) {
980 sched_intra_block(&ctx, block);
981 }
982
983 if (ctx.error)
984 return -1;
985
986 return 0;
987 }
988
989 static unsigned
990 get_array_id(struct ir3_instruction *instr)
991 {
992 /* The expectation is that there is only a single array
993 * src or dst; ir3_cp should enforce this.
994 */
995
996 for (unsigned i = 0; i < instr->regs_count; i++)
997 if (instr->regs[i]->flags & IR3_REG_ARRAY)
998 return instr->regs[i]->array.id;
999
1000 unreachable("this was unexpected");
1001 }
1002
1003 /* does instruction 'prior' need to be scheduled before 'instr'? */
1004 static bool
1005 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
1006 {
1007 /* TODO for dependencies that are related to a specific object, ie
1008 * a specific SSBO/image/array, we could relax this constraint to
1009 * make accesses to unrelated objects not depend on each other (at
1010 * least as long as not declared coherent)
1011 */
1012 if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
1013 ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
1014 return true;
1015
1016 if (instr->barrier_class & prior->barrier_conflict) {
1017 if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
1018 /* if only array barrier, then we can further limit false-deps
1019 * by considering the array-id, ie reads/writes to different
1020 * arrays do not depend on each other (no aliasing)
1021 */
1022 if (get_array_id(instr) != get_array_id(prior)) {
1023 return false;
1024 }
1025 }
1026
1027 return true;
1028 }
1029
1030 return false;
1031 }
1032
1033 static void
1034 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
1035 {
1036 struct list_head *prev = instr->node.prev;
1037 struct list_head *next = instr->node.next;
1038
1039 /* add dependencies on previous instructions that must be scheduled
1040 * prior to the current instruction
1041 */
1042 while (prev != &block->instr_list) {
1043 struct ir3_instruction *pi =
1044 LIST_ENTRY(struct ir3_instruction, prev, node);
1045
1046 prev = prev->prev;
1047
1048 if (is_meta(pi))
1049 continue;
1050
1051 if (instr->barrier_class == pi->barrier_class) {
1052 ir3_instr_add_dep(instr, pi);
1053 break;
1054 }
1055
1056 if (depends_on(instr, pi))
1057 ir3_instr_add_dep(instr, pi);
1058 }
1059
1060 /* add dependencies on this instruction to following instructions
1061 * that must be scheduled after the current instruction:
1062 */
1063 while (next != &block->instr_list) {
1064 struct ir3_instruction *ni =
1065 LIST_ENTRY(struct ir3_instruction, next, node);
1066
1067 next = next->next;
1068
1069 if (is_meta(ni))
1070 continue;
1071
1072 if (instr->barrier_class == ni->barrier_class) {
1073 ir3_instr_add_dep(ni, instr);
1074 break;
1075 }
1076
1077 if (depends_on(ni, instr))
1078 ir3_instr_add_dep(ni, instr);
1079 }
1080 }
1081
1082 /* before scheduling a block, we need to add any necessary false-dependencies
1083 * to ensure that:
1084 *
1085 * (1) barriers are scheduled in the right order wrt instructions related
1086 * to the barrier
1087 *
1088 * (2) reads that come before a write actually get scheduled before the
1089 * write
1090 */
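/* For example, a buffer/image/array write gets a false dependency on
 * any earlier (potentially aliasing) read of the same kind of object,
 * so the read cannot be reordered to after the write; depends_on()
 * above decides when two accesses conflict.
 */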
1091 static void
1092 calculate_deps(struct ir3_block *block)
1093 {
1094 foreach_instr (instr, &block->instr_list) {
1095 if (instr->barrier_class) {
1096 add_barrier_deps(block, instr);
1097 }
1098 }
1099 }
1100
1101 void
1102 ir3_sched_add_deps(struct ir3 *ir)
1103 {
1104 foreach_block (block, &ir->block_list) {
1105 calculate_deps(block);
1106 }
1107 }