/* mesa.git: src/freedreno/ir3/ir3_sched.c */
/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth-based scheduling algo. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would no longer be valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
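/* For example (roughly), with two a0.x writers A1/A2 and their users U1/U2:
 *
 *    A1 (writes a0.x)      A2 (writes a0.x)
 *    U1 (reads a0.x)       U2 (reads a0.x)
 *
 * A1, U1, A2, U2 is a legal order, but A1, A2, U1, U2 is not, since the
 * second write would clobber the value that U1 still needs.
 */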

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
	bool error;
};

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

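/* Live-value bookkeeping: use_count tracks how many unscheduled users an
 * instruction's result still has.  Scheduling an instruction makes its own
 * dest regs live and, when it consumes the last use of a source, lets that
 * source's regs die.  Meta fanin/fanout (collect/split) instructions are
 * transparent here, so we recurse through them to the real sources.
 */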
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}

static void use_instr(struct ir3_instruction *instr);

static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}

static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}

static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}

static void
update_use_count(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			instr->use_count = 0;
		}
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *out = ir->outputs[i];

		if (!out)
			continue;

		use_instr(out);
	}
}

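/* instr->data caches the result of find_instr_recursive(): NULL means
 * "not computed yet", NULL_INSTR means "no schedulable candidate found",
 * anything else is the cached candidate.  clear_cache() invalidates the
 * entries that scheduling a given instruction may have made stale (or all
 * cached entries when called with instr == NULL).
 */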
#define NULL_INSTR ((void *)~0)

static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

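/* Pick the candidate with the greatest ->depth from srcs[] and clear its
 * slot, so that repeated calls return the remaining sources in decreasing
 * depth order.
 */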
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned n;

			n = distance(block->predecessors[i], instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}

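/* Delay calculation: the "soft" variant is used when picking what to
 * schedule next: it charges a nominal delay of 4 slots for sfu results so
 * the scheduler tries to cover their latency with other work.  The
 * non-soft value from ir3_delayslots() is the hard minimum, which
 * sched_block() enforces with nop's if nothing else is ready.
 */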
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any conditional block
	 * which contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find net change to live values if instruction were scheduled: */
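/* (For example: if an instruction's only source has use_count == 1 and
 * occupies the same number of regs as the new dest, the net effect is
 * zero: one value becomes live while another dies.)
 */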
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int new_live = dest_regs(instr);
	int old_live = 0;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		/* for fanout/split, just pass things along to the real src: */
		if (src->opc == OPC_META_FO)
			src = ssa(src->regs[1]);

		/* for fanin/collect, if this is the last use of *each* src,
		 * then it will decrease the live values, since RA treats
		 * them as a whole:
		 */
		if (src->opc == OPC_META_FI) {
			struct ir3_instruction *src2;
			bool last_use = true;

			foreach_ssa_src(src2, src) {
				if (src2->use_count > 1) {
					last_use = false;
					break;
				}
			}

			if (last_use)
				old_live += dest_regs(src);

		} else {
			debug_assert(src->use_count > 0);

			if (src->use_count == 1) {
				old_live += dest_regs(src);
			}
		}
	}

	return new_live - old_live;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	int best_rank = INT_MAX; /* lower is better */
	unsigned deepest = 0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (is_meta(candidate))
			return candidate;

		deepest = MAX2(deepest, candidate->depth);
	}

	/* traverse the list a second time.. but since we cache the result of
	 * find_instr_recursive() it isn't as bad as it looks.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		/* determine net change to # of live values: */
		int le = live_effect(candidate);

		/* if there is a net increase in # of live values, then apply some
		 * threshold to avoid instructions getting scheduled *too* early
		 * and increasing register pressure.
		 */
		if (le >= 1) {
			unsigned threshold;

			if (ctx->live_values > 4*4) {
				threshold = 4;
			} else {
				threshold = 6;
			}

			/* Filter out any "shallow" instructions which would otherwise
			 * tend to get scheduled too early to fill delay slots even
			 * when they are not needed for a while.  There will probably
			 * be later delay slots that they could just as easily fill.
			 *
			 * A classic case where this comes up is frag shaders that
			 * write a constant value (like 1.0f) to one of the channels
			 * of the output color(s).  Since the mov from immed has no
			 * dependencies, it would otherwise get scheduled early to
			 * fill delay slots, occupying a register until the end of
			 * the program.
			 */
			if ((deepest - candidate->depth) > threshold)
				continue;
		}

		int rank = delay_calc(ctx->block, candidate, soft, false);

		/* if too many live values, prioritize instructions that reduce the
		 * number of live values:
		 */
		if (ctx->live_values > 16*4) {
			rank = le;
		} else if (ctx->live_values > 4*4) {
			rank += le;
		}

		if (rank < best_rank) {
			best_instr = candidate;
			best_rank = rank;
		}
	}

	return best_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (instr->opc == OPC_META_INPUT) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			/* and if we run out of instructions that can be
			 * scheduled, then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on a future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* After scheduling individual blocks, we still could have cases where,
 * on one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal.  We can't deal with this
 * in the first pass, because of loops (ie. we can't ensure all
 * predecessor blocks are already scheduled in the first pass).  All we
 * can really do at this point is stuff in extra nop's until things are
 * legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		unsigned delay = 0;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
			delay = MAX2(d, delay);
		}

		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}

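/* Entry point: schedule each block independently, then make a second
 * pass (sched_intra_block()) to insert any nop's needed for values that
 * are produced in a predecessor block.
 */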
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ctx.live_values = 0;
		sched_block(&ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_intra_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		calculate_deps(block);
	}
}