freedreno/ir3: sched should mark outputs used
[mesa.git] / src / freedreno / ir3 / ir3_sched.c
/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth-based scheduling algorithm. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would no longer be valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
51
52 struct ir3_sched_ctx {
53 struct ir3_block *block; /* the current block */
54 struct list_head depth_list; /* depth sorted unscheduled instrs */
55 struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
56 struct ir3_instruction *addr; /* current a0.x user, if any */
57 struct ir3_instruction *pred; /* current p0.x user, if any */
58 int live_values; /* estimate of current live values */
59 bool error;
60 };
61
62 static bool is_sfu_or_mem(struct ir3_instruction *instr)
63 {
64 return is_sfu(instr) || is_mem(instr);
65 }
66
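/* Live-value bookkeeping: each instruction's use_count tracks how many of
 * its users have not yet been scheduled.  The fanin/fanout meta instructions
 * (OPC_META_FI/OPC_META_FO) do not occupy registers themselves, so we look
 * through them and account for their sources directly.  Once the last user
 * of a value is scheduled, its dest registers stop being counted as live.
 */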
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}

static void use_instr(struct ir3_instruction *instr);

static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}

static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}

static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}

static void
update_use_count(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			instr->use_count = 0;
		}
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *out = ir->outputs[i];

		if (!out)
			continue;

		use_instr(out);
	}
}

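/* The scheduler caches the result of find_instr_recursive() in instr->data.
 * NULL_INSTR is a sentinel meaning "we already looked and found nothing
 * schedulable below this instruction", as opposed to NULL which means
 * "no cached result yet".
 */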
#define NULL_INSTR ((void *)~0)

static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

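/* Pick (and remove from the array) the remaining candidate src with the
 * largest ->sun value, so that repeated calls walk the sources from the
 * "deepest"/highest-weight one downwards.
 */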
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->sun > d->sun))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned n;

			n = distance(block->predecessors[i], instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}

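/* Note on "soft": the candidate search in sched_block()/find_eligible_instr()
 * first calls delay_calc() with soft=true, and only the nop padding that is
 * actually emitted uses soft=false.  With soft=true an sfu producer is charged
 * a fixed estimate (4) so the scheduler prefers to cover that latency with
 * other useful work, while soft=false reflects only the delay slots that
 * genuinely have to be filled.
 */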
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find instruction to schedule: */
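/* Walks the depth list from the deepest entry, recursing into each entry's
 * unscheduled sources, and normally picks the candidate with the smallest
 * delay.  Once the live-value estimate exceeds 16*4 (roughly 16 full
 * registers worth of components), candidates are instead compared only by
 * ->sun, to prioritize reducing register pressure.
 */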
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	unsigned min_delay = ~0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (ctx->live_values > 16*4) {
			/* under register pressure, only care about reducing live values: */
			if (!best_instr || (candidate->sun > best_instr->sun))
				best_instr = candidate;
		} else {
			delay = delay_calc(ctx->block, candidate, soft, false);
			if ((delay < min_delay) ||
					((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
				best_instr = candidate;
				min_delay = delay;
			}
		}
	}

	return best_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (instr->opc == OPC_META_INPUT) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on future generations:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* After scheduling individual blocks, we still could have cases where,
 * on one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal.  We can't deal with this in
 * the first pass, because of loops (ie. we can't ensure all predecessor
 * blocks are already scheduled in the first pass).  All we can really do
 * at this point is stuff in extra nop's until things are legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		unsigned delay = 0;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
			delay = MAX2(d, delay);
		}

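		/* the n instructions (including any nops) already emitted ahead
		 * of this one in the current block also help cover the delay
		 * from the predecessor, hence comparing against n rather than
		 * zero:
		 */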
		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}

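/* Main entry point: schedule each block in isolation, then make a second
 * pass (sched_intra_block()) to pad any remaining cross-block delays with
 * nops.
 */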
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ctx.live_values = 0;
		sched_block(&ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_intra_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

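		/* the assumption here seems to be that an earlier instruction
		 * of the same barrier class already carries any needed ordering
		 * against instructions further back, so a single dep on the
		 * closest such instruction is enough and we can stop the walk:
		 */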
		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		calculate_deps(block);
	}
}