/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"
/*
 * Instruction Scheduling:
 *
 * A recursive depth-based scheduling algo.  Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions).  Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

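/*
 * Illustrative sketch (added commentary, not from the original source):
 * with two a0.x writes A1 and A2, each followed by users U1a/U1b and
 * U2a/U2b, a legal order is
 *
 *    A1, U1a, U1b, A2, U2a, U2b
 *
 * whereas interleaving (e.g. A1, A2, U1a, ..) would leave U1a reading
 * A2's value, since there is only a single address register.
 */
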
struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
	bool error;
};

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}

static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			use_each_src(src);
		} else {
			src->use_count++;
		}
	}
}

static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}

/* This is *slightly* different than how ir3_cp uses use_count, in that
 * we just track it per block (because we schedule a block at a time) and
 * because we don't track meta instructions and false dependencies (since
 * they don't contribute real register pressure).
 */
static void
update_use_count(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		instr->use_count = 0;
	}

	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
			continue;

		use_each_src(instr);
	}
}
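
/*
 * Rough accounting sketch (added commentary, not from the original
 * source): when an instruction is scheduled, live_values grows by
 * dest_regs(instr) for the value it defines, and shrinks by
 * dest_regs(src) for every src whose last remaining use this was
 * (use_count dropping to zero).  E.g. scheduling a single-component
 * add whose two scalar srcs both die here nets live_values += 1 - 1 - 1.
 */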

#define NULL_INSTR ((void *)~0)

static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}
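
/*
 * Added note (hedged): the back-to-back sfu/mem check above means that
 * scheduling a memory access immediately after an sfu op (or vice versa)
 * comes out as "sfu; nop; mem" in the final instruction stream, rather
 * than the two issuing in adjacent slots.
 */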

static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->sun > d->sun))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

/**
 * @block: the block to search in, starting from end; in the first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned n;

			n = distance(block->predecessors[i], instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}
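
/*
 * Added example (hedged): if the tail of the already-scheduled block
 * currently reads
 *
 *    ..., assigner, add, mul       <- end of instr_list
 *
 * then distance(block, assigner, maxd, false) returns 2, since two
 * slot-consuming instructions (the add and the mul) have been issued
 * after the assigner; branches/jumps are not counted.
 */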

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}
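
/*
 * Added worked example (hedged): if ir3_delayslots() reports that the
 * consumer needs N slots after the assigner, and distance() shows that
 * d slot-consuming instructions have already been issued since the
 * assigner, the remaining delay is N - d.  distance() is capped at the
 * requested delay, so the subtraction above cannot wrap below zero.
 */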

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any conditional block
	 * which contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or,
 * recursively, its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}
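
/*
 * Added note (hedged): instr->data acts as a three-state cache here:
 * NULL means "not visited yet", NULL_INSTR means "nothing schedulable
 * was found below this instruction", and any other value is the cached
 * candidate.  clear_cache() resets entries back to NULL once scheduling
 * an instruction may have invalidated them.
 */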

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	unsigned min_delay = ~0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (ctx->live_values > 16*4) {
			/* under register pressure, only care about reducing live values: */
			if (!best_instr || (candidate->sun > best_instr->sun))
				best_instr = candidate;
		} else {
			delay = delay_calc(ctx->block, candidate, soft, false);
			if ((delay < min_delay) ||
					((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
				best_instr = candidate;
				min_delay = delay;
			}
		}
	}

	return best_instr;
}
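
/*
 * Added note (hedged): the 16*4 threshold above is an estimate of live
 * scalar components (roughly sixteen full vec4's worth).  Below it, the
 * scheduler mostly minimizes stall delay, only preferring a higher ->sun
 * candidate when its delay is within two slots of the best seen so far;
 * above it, delay is ignored and the largest ->sun candidate wins.
 */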

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}
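
/*
 * Added sketch (hedged): continuing the earlier a0.x example, if A1 and
 * U1a are already scheduled but U1b is still pending when A2 must be
 * scheduled, split_addr() clones A1 into an unscheduled A1' and repoints
 * U1b at it, so A2 no longer conflicts and U1b later picks up a0.x from
 * A1' instead.
 */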

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (instr->opc == OPC_META_INPUT) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}
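
/*
 * Added note (hedged): for the conditional-branch case above, the nop
 * padding is effectively 6 - distance(block, pred_write, 6) slots, i.e.
 * if the p0.x write was the very last instruction scheduled, all six
 * nops are emitted; any slot-consuming instructions already sitting
 * between the p0.x write and the end of the block reduce the count.
 */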

/* After scheduling individual blocks, we still could have cases where,
 * along one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal.  We can't deal with this in
 * the first pass, because of loops (ie. we can't ensure all predecessor
 * blocks are already scheduled in the first pass).  All we can really do
 * at this point is stuff in extra nop's until things are legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		unsigned delay = 0;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
			delay = MAX2(d, delay);
		}

		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}
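
/*
 * Added note (hedged): 'n' above counts instruction slots from the top
 * of the block (including the nops just inserted), so a cross-block
 * hazard only forces nops while delay > n; once we are more than six
 * slots into the block, no producer in a predecessor block can still be
 * within its delay window and the loop bails out.
 */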

int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ctx.live_values = 0;
		update_use_count(block);
		sched_block(&ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_intra_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;
	return !!(instr->barrier_class & prior->barrier_conflict);
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}
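
/*
 * Added example (hedged): if a memory write appears after a read whose
 * barrier_class conflicts with it, add_barrier_deps() records a false
 * dependency from the write back to the read, so the scheduler cannot
 * reorder the write ahead of the read even though there is no ssa link
 * between them.  Scanning in each direction stops early once an
 * instruction of the same barrier_class is found, since that one already
 * carries the dependency transitively.
 */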

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		calculate_deps(block);
	}
}