freedreno/ir3: move block-scheduling into legalize
src/freedreno/ir3/ir3_sched.c
/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("SCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)

/*
 * Instruction Scheduling:
 *
 * A recursive, depth-based scheduling algorithm. Recursively find an
 * eligible instruction to schedule from the deepest instruction
 * (recursing through its unscheduled src instructions). Normally this
 * would result in a lot of re-traversal of the same instructions, so we
 * cache results in instr->data (and clear cached results that would no
 * longer be valid after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Uses of the address
 * register (a0.x) or predicate register (p0.x) must be serialized. I.e.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
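/* A rough sketch of the per-block flow implemented below (see
 * sched_block()):
 *
 *    schedule all meta:input instructions, then tex-prefetches
 *    move the remaining instructions onto the depth-sorted list
 *    while the depth list is not empty:
 *       find an eligible instruction (first honoring "soft" delays,
 *       then ignoring them if nothing was found)
 *       if found, insert nops for any remaining delay and schedule it
 *       otherwise "spill" a0.x/p0.x by cloning the instruction that
 *       wrote it, which unblocks the conflicting writer
 */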

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
	bool error;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}

static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void use_instr(struct ir3_instruction *instr);

/* transfers a use-count to a new instruction, for cases where we
 * "spill" the address or predicate register. Note this might cause the
 * instruction that originally loaded a0.x/p0.x to become live again,
 * when we previously thought it was dead.
 */
static void
transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
		struct ir3_instruction *new_instr)
{
	struct ir3_instruction *src;

	debug_assert(is_scheduled(orig_instr));

	foreach_ssa_src_n(src, n, new_instr) {
		if (__is_false_dep(new_instr, n))
			continue;
		ctx->live_values += dest_regs(src);
		use_instr(src);
	}

	clear_cache(ctx, orig_instr);
}

static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}

static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}

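/* Book-keeping when an instruction is scheduled: the instruction's own
 * dest regs become live, and any src whose use_count drops to zero stops
 * being live (handled in unuse_each_src() above). Informally: scheduling
 * a full-precision scalar alu op whose two srcs were each at
 * use_count == 1 nets live_values +1 (the dest) and -2 (the srcs).
 */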
static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}

static void
update_use_count(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			instr->use_count = 0;
		}
	}

	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	struct ir3_instruction *out;
	foreach_output(out, ir)
		use_instr(out);
}

#define NULL_INSTR ((void *)~0)
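/* (NULL_INSTR is stored in instr->data by find_instr_recursive() to mean
 * "visited, nothing schedulable below this instruction", as opposed to
 * NULL, which means the cached result needs to be (re)computed.)
 */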

static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	foreach_instr (instr2, &ctx->depth_list) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule. Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	debug_assert(!is_scheduled(instr));

	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled. The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction. But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional. We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from the specified instruction or,
 * recursively, its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's. Otherwise the recursive algo can scale quite
	 * badly w/ shader size. But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find net change to live values if instruction were scheduled: */
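/* (Informal example: for a full-precision scalar alu instruction with two
 * scalar srcs that are each at use_count == 1, scheduling it frees both
 * srcs and makes one new dest value live, so live_effect() returns
 * 1 - 2 = -1. A result > 0 means scheduling it increases register
 * pressure.)
 */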
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int new_live = dest_regs(instr);
	int old_live = 0;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		/* for split, just pass things along to the real src: */
		if (src->opc == OPC_META_SPLIT)
			src = ssa(src->regs[1]);

		/* for collect, if this is the last use of *each* src,
		 * then it will decrease the live values, since RA treats
		 * them as a whole:
		 */
		if (src->opc == OPC_META_COLLECT) {
			struct ir3_instruction *src2;
			bool last_use = true;

			foreach_ssa_src(src2, src) {
				if (src2->use_count > 1) {
					last_use = false;
					break;
				}
			}

			if (last_use)
				old_live += dest_regs(src);

		} else {
			debug_assert(src->use_count > 0);

			if (src->use_count == 1) {
				old_live += dest_regs(src);
			}
		}
	}

	return new_live - old_live;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	int best_rank = INT_MAX; /* lower is better */
	unsigned deepest = 0;

	/* TODO we'd really rather use the list/array of block outputs. But we
	 * don't have such a thing. Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	foreach_instr_rev (instr, &ctx->depth_list) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (is_meta(candidate))
			return candidate;

		deepest = MAX2(deepest, candidate->depth);
	}

	/* traverse the list a second time.. but since we cache the result of
	 * find_instr_recursive() it isn't as bad as it looks.
	 */
	foreach_instr_rev (instr, &ctx->depth_list) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		/* determine net change to # of live values: */
		int le = live_effect(candidate);

		/* if there is a net increase in # of live values, then apply some
		 * threshold to avoid instructions getting scheduled *too* early
		 * and increasing register pressure.
		 */
		if (le >= 1) {
			unsigned threshold;

			if (ctx->live_values > 4*4) {
				threshold = 4;
			} else {
				threshold = 6;
			}

			/* Filter out any "shallow" instructions which would otherwise
			 * tend to get scheduled too early to fill delay slots even
			 * when they are not needed for a while. There will probably
			 * be later delay slots that they could just as easily fill.
			 *
			 * A classic case where this comes up is frag shaders that
			 * write a constant value (like 1.0f) to one of the channels
			 * of the output color(s). Since the mov from immed has no
			 * dependencies, it would otherwise get scheduled early to
			 * fill delay slots, occupying a register until the end of
			 * the program.
			 */
			if ((deepest - candidate->depth) > threshold)
				continue;
		}

		int rank = ir3_delay_calc(ctx->block, candidate, soft, false);

		/* if too many live values, prioritize instructions that reduce the
		 * number of live values:
		 */
		if (ctx->live_values > 16*4) {
			rank = le;
		} else if (ctx->live_values > 4*4) {
			rank += le;
		}

		if (rank < best_rank) {
			best_instr = candidate;
			best_rank = rank;
		}
	}

	return best_instr;
}

static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
	ir3_insert_by_depth(new_instr, &ctx->depth_list);
	transfer_use(ctx, orig_instr, new_instr);
	return new_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
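/* (For example, if a second a0.x write is blocked because some users of
 * the current a0.x value are still unscheduled, those users get remapped
 * to a fresh clone of the original a0.x write and ctx->addr is released,
 * so the conflicting write becomes schedulable.)
 */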
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = split_instr(ctx, ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			indirect->address = NULL;
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

624 /* "spill" the predicate register by remapping any unscheduled
625 * instructions which depend on the current predicate register
626 * to a clone of the instruction which wrote the address reg.
627 */
628 static struct ir3_instruction *
629 split_pred(struct ir3_sched_ctx *ctx)
630 {
631 struct ir3 *ir;
632 struct ir3_instruction *new_pred = NULL;
633 unsigned i;
634
635 debug_assert(ctx->pred);
636
637 ir = ctx->pred->block->shader;
638
639 for (i = 0; i < ir->predicates_count; i++) {
640 struct ir3_instruction *predicated = ir->predicates[i];
641
642 /* skip instructions already scheduled: */
643 if (is_scheduled(predicated))
644 continue;
645
646 /* remap remaining instructions using current pred
647 * to new pred:
648 *
649 * TODO is there ever a case when pred isn't first
650 * (and only) src?
651 */
652 if (ssa(predicated->regs[1]) == ctx->pred) {
653 if (!new_pred) {
654 new_pred = split_instr(ctx, ctx->pred);
655 /* original pred is scheduled, but new one isn't: */
656 new_pred->flags &= ~IR3_INSTR_MARK;
657 }
658 predicated->regs[1]->instr = new_pred;
659 }
660 }
661
662 /* all remaining predicated remapped to new pred: */
663 ctx->pred = NULL;
664
665 return new_pred;
666 }

static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch. We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions. But in particular we
	 * want inputs to come before prefetches. This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 *
	 * Finally, move all the remaining instructions to the depth-
	 * list
	 */
	foreach_instr_safe (instr, &unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &unscheduled_list)
		ir3_insert_by_depth(instr, &ctx->depth_list);

	while (!list_is_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
			d("delay=%u", delay);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}
}

int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	foreach_block (block, &ir->block_list) {
		ctx.live_values = 0;
		sched_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst; ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
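/* (Informally: the two conflict if one's barrier_class hits the other's
 * barrier_conflict mask, e.g. an SSBO/image write vs. a later read of the
 * same kind. Array accesses are additionally filtered by array id, since
 * distinct arrays cannot alias.)
 */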
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
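/* (For case (2), e.g.: an array read followed by a write to the same array
 * gets a false dependency here, so the scheduler cannot move the write
 * above the read and clobber the value that was about to be read.)
 */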
static void
calculate_deps(struct ir3_block *block)
{
	foreach_instr (instr, &block->instr_list) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		calculate_deps(block);
	}
}