freedreno/ir3: track half-precision live values
[mesa.git] src/freedreno/ir3/ir3_sched.c
/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
		printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
	} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
		printf("SCHED: "fmt": ", ##__VA_ARGS__); \
		ir3_print_instr(instr); \
	} } while (0)

/*
 * Instruction Scheduling:
 *
 * A recursive depth-based scheduling algo. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would no longer be valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
	int half_live_values;              /* estimate of current half precision live values */
	bool error;

	unsigned live_threshold_hi;
	unsigned live_threshold_lo;
	unsigned depth_threshold_hi;
	unsigned depth_threshold_lo;
};

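/* Instructions that have already been scheduled are moved into the block's
 * instr_list and flagged with IR3_INSTR_MARK; everything else is still
 * waiting on the depth list.
 */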
static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

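/* sfu and memory instructions cannot be scheduled back to back; schedule()
 * inserts a nop between two such instructions.
 */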
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

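/* Decrement the use-count of each src of a just-scheduled instruction.
 * When a src's last use is scheduled its value dies, and the live-value
 * estimate for the appropriate register file (full or half) is reduced.
 * collect/split meta instructions are transparent: the adjustment is
 * applied to their real srcs instead.
 */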
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				if (is_half(src)) {
					ctx->half_live_values -= dest_regs(src);
					debug_assert(ctx->half_live_values >= 0);
				} else {
					ctx->live_values -= dest_regs(src);
					debug_assert(ctx->live_values >= 0);
				}
			}
		}
	}
}

static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void use_instr(struct ir3_instruction *instr);

/* transfers a use-count to new instruction, for cases where we
 * "spill" address or predicate. Note this might cause the
 * previous instruction that loaded a0.x/p0.x to become live
 * again, when we previously thought it was dead.
 */
static void
transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
		struct ir3_instruction *new_instr)
{
	struct ir3_instruction *src;

	debug_assert(is_scheduled(orig_instr));

	foreach_ssa_src_n(src, n, new_instr) {
		if (__is_false_dep(new_instr, n))
			continue;
		if (is_half(new_instr)) {
			ctx->half_live_values += dest_regs(src);
		} else {
			ctx->live_values += dest_regs(src);
		}
		use_instr(src);
	}

	clear_cache(ctx, orig_instr);
}

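/* The use-count side: use_instr() bumps the count on an instruction (or,
 * for collect/split meta instructions, on their real srcs), and
 * use_each_src() does so for every ssa src of an instruction.
 */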
static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}

static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}

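/* Account for a just-scheduled instruction: its dest regs become live in
 * either the half or full register file, and its srcs may die (see
 * unuse_each_src()). collect/split meta instructions don't occupy
 * registers of their own and are skipped.
 */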
static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *scheduled)
{
	if ((scheduled->opc == OPC_META_COLLECT) || (scheduled->opc == OPC_META_SPLIT))
		return;

	if ((scheduled->regs_count > 0) && is_half(scheduled)) {
		ctx->half_live_values += dest_regs(scheduled);
	} else {
		ctx->live_values += dest_regs(scheduled);
	}

	unuse_each_src(ctx, scheduled);
}

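/* (Re)compute use_count for every instruction in the shader: each ssa use
 * counts, with collect/split treated as pass-through, and shader outputs
 * counted as uses as well.
 */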
static void
update_use_count(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			instr->use_count = 0;
		}
	}

	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	struct ir3_instruction *out;
	foreach_output(out, ir)
		use_instr(out);
}

#define NULL_INSTR ((void *)~0)

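/* Invalidate cached find_instr_recursive() results stored in instr->data.
 * Passing instr==NULL clears every entry; otherwise only entries that
 * cached 'instr' (or the NULL_INSTR "nothing found" sentinel) are cleared.
 */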
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	foreach_instr (instr2, &ctx->depth_list) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

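/* Move an instruction from the depth list to the end of the block's
 * instruction list, marking it scheduled and updating the live-value
 * estimates and the a0.x/p0.x tracking.
 */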
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

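/* Pick the deepest remaining entry from the srcs array, clearing its slot
 * so that repeated calls walk the srcs in order of decreasing depth.
 */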
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule. Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	debug_assert(!is_scheduled(instr));

	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled. The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction. But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional. We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from the specified instruction or,
 * recursively, its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's. Otherwise the recursive algo can scale quite
	 * badly w/ shader size. But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find net change to live values if instruction were scheduled: */
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int new_live = dest_regs(instr);
	int old_live = 0;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		/* for split, just pass things along to the real src: */
		if (src->opc == OPC_META_SPLIT)
			src = ssa(src->regs[1]);

		/* for collect, if this is the last use of *each* src,
		 * then it will decrease the live values, since RA treats
		 * them as a whole:
		 */
		if (src->opc == OPC_META_COLLECT) {
			struct ir3_instruction *src2;
			bool last_use = true;

			foreach_ssa_src(src2, src) {
				if (src2->use_count > 1) {
					last_use = false;
					break;
				}
			}

			if (last_use)
				old_live += dest_regs(src);

		} else {
			debug_assert(src->use_count > 0);

			if (src->use_count == 1) {
				old_live += dest_regs(src);
			}
		}
	}

	return new_live - old_live;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	int best_rank = INT_MAX;      /* lower is better */
	unsigned deepest = 0;

	/* TODO we'd really rather use the list/array of block outputs. But we
	 * don't have such a thing. Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	foreach_instr_rev (instr, &ctx->depth_list) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (is_meta(candidate))
			return candidate;

		deepest = MAX2(deepest, candidate->depth);
	}

	/* traverse the list a second time.. but since we cache the result of
	 * find_instr_recursive() it isn't as bad as it looks.
	 */
	foreach_instr_rev (instr, &ctx->depth_list) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		/* determine net change to # of live values: */
		int le = live_effect(candidate);
		unsigned live_values = (2 * ctx->live_values) + ctx->half_live_values;

		/* if there is a net increase in # of live values, then apply some
		 * threshold to avoid instructions getting scheduled *too* early
		 * and increasing register pressure.
		 */
		if (le >= 1) {
			unsigned threshold;

			if (live_values > ctx->live_threshold_lo) {
				threshold = ctx->depth_threshold_lo;
			} else {
				threshold = ctx->depth_threshold_hi;
			}

			/* Filter out any "shallow" instructions which would otherwise
			 * tend to get scheduled too early to fill delay slots even
			 * when they are not needed for a while. There will probably
			 * be later delay slots that they could just as easily fill.
			 *
			 * A classic case where this comes up is frag shaders that
			 * write a constant value (like 1.0f) to one of the channels
			 * of the output color(s). Since the mov from immed has no
			 * dependencies, it would otherwise get scheduled early to
			 * fill delay slots, occupying a register until the end of
			 * the program.
			 */
			if ((deepest - candidate->depth) > threshold)
				continue;
		}

		int rank = ir3_delay_calc(ctx->block, candidate, soft, false);

		/* if too many live values, prioritize instructions that reduce the
		 * number of live values:
		 */
		if (live_values > ctx->live_threshold_hi) {
			rank = le;
		} else if (live_values > ctx->live_threshold_lo) {
			rank += le;
		}

		if (rank < best_rank) {
			best_instr = candidate;
			best_rank = rank;
		}
	}

	return best_instr;
}

static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
	struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
	ir3_insert_by_depth(new_instr, &ctx->depth_list);
	transfer_use(ctx, orig_instr, new_instr);
	return new_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = split_instr(ctx, ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			indirect->address = NULL;
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

645 /* "spill" the predicate register by remapping any unscheduled
646 * instructions which depend on the current predicate register
647 * to a clone of the instruction which wrote the address reg.
648 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = split_instr(ctx, ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

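/* Schedule a single basic block: inputs and tex-prefetches are scheduled
 * up front, everything else goes onto the depth list, and then we
 * repeatedly pick the best eligible instruction, inserting nops for any
 * unfilled delay slots, until the depth list is empty.
 */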
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch. We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions. But in particular we
	 * want inputs to come before prefetches. This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 *
	 * Finally, move all the remaining instructions to the depth-
	 * list
	 */
	foreach_instr_safe (instr, &unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &unscheduled_list)
		ir3_insert_by_depth(instr, &ctx->depth_list);

	while (!list_is_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
			d("delay=%u", delay);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}
}

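/* Returns true if the shader contains texture fetches or memory loads
 * (other than ldlv/ldl/ldlw) whose latency the scheduler should try to
 * hide.
 */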
static bool
has_latency_to_hide(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			if (is_tex(instr))
				return true;

			if (is_load(instr)) {
				switch (instr->opc) {
				case OPC_LDLV:
				case OPC_LDL:
				case OPC_LDLW:
					break;
				default:
					return true;
				}
			}
		}
	}

	return false;
}

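/* Pick the live-value/depth thresholds used by find_eligible_instr(). The
 * live thresholds are compared against the weighted live-value count used
 * there (full regs count double, half regs once). When there is latency to
 * hide, the lower depth thresholds kick in at a lower live-value count,
 * filtering shallow instructions more aggressively.
 */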
static void
setup_thresholds(struct ir3_sched_ctx *ctx, struct ir3 *ir)
{
	if (has_latency_to_hide(ir)) {
		ctx->live_threshold_hi = 2 * 16 * 4;
		ctx->live_threshold_lo = 2 * 4 * 4;
		ctx->depth_threshold_hi = 6;
		ctx->depth_threshold_lo = 4;
	} else {
		ctx->live_threshold_hi = 2 * 16 * 4;
		ctx->live_threshold_lo = 2 * 12 * 4;
		ctx->depth_threshold_hi = 16;
		ctx->depth_threshold_lo = 16;
	}
}

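/* Main entry point: schedule each block in turn, resetting the live-value
 * estimates per block. Returns -1 if scheduling failed (unresolvable
 * deadlock), 0 otherwise.
 */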
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	setup_thresholds(&ctx, ir);

	ir3_clear_mark(ir);
	update_use_count(ir);

	foreach_block (block, &ir->block_list) {
		ctx.live_values = 0;
		ctx.half_live_values = 0;
		sched_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}

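/* Walk backwards and forwards from 'instr', adding false dependencies
 * against non-meta instructions whose barrier classes conflict. The walk
 * in each direction stops after hitting an instruction of the exact same
 * barrier class, since that instruction carries the remaining deps
 * transitively.
 */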
static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	foreach_instr (instr, &block->instr_list) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		calculate_deps(block);
	}
}