freedreno/ir3: split out has_latency_to_hide()
src/freedreno/ir3/ir3_sched.c
/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
        printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
        printf("SCHED: "fmt": ", ##__VA_ARGS__); \
        ir3_print_instr(instr); \
} } while (0)

/*
 * Instruction Scheduling:
 *
 * A recursive depth-based scheduling algo. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would no longer be valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of the address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

struct ir3_sched_ctx {
    struct ir3_block *block; /* the current block */
    struct list_head depth_list; /* depth sorted unscheduled instrs */
    struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
    struct ir3_instruction *addr; /* current a0.x user, if any */
    struct ir3_instruction *pred; /* current p0.x user, if any */
    int live_values; /* estimate of current live values */
    int half_live_values; /* estimate of current half precision live values */
    bool error;

    unsigned live_threshold_hi;
    unsigned live_threshold_lo;
    unsigned depth_threshold_hi;
    unsigned depth_threshold_lo;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
    return !!(instr->flags & IR3_INSTR_MARK);
}

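/* Drop a use of each (true) ssa src of 'instr' that lives in the same
 * block, recursing through collect/split meta instructions.  When a src's
 * use-count reaches zero it is no longer live, so the live value
 * estimates are reduced accordingly.
 */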
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
    struct ir3_instruction *src;

    foreach_ssa_src_n(src, n, instr) {
        if (__is_false_dep(instr, n))
            continue;
        if (instr->block != src->block)
            continue;
        if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
            unuse_each_src(ctx, src);
        } else {
            debug_assert(src->use_count > 0);

            if (--src->use_count == 0) {
                if (is_half(src)) {
                    ctx->half_live_values -= dest_regs(src);
                    debug_assert(ctx->half_live_values >= 0);
                } else {
                    ctx->live_values -= dest_regs(src);
                    debug_assert(ctx->live_values >= 0);
                }
            }
        }
    }
}

static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void use_instr(struct ir3_instruction *instr);

/* transfers a use-count to new instruction, for cases where we
 * "spill" address or predicate.  Note this might cause the
 * previous instruction that loaded a0.x/p0.x to become live
 * again, when we previously thought it was dead.
 */
static void
transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
        struct ir3_instruction *new_instr)
{
    struct ir3_instruction *src;

    debug_assert(is_scheduled(orig_instr));

    foreach_ssa_src_n(src, n, new_instr) {
        if (__is_false_dep(new_instr, n))
            continue;
        if (is_half(new_instr)) {
            ctx->half_live_values += dest_regs(src);
        } else {
            ctx->live_values += dest_regs(src);
        }
        use_instr(src);
    }

    clear_cache(ctx, orig_instr);
}

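/* Account for a use of each (true) ssa src of 'instr'.  Uses of
 * collect/split meta instructions are passed through to their srcs, so
 * the use-count always lands on the real defining instruction.
 */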
static void
use_each_src(struct ir3_instruction *instr)
{
    struct ir3_instruction *src;

    foreach_ssa_src_n(src, n, instr) {
        if (__is_false_dep(instr, n))
            continue;
        use_instr(src);
    }
}

static void
use_instr(struct ir3_instruction *instr)
{
    if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
        use_each_src(instr);
    } else {
        instr->use_count++;
    }
}

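/* Update the live value estimates after 'scheduled' has been scheduled:
 * its dest regs become live, and any srcs for which this was the last use
 * stop being live.  Collect/split meta instructions are skipped, since RA
 * treats their srcs/dst as a single value.
 */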
static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *scheduled)
{
    if ((scheduled->opc == OPC_META_COLLECT) || (scheduled->opc == OPC_META_SPLIT))
        return;

    if ((scheduled->regs_count > 0) && is_half(scheduled)) {
        ctx->half_live_values += dest_regs(scheduled);
    } else {
        ctx->live_values += dest_regs(scheduled);
    }

    unuse_each_src(ctx, scheduled);
}

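/* Initialize the use-counts for the whole shader: zero everything, then
 * count one use per (true) ssa src of each instruction other than
 * collect/split, plus one use per shader output.
 */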
static void
update_use_count(struct ir3 *ir)
{
    foreach_block (block, &ir->block_list) {
        foreach_instr (instr, &block->instr_list) {
            instr->use_count = 0;
        }
    }

    foreach_block (block, &ir->block_list) {
        foreach_instr (instr, &block->instr_list) {
            if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
                continue;

            use_each_src(instr);
        }
    }

    /* Shader outputs are also used:
     */
    struct ir3_instruction *out;
    foreach_output(out, ir)
        use_instr(out);
}

#define NULL_INSTR ((void *)~0)

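/* Flush the find_instr_recursive() cache (kept in instr->data): drop any
 * entry pointing at 'instr' and any cached NULL_INSTR ("no candidate")
 * marker, or simply everything when instr is NULL.
 */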
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
    foreach_instr (instr2, &ctx->depth_list) {
        if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
            instr2->data = NULL;
    }
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
    debug_assert(ctx->block == instr->block);

    /* remove from depth list:
     */
    list_delinit(&instr->node);

    if (writes_addr(instr)) {
        debug_assert(ctx->addr == NULL);
        ctx->addr = instr;
    }

    if (writes_pred(instr)) {
        debug_assert(ctx->pred == NULL);
        ctx->pred = instr;
    }

    instr->flags |= IR3_INSTR_MARK;

    di(instr, "schedule");

    list_addtail(&instr->node, &instr->block->instr_list);
    ctx->scheduled = instr;

    update_live_values(ctx, instr);

    if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
        clear_cache(ctx, NULL);
    } else {
        /* invalidate only the necessary entries.. */
        clear_cache(ctx, instr);
    }
}

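/* Pick the deepest of the remaining candidate srcs, clear its slot in the
 * array (so repeated calls return srcs in non-increasing depth order),
 * and return it; returns NULL once all entries have been consumed.
 */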
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
    struct ir3_instruction *d = NULL;
    unsigned i = 0, id = 0;

    while ((i < nsrcs) && !(d = srcs[id = i]))
        i++;

    if (!d)
        return NULL;

    for (; i < nsrcs; i++)
        if (srcs[i] && (srcs[i]->depth > d->depth))
            d = srcs[id = i];

    srcs[id] = NULL;

    return d;
}

struct ir3_sched_notes {
    /* there is at least one kill which could be scheduled, except
     * for unscheduled bary.f's:
     */
    bool blocked_kill;
    /* there is at least one instruction that could be scheduled,
     * except for conflicting address/predicate register usage:
     */
    bool addr_conflict, pred_conflict;
};

/* could an instruction be scheduled if the specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
    struct ir3_instruction *other_src;
    foreach_ssa_src(other_src, instr) {
        /* if dependency not scheduled, we aren't ready yet: */
        if ((src != other_src) && !is_scheduled(other_src)) {
            return false;
        }
    }
    return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
        struct ir3_instruction *instr)
{
    debug_assert(!is_scheduled(instr));

    /* For instructions that write the address register, we need to
     * make sure there is at least one instruction using the addr
     * value which is otherwise ready to schedule.
     *
     * TODO if any instructions use the pred register and have other
     * src args, we would need to do the same for writes_pred()..
     */
    if (writes_addr(instr)) {
        struct ir3 *ir = instr->block->shader;
        bool ready = false;
        for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
            struct ir3_instruction *indirect = ir->indirects[i];
            if (!indirect)
                continue;
            if (indirect->address != instr)
                continue;
            ready = could_sched(indirect, instr);
        }

        /* nothing could be scheduled, so keep looking: */
        if (!ready)
            return false;
    }

    /* if this is a write to the address/predicate register, and that
     * register is currently in use, we need to defer until it is
     * free:
     */
    if (writes_addr(instr) && ctx->addr) {
        debug_assert(ctx->addr != instr);
        notes->addr_conflict = true;
        return false;
    }

    if (writes_pred(instr) && ctx->pred) {
        debug_assert(ctx->pred != instr);
        notes->pred_conflict = true;
        return false;
    }

    /* if the instruction is a kill, we need to ensure *every*
     * bary.f is scheduled.  The hw seems unhappy if the thread
     * gets killed before the end-input (ei) flag is hit.
     *
     * We could do this by adding each bary.f instruction as a
     * virtual ssa src of the kill instruction.  But we have
     * fixed length instr->regs[].
     *
     * TODO this wouldn't be quite right if we had multiple
     * basic blocks and any block was conditional.  We'd need
     * to schedule the bary.f's outside of any conditional block
     * which contained a kill.. I think..
     */
    if (is_kill(instr)) {
        struct ir3 *ir = instr->block->shader;

        for (unsigned i = 0; i < ir->baryfs_count; i++) {
            struct ir3_instruction *baryf = ir->baryfs[i];
            if (baryf->flags & IR3_INSTR_UNUSED)
                continue;
            if (!is_scheduled(baryf)) {
                notes->blocked_kill = true;
                return false;
            }
        }
    }

    return true;
}

/* Find the best instruction to schedule from the specified instruction,
 * or recursively from its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
        struct ir3_instruction *instr)
{
    struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
    struct ir3_instruction *src;
    unsigned nsrcs = 0;

    if (is_scheduled(instr))
        return NULL;

    /* use instr->data to cache the results of recursing up the
     * instr src's.  Otherwise the recursive algo can scale quite
     * badly w/ shader size.  But this takes some care to clear
     * the cache appropriately when instructions are scheduled.
     */
    if (instr->data) {
        if (instr->data == NULL_INSTR)
            return NULL;
        return instr->data;
    }

    /* find unscheduled srcs: */
    foreach_ssa_src(src, instr) {
        if (!is_scheduled(src) && (src->block == instr->block)) {
            debug_assert(nsrcs < ARRAY_SIZE(srcs));
            srcs[nsrcs++] = src;
        }
    }

    /* if all our src's are already scheduled: */
    if (nsrcs == 0) {
        if (check_instr(ctx, notes, instr)) {
            instr->data = instr;
            return instr;
        }
        return NULL;
    }

    while ((src = deepest(srcs, nsrcs))) {
        struct ir3_instruction *candidate;

        candidate = find_instr_recursive(ctx, notes, src);
        if (!candidate)
            continue;

        if (check_instr(ctx, notes, candidate)) {
            instr->data = candidate;
            return candidate;
        }
    }

    instr->data = NULL_INSTR;
    return NULL;
}

/* find net change to live values if instruction were scheduled: */
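/* For example, a mov whose single src has use_count == 1 frees as many
 * regs as it defines (net zero), while an instruction whose srcs are all
 * still used elsewhere adds dest_regs() new live values.
 */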
static int
live_effect(struct ir3_instruction *instr)
{
    struct ir3_instruction *src;
    int new_live = dest_regs(instr);
    int old_live = 0;

    foreach_ssa_src_n(src, n, instr) {
        if (__is_false_dep(instr, n))
            continue;

        if (instr->block != src->block)
            continue;

        /* for split, just pass things along to the real src: */
        if (src->opc == OPC_META_SPLIT)
            src = ssa(src->regs[1]);

        /* for collect, if this is the last use of *each* src,
         * then it will decrease the live values, since RA treats
         * them as a whole:
         */
        if (src->opc == OPC_META_COLLECT) {
            struct ir3_instruction *src2;
            bool last_use = true;

            foreach_ssa_src(src2, src) {
                if (src2->use_count > 1) {
                    last_use = false;
                    break;
                }
            }

            if (last_use)
                old_live += dest_regs(src);

        } else {
            debug_assert(src->use_count > 0);

            if (src->use_count == 1) {
                old_live += dest_regs(src);
            }
        }
    }

    return new_live - old_live;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
        bool soft)
{
    struct ir3_instruction *best_instr = NULL;
    int best_rank = INT_MAX; /* lower is better */
    unsigned deepest = 0;

    /* TODO we'd really rather use the list/array of block outputs.  But we
     * don't have such a thing.  Recursing *every* instruction in the list
     * will result in a lot of repeated traversal, since instructions will
     * get traversed both when they appear as ssa src to a later instruction
     * as well as where they appear in the depth_list.
     */
    foreach_instr_rev (instr, &ctx->depth_list) {
        struct ir3_instruction *candidate;

        candidate = find_instr_recursive(ctx, notes, instr);
        if (!candidate)
            continue;

        if (is_meta(candidate))
            return candidate;

        deepest = MAX2(deepest, candidate->depth);
    }

    /* traverse the list a second time.. but since we cache the result of
     * find_instr_recursive() it isn't as bad as it looks.
     */
    foreach_instr_rev (instr, &ctx->depth_list) {
        struct ir3_instruction *candidate;

        candidate = find_instr_recursive(ctx, notes, instr);
        if (!candidate)
            continue;

        /* determine net change to # of live values: */
        int le = live_effect(candidate);
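        /* weight full-precision values double, since each full reg is
         * roughly two half-regs worth of register file space:
         */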
        unsigned live_values = (2 * ctx->live_values) + ctx->half_live_values;

        /* if there is a net increase in # of live values, then apply some
         * threshold to avoid instructions getting scheduled *too* early
         * and increasing register pressure.
         */
        if (le >= 1) {
            unsigned threshold;

            if (live_values > ctx->live_threshold_lo) {
                threshold = ctx->depth_threshold_lo;
            } else {
                threshold = ctx->depth_threshold_hi;
            }

            /* Filter out any "shallow" instructions which would otherwise
             * tend to get scheduled too early to fill delay slots even
             * when they are not needed for a while.  There will probably
             * be later delay slots that they could just as easily fill.
             *
             * A classic case where this comes up is frag shaders that
             * write a constant value (like 1.0f) to one of the channels
             * of the output color(s).  Since the mov from immed has no
             * dependencies, it would otherwise get scheduled early to
             * fill delay slots, occupying a register until the end of
             * the program.
             */
            if ((deepest - candidate->depth) > threshold)
                continue;
        }

        int rank = ir3_delay_calc(ctx->block, candidate, soft, false);

        /* if too many live values, prioritize instructions that reduce the
         * number of live values:
         */
        if (live_values > ctx->live_threshold_hi) {
            rank = le;
        } else if (live_values > ctx->live_threshold_lo) {
            rank += le;
        }

        if (rank < best_rank) {
            best_instr = candidate;
            best_rank = rank;
        }
    }

    return best_instr;
}

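/* Clone 'orig_instr' (which must already be scheduled), add the clone to
 * the depth list and transfer a use-count to it; used by split_addr() and
 * split_pred() to "spill" the address/predicate register.
 */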
static struct ir3_instruction *
split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
{
    struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
    ir3_insert_by_depth(new_instr, &ctx->depth_list);
    transfer_use(ctx, orig_instr, new_instr);
    return new_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
    struct ir3 *ir;
    struct ir3_instruction *new_addr = NULL;
    unsigned i;

    debug_assert(ctx->addr);

    ir = ctx->addr->block->shader;

    for (i = 0; i < ir->indirects_count; i++) {
        struct ir3_instruction *indirect = ir->indirects[i];

        if (!indirect)
            continue;

        /* skip instructions already scheduled: */
        if (is_scheduled(indirect))
            continue;

        /* remap remaining instructions using current addr
         * to new addr:
         */
        if (indirect->address == ctx->addr) {
            if (!new_addr) {
                new_addr = split_instr(ctx, ctx->addr);
                /* original addr is scheduled, but new one isn't: */
                new_addr->flags &= ~IR3_INSTR_MARK;
            }
            indirect->address = NULL;
            ir3_instr_set_address(indirect, new_addr);
        }
    }

    /* all remaining indirects remapped to new addr: */
    ctx->addr = NULL;

    return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
    struct ir3 *ir;
    struct ir3_instruction *new_pred = NULL;
    unsigned i;

    debug_assert(ctx->pred);

    ir = ctx->pred->block->shader;

    for (i = 0; i < ir->predicates_count; i++) {
        struct ir3_instruction *predicated = ir->predicates[i];

        /* skip instructions already scheduled: */
        if (is_scheduled(predicated))
            continue;

        /* remap remaining instructions using current pred
         * to new pred:
         *
         * TODO is there ever a case when pred isn't first
         * (and only) src?
         */
        if (ssa(predicated->regs[1]) == ctx->pred) {
            if (!new_pred) {
                new_pred = split_instr(ctx, ctx->pred);
                /* original pred is scheduled, but new one isn't: */
                new_pred->flags &= ~IR3_INSTR_MARK;
            }
            predicated->regs[1]->instr = new_pred;
        }
    }

    /* all remaining predicated remapped to new pred: */
    ctx->pred = NULL;

    return new_pred;
}

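/* Schedule a single basic block: first the meta:input and tex-prefetch
 * instructions, then repeatedly pick the best eligible instruction
 * (inserting any needed nop's), splitting the addr/pred register writer
 * to break the deadlock when nothing can be scheduled.
 */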
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
    struct list_head unscheduled_list;

    ctx->block = block;

    /* addr/pred writes are per-block: */
    ctx->addr = NULL;
    ctx->pred = NULL;

    /* move all instructions to the unscheduled list, and
     * empty the block's instruction list (to which we will
     * be inserting).
     */
    list_replace(&block->instr_list, &unscheduled_list);
    list_inithead(&block->instr_list);
    list_inithead(&ctx->depth_list);

    /* First schedule all meta:input instructions, followed by
     * tex-prefetch.  We want all of the instructions that load
     * values into registers before the shader starts executing
     * to come before any other instructions, but in particular
     * we want inputs to come before prefetches.  This is because
     * a FS's bary_ij input may not actually be live in the
     * shader, but it should not be scheduled on top of any
     * other input (but can be overwritten by a tex prefetch).
     *
     * Finally, move all the remaining instructions to the
     * depth-list.
     */
    foreach_instr_safe (instr, &unscheduled_list)
        if (instr->opc == OPC_META_INPUT)
            schedule(ctx, instr);

    foreach_instr_safe (instr, &unscheduled_list)
        if (instr->opc == OPC_META_TEX_PREFETCH)
            schedule(ctx, instr);

    foreach_instr_safe (instr, &unscheduled_list)
        ir3_insert_by_depth(instr, &ctx->depth_list);

    while (!list_is_empty(&ctx->depth_list)) {
        struct ir3_sched_notes notes = {0};
        struct ir3_instruction *instr;

        instr = find_eligible_instr(ctx, &notes, true);
        if (!instr)
            instr = find_eligible_instr(ctx, &notes, false);

        if (instr) {
            unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
            d("delay=%u", delay);

            /* and if we run out of instructions that can be scheduled,
             * then it is time for nop's:
             */
            debug_assert(delay <= 6);
            while (delay > 0) {
                ir3_NOP(block);
                delay--;
            }

            schedule(ctx, instr);
        } else {
            struct ir3_instruction *new_instr = NULL;

            /* nothing available to schedule.. if we are blocked on
             * address/predicate register conflict, then break the
             * deadlock by cloning the instruction that wrote that
             * reg:
             */
            if (notes.addr_conflict) {
                new_instr = split_addr(ctx);
            } else if (notes.pred_conflict) {
                new_instr = split_pred(ctx);
            } else {
                debug_assert(0);
                ctx->error = true;
                return;
            }

            if (new_instr) {
                /* clearing current addr/pred can change what is
                 * available to schedule, so clear cache..
                 */
                clear_cache(ctx, NULL);

                ir3_insert_by_depth(new_instr, &ctx->depth_list);
                /* the original instr that wrote addr/pred may have
                 * originated from a different block:
                 */
                new_instr->block = block;
            }
        }
    }
}

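/* Select the live-value and depth thresholds used by find_eligible_instr(),
 * depending on whether the shader has any latency to hide (ie. results it
 * must wait on, such as texture/memory loads).
 */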
static void
setup_thresholds(struct ir3_sched_ctx *ctx, struct ir3 *ir)
{
    if (ir3_has_latency_to_hide(ir)) {
        ctx->live_threshold_hi = 2 * 16 * 4;
        ctx->live_threshold_lo = 2 * 4 * 4;
        ctx->depth_threshold_hi = 6;
        ctx->depth_threshold_lo = 4;
    } else {
        ctx->live_threshold_hi = 2 * 16 * 4;
        ctx->live_threshold_lo = 2 * 12 * 4;
        ctx->depth_threshold_hi = 16;
        ctx->depth_threshold_lo = 16;
    }
}

int ir3_sched(struct ir3 *ir)
{
    struct ir3_sched_ctx ctx = {0};

    setup_thresholds(&ctx, ir);

    ir3_clear_mark(ir);
    update_use_count(ir);

    foreach_block (block, &ir->block_list) {
        ctx.live_values = 0;
        ctx.half_live_values = 0;
        sched_block(&ctx, block);
    }

    if (ctx.error)
        return -1;

    return 0;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
    /* The expectation is that there is only a single array
     * src or dst; ir3_cp should enforce this.
     */

    for (unsigned i = 0; i < instr->regs_count; i++)
        if (instr->regs[i]->flags & IR3_REG_ARRAY)
            return instr->regs[i]->array.id;

    unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
    /* TODO for dependencies that are related to a specific object, ie
     * a specific SSBO/image/array, we could relax this constraint to
     * make accesses to unrelated objects not depend on each other (at
     * least as long as not declared coherent)
     */
    if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
            ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
        return true;

    if (instr->barrier_class & prior->barrier_conflict) {
        if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
            /* if only array barrier, then we can further limit false-deps
             * by considering the array-id, ie reads/writes to different
             * arrays do not depend on each other (no aliasing)
             */
            if (get_array_id(instr) != get_array_id(prior)) {
                return false;
            }
        }

        return true;
    }

    return false;
}

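/* Add false-dependencies between 'instr' and any prior instructions it
 * must be ordered after, and between following instructions and 'instr'.
 * The scan in each direction stops at the first instruction with an
 * identical barrier_class.
 */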
static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
    struct list_head *prev = instr->node.prev;
    struct list_head *next = instr->node.next;

    /* add dependencies on previous instructions that must be scheduled
     * prior to the current instruction
     */
    while (prev != &block->instr_list) {
        struct ir3_instruction *pi =
                LIST_ENTRY(struct ir3_instruction, prev, node);

        prev = prev->prev;

        if (is_meta(pi))
            continue;

        if (instr->barrier_class == pi->barrier_class) {
            ir3_instr_add_dep(instr, pi);
            break;
        }

        if (depends_on(instr, pi))
            ir3_instr_add_dep(instr, pi);
    }

    /* add dependencies on this instruction to following instructions
     * that must be scheduled after the current instruction:
     */
    while (next != &block->instr_list) {
        struct ir3_instruction *ni =
                LIST_ENTRY(struct ir3_instruction, next, node);

        next = next->next;

        if (is_meta(ni))
            continue;

        if (instr->barrier_class == ni->barrier_class) {
            ir3_instr_add_dep(ni, instr);
            break;
        }

        if (depends_on(ni, instr))
            ir3_instr_add_dep(ni, instr);
    }
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
    foreach_instr (instr, &block->instr_list) {
        if (instr->barrier_class) {
            add_barrier_deps(block, instr);
        }
    }
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
    foreach_block (block, &ir->block_list) {
        calculate_deps(block);
    }
}