freedreno/ir3/sched: avoid scheduling outputs
[mesa.git] / src / freedreno / ir3 / ir3_sched.c
1 /*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27
28 #include "util/dag.h"
29 #include "util/u_math.h"
30
31 #include "ir3.h"
32 #include "ir3_compiler.h"
33
34 #ifdef DEBUG
35 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
36 #else
37 #define SCHED_DEBUG 0
38 #endif
39 #define d(fmt, ...) do { if (SCHED_DEBUG) { \
40 printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
41 } } while (0)
42
43 #define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
44 printf("SCHED: "fmt": ", ##__VA_ARGS__); \
45 ir3_print_instr(instr); \
46 } } while (0)
47
48 /*
49 * Instruction Scheduling:
50 *
51 * A block-level pre-RA scheduler, which works by creating a DAG of
52 * instruction dependencies, and heuristically picking a DAG head
53 * (instruction with no unscheduled dependencies).
54 *
55 * Where possible, it tries to pick instructions that avoid nop delay
56 * slots, but it will prefer to pick instructions that reduce (or do
57 * not increase) the number of live values.
58 *
59 * If the only possible choices are instructions that increase the
60 * number of live values, it will try to pick the one with the earliest
61 * consumer (based on pre-sched program order).
62 *
63 * There are a few special cases that need to be handled, since sched
64 * is currently independent of register allocation. Usages of address
65 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
66 * if you have two pairs of instructions that write the same special
67 * register and then read it, then those pairs cannot be interleaved.
68 * To solve this, when we are in such a scheduling "critical section",
69 * and we encounter a conflicting write to a special register, we try
70 * to schedule any remaining instructions that use that value first.
71 *
72 * TODO we can detect too-large live_values here.. would be a good place
73 * to "spill" cheap things, like move from uniform/immed. (Constructing
74 * list of ssa def consumers before sched pass would make this easier.)
75 * Also, in general it might be best not to re-use load_immed
76 * across blocks.
77 *
78 * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
79 * the # of immediates in play (or at least that would help with
80 * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
81 * do this in a nir pass that inserts fneg/etc? The cp pass should fold
82 * these into src modifiers..
83 */
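/* Illustrative example of the ordering above (not part of the pass): if two
 * instructions are ready, X being the last use of its single source (net
 * live_effect of -1) and Y defining a new value while all of its sources
 * stay live (net live_effect of +1), then X is taken even if Y has the
 * larger max_delay -- choose_instr_dec() returns X, and choose_instr_inc()
 * is only consulted once no pressure-reducing or pressure-neutral
 * candidate remains.
 */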
84
85 struct ir3_sched_ctx {
86 struct ir3_block *block; /* the current block */
87 struct dag *dag;
88
89 struct list_head unscheduled_list; /* unscheduled instructions */
90 struct ir3_instruction *scheduled; /* last scheduled instr */
91 struct ir3_instruction *addr0; /* current a0.x user, if any */
92 struct ir3_instruction *addr1; /* current a1.x user, if any */
93 struct ir3_instruction *pred; /* current p0.x user, if any */
94
95 int remaining_kills;
96
97 bool error;
98 };
99
100 struct ir3_sched_node {
101 struct dag_node dag; /* must be first for util_dynarray_foreach */
102 struct ir3_instruction *instr;
103
104 unsigned delay;
105 unsigned max_delay;
106
107 /* For instructions that are a meta:collect src, once we schedule
108 * the first src of the collect, the entire vecN is live (at least
109 * from the PoV of the first RA pass.. the 2nd scalar pass can fill
110 * in some of the gaps, but often not all). So we want to help out
111 * RA, and realize that as soon as we schedule the first collect
112 * src, there is no penalty to schedule the remainder (ie. they
113 * don't make additional values live). In fact we'd prefer to
114 * schedule the rest ASAP to minimize the live range of the vecN.
115 *
116 * For instructions that are the src of a collect, we track the
117 * corresponding collect, and mark them as partially live as soon
118 * as any one of the src's is scheduled.
119 */
120 struct ir3_instruction *collect;
121 bool partially_live;
122
123 /* Is this instruction a direct or indirect dependency for a kill?
124 * If so, we should prioritize it when possible
125 */
126 bool kill_path;
127
128 /* This node represents a shader output. A semi-common pattern in
129 * shaders is something along the lines of:
130 *
131 * fragcolor.w = 1.0
132 *
133 * Which we'd prefer to schedule as late as possible, since it
134 * produces a live value that is never killed/consumed. So detect
135 * outputs up-front, and avoid scheduling them unless they reduce
136 * register pressure (or at least are neutral).
137 */
138 bool output;
139 };
140
141 #define foreach_sched_node(__n, __list) \
142 list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link)
143
144 static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
145 static void sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i);
146
147 static bool is_scheduled(struct ir3_instruction *instr)
148 {
149 return !!(instr->flags & IR3_INSTR_MARK);
150 }
151
152 static void
153 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
154 {
155 debug_assert(ctx->block == instr->block);
156
157 /* remove from depth list:
158 */
159 list_delinit(&instr->node);
160
161 if (writes_addr0(instr)) {
162 debug_assert(ctx->addr0 == NULL);
163 ctx->addr0 = instr;
164 }
165
166 if (writes_addr1(instr)) {
167 debug_assert(ctx->addr1 == NULL);
168 ctx->addr1 = instr;
169 }
170
171 if (writes_pred(instr)) {
172 debug_assert(ctx->pred == NULL);
173 ctx->pred = instr;
174 }
175
176 instr->flags |= IR3_INSTR_MARK;
177
178 di(instr, "schedule");
179
180 list_addtail(&instr->node, &instr->block->instr_list);
181 ctx->scheduled = instr;
182
183 if (is_kill(instr)) {
184 ctx->remaining_kills--;
185 }
186
187 struct ir3_sched_node *n = instr->data;
188
189 /* If this instruction is a meta:collect src, mark the remaining
190 * collect srcs as partially live.
191 */
192 if (n->collect) {
193 struct ir3_instruction *src;
194 foreach_ssa_src (src, n->collect) {
195 if (src->block != instr->block)
196 continue;
197 struct ir3_sched_node *sn = src->data;
198 sn->partially_live = true;
199 }
200 }
201
202 dag_prune_head(ctx->dag, &n->dag);
203 }
204
205 struct ir3_sched_notes {
206 /* there is at least one kill which could be scheduled, except
207 * for unscheduled bary.f's:
208 */
209 bool blocked_kill;
210 /* there is at least one instruction that could be scheduled,
211 * except for conflicting address/predicate register usage:
212 */
213 bool addr0_conflict, addr1_conflict, pred_conflict;
214 };
215
216 /* could an instruction be scheduled if specified ssa src was scheduled? */
217 static bool
218 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
219 {
220 struct ir3_instruction *other_src;
221 foreach_ssa_src (other_src, instr) {
222 /* if dependency not scheduled, we aren't ready yet: */
223 if ((src != other_src) && !is_scheduled(other_src)) {
224 return false;
225 }
226 }
227 return true;
228 }
229
230 /* Check if instruction is ok to schedule. Make sure it is not blocked
231 * by use of addr/predicate register, etc.
232 */
233 static bool
234 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
235 struct ir3_instruction *instr)
236 {
237 debug_assert(!is_scheduled(instr));
238
239 if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
240 /* avoid texture/memory access if we have unscheduled kills
241 * that could make the expensive operation unnecessary. By
242 * definition, if there are remaining kills, and this instr
243 * is not a dependency of a kill, there are other instructions
244 * that we can choose from.
245 */
246 struct ir3_sched_node *n = instr->data;
247 if (!n->kill_path)
248 return false;
249 }
250
251 /* For instructions that write the address register we need to
252 * make sure there is at least one instruction that uses the
253 * addr value which is otherwise ready.
254 *
255 * NOTE if any instructions use pred register and have other
256 * src args, we would need to do the same for writes_pred()..
257 */
258 if (writes_addr0(instr)) {
259 struct ir3 *ir = instr->block->shader;
260 bool ready = false;
261 for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
262 struct ir3_instruction *indirect = ir->a0_users[i];
263 if (!indirect)
264 continue;
265 if (indirect->address != instr)
266 continue;
267 ready = could_sched(indirect, instr);
268 }
269
270 /* nothing could be scheduled, so keep looking: */
271 if (!ready)
272 return false;
273 }
274
275 if (writes_addr1(instr)) {
276 struct ir3 *ir = instr->block->shader;
277 bool ready = false;
278 for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
279 struct ir3_instruction *indirect = ir->a1_users[i];
280 if (!indirect)
281 continue;
282 if (indirect->address != instr)
283 continue;
284 ready = could_sched(indirect, instr);
285 }
286
287 /* nothing could be scheduled, so keep looking: */
288 if (!ready)
289 return false;
290 }
291
292 /* if this is a write to address/predicate register, and that
293 * register is currently in use, we need to defer until it is
294 * free:
295 */
296 if (writes_addr0(instr) && ctx->addr0) {
297 debug_assert(ctx->addr0 != instr);
298 notes->addr0_conflict = true;
299 return false;
300 }
301
302 if (writes_addr1(instr) && ctx->addr1) {
303 debug_assert(ctx->addr1 != instr);
304 notes->addr1_conflict = true;
305 return false;
306 }
307
308 if (writes_pred(instr) && ctx->pred) {
309 debug_assert(ctx->pred != instr);
310 notes->pred_conflict = true;
311 return false;
312 }
313
314 /* if the instruction is a kill, we need to ensure *every*
315 * bary.f is scheduled. The hw seems unhappy if the thread
316 * gets killed before the end-input (ei) flag is hit.
317 *
318 * We could do this by adding each bary.f instruction as
319 * virtual ssa src for the kill instruction. But we have
320 * fixed length instr->regs[].
321 *
322 * TODO we could handle this by false-deps now, probably.
323 */
324 if (is_kill(instr)) {
325 struct ir3 *ir = instr->block->shader;
326
327 for (unsigned i = 0; i < ir->baryfs_count; i++) {
328 struct ir3_instruction *baryf = ir->baryfs[i];
329 if (baryf->flags & IR3_INSTR_UNUSED)
330 continue;
331 if (!is_scheduled(baryf)) {
332 notes->blocked_kill = true;
333 return false;
334 }
335 }
336 }
337
338 return true;
339 }
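/* Illustrative example of the addr0 readiness rule above: if `instr` writes
 * a0.x but its only indirect user still has another unscheduled source,
 * could_sched() fails for that user, so check_instr() rejects `instr` for
 * now.  This avoids tying up a0.x (and triggering addr0_conflict later)
 * before any indirect access could actually be scheduled.
 */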
340
341 /* Find the instr->ip of the closest use of an instruction, in
342 * pre-sched order. This isn't going to be the same as post-sched
343 * order, but it is a reasonable approximation to limit scheduling
344 * instructions *too* early. This is mostly to prevent bad behavior
345 * in cases where we have a large number of possible instructions
346 * to choose, to avoid creating too much parallelism (ie. blowing
347 * up register pressure)
348 *
349 * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
350 */
351 static int
352 nearest_use(struct ir3_instruction *instr)
353 {
354 unsigned nearest = ~0;
355 foreach_ssa_use (use, instr)
356 if (!is_scheduled(use))
357 nearest = MIN2(nearest, use->ip);
358
359 /* slight hack.. this heuristic tends to push bary.f's to later
360 * in the shader, closer to their uses. But we actually would
361 * prefer to get these scheduled earlier, to unlock varying
362 * storage for more VS jobs:
363 */
364 if (is_input(instr))
365 nearest /= 2;
366
367 return nearest;
368 }
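/* Worked example for the input tweak above (illustrative numbers): a bary.f
 * whose nearest unscheduled use is at ip 40 scores 40 / 2 = 20, so
 * choose_instr_inc() prefers it over a non-input whose nearest use is at
 * ip 30, even though the latter's consumer comes earlier in pre-sched
 * order.
 */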
369
370 static int
371 use_count(struct ir3_instruction *instr)
372 {
373 unsigned cnt = 0;
374 foreach_ssa_use (use, instr)
375 if (!is_scheduled(use))
376 cnt++;
377 return cnt;
378 }
379
380 /* find net change to live values if instruction were scheduled: */
381 static int
382 live_effect(struct ir3_instruction *instr)
383 {
384 struct ir3_sched_node *n = instr->data;
385 struct ir3_instruction *src;
386 int new_live = n->partially_live ? 0 : dest_regs(instr);
387 int freed_live = 0;
388
389 /* if we schedule something that causes a vecN to be live,
390 * then count all its other components too:
391 */
392 if (n->collect)
393 new_live *= n->collect->regs_count - 1;
394
395 foreach_ssa_src_n (src, n, instr) {
396 if (__is_false_dep(instr, n))
397 continue;
398
399 if (instr->block != src->block)
400 continue;
401
402 if (use_count(src) == 1)
403 freed_live += dest_regs(src);
404 }
405
406 return new_live - freed_live;
407 }
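/* Worked examples (illustrative): an alu op with a single-register dest
 * that is also the last use of its two single-register sources scores
 * 1 - 2 = -1, i.e. pressure drops.  The first scheduled source of a vec4
 * meta:collect scores 1 * (regs_count - 1) = 4 (ignoring any of its own
 * sources that die), since the whole vecN becomes live from RA's point of
 * view; the remaining collect sources are then partially_live and score 0,
 * which is why they tend to get scheduled right behind the first one.
 */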
408
409 static struct ir3_sched_node * choose_instr_inc(struct ir3_sched_ctx *ctx,
410 struct ir3_sched_notes *notes, bool avoid_output);
411
412 /**
413 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
414 * Scheduling for Register pressure) heuristic.
415 *
416 * Only handles the case of choosing instructions that reduce register pressure
417 * or are even.
418 */
419 static struct ir3_sched_node *
420 choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
421 {
422 struct ir3_sched_node *chosen = NULL;
423
424 /* Find a ready inst with regs freed and pick the one with max cost. */
425 foreach_sched_node (n, &ctx->dag->heads) {
426 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
427
428 if (d > 0)
429 continue;
430
431 if (live_effect(n->instr) > -1)
432 continue;
433
434 if (!check_instr(ctx, notes, n->instr))
435 continue;
436
437 if (!chosen || chosen->max_delay < n->max_delay) {
438 chosen = n;
439 }
440 }
441
442 if (chosen) {
443 di(chosen->instr, "dec: chose (freed+ready)");
444 return chosen;
445 }
446
447 /* Find a leader with regs freed and pick the one with max cost. */
448 foreach_sched_node (n, &ctx->dag->heads) {
449 if (live_effect(n->instr) > -1)
450 continue;
451
452 if (!check_instr(ctx, notes, n->instr))
453 continue;
454
455 if (!chosen || chosen->max_delay < n->max_delay) {
456 chosen = n;
457 }
458 }
459
460 if (chosen) {
461 di(chosen->instr, "dec: chose (freed)");
462 return chosen;
463 }
464
465 /* Contra the paper, pick a leader with no effect on used regs. This may
466 * open up new opportunities, as otherwise a single-operand instr consuming
467 * a value will tend to block finding the freeing of that value. This had a
468 * massive effect on reducing spilling on V3D.
469 *
470 * XXX: Should this prioritize ready?
471 */
472 foreach_sched_node (n, &ctx->dag->heads) {
473 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
474
475 if (d > 0)
476 continue;
477
478 if (live_effect(n->instr) > 0)
479 continue;
480
481 if (!check_instr(ctx, notes, n->instr))
482 continue;
483
484 if (!chosen || chosen->max_delay < n->max_delay)
485 chosen = n;
486 }
487
488 if (chosen) {
489 di(chosen->instr, "dec: chose (neutral+ready)");
490 return chosen;
491 }
492
493 foreach_sched_node (n, &ctx->dag->heads) {
494 if (live_effect(n->instr) > 0)
495 continue;
496
497 if (!check_instr(ctx, notes, n->instr))
498 continue;
499
500 if (!chosen || chosen->max_delay < n->max_delay)
501 chosen = n;
502 }
503
504 if (chosen) {
505 di(chosen->instr, "dec: chose (neutral)");
506 return chosen;
507 }
508
509 return choose_instr_inc(ctx, notes, true);
510 }
511
512 /**
513 * When we can't choose an instruction that reduces register pressure or
514 * is neutral, we end up here to try and pick the least bad option.
515 */
516 static struct ir3_sched_node *
517 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
518 bool avoid_output)
519 {
520 struct ir3_sched_node *chosen = NULL;
521
522 /*
523 * From here on out, we are picking something that increases
524 * register pressure. So try to pick something which will
525 * be consumed soon:
526 */
527 unsigned chosen_distance = 0;
528
529 /* Among the ready set, pick the one whose result is consumed soonest. */
530 foreach_sched_node (n, &ctx->dag->heads) {
531 if (avoid_output && n->output)
532 continue;
533
534 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
535
536 if (d > 0)
537 continue;
538
539 if (!check_instr(ctx, notes, n->instr))
540 continue;
541
542 unsigned distance = nearest_use(n->instr);
543
544 if (!chosen || distance < chosen_distance) {
545 chosen = n;
546 chosen_distance = distance;
547 }
548 }
549
550 if (chosen) {
551 di(chosen->instr, "inc: chose (distance+ready)");
552 return chosen;
553 }
554
555 /* Among the remaining leaders, pick the one whose result is consumed soonest. */
556 foreach_sched_node (n, &ctx->dag->heads) {
557 if (avoid_output && n->output)
558 continue;
559
560 if (!check_instr(ctx, notes, n->instr))
561 continue;
562
563 unsigned distance = nearest_use(n->instr);
564
565 if (!chosen || distance < chosen_distance) {
566 chosen = n;
567 chosen_distance = distance;
568 }
569 }
570
571 if (chosen) {
572 di(chosen->instr, "inc: chose (distance)");
573 return chosen;
574 }
575
576 return NULL;
577 }
578
579 /* Handles instruction selection for instructions we want to prioritize
580 * even if csp/csr would not pick them.
581 */
582 static struct ir3_sched_node *
583 choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
584 {
585 struct ir3_sched_node *chosen = NULL;
586
587 foreach_sched_node (n, &ctx->dag->heads) {
588 if (!is_meta(n->instr))
589 continue;
590
591 if (!chosen || (chosen->max_delay < n->max_delay))
592 chosen = n;
593 }
594
595 if (chosen) {
596 di(chosen->instr, "prio: chose (meta)");
597 return chosen;
598 }
599
600 return NULL;
601 }
602
603 static void
604 dump_state(struct ir3_sched_ctx *ctx)
605 {
606 if (!SCHED_DEBUG)
607 return;
608
609 foreach_sched_node (n, &ctx->dag->heads) {
610 di(n->instr, "maxdel=%3d le=%d del=%u ",
611 n->max_delay, live_effect(n->instr),
612 ir3_delay_calc(ctx->block, n->instr, false, false));
613
614 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
615 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
616
617 di(child->instr, " -> (%d parents) ", child->dag.parent_count);
618 }
619 }
620 }
621
622 /* find instruction to schedule: */
623 static struct ir3_instruction *
624 choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
625 {
626 struct ir3_sched_node *chosen;
627
628 dump_state(ctx);
629
630 chosen = choose_instr_prio(ctx, notes);
631 if (chosen)
632 return chosen->instr;
633
634 chosen = choose_instr_dec(ctx, notes);
635 if (chosen)
636 return chosen->instr;
637
638 chosen = choose_instr_inc(ctx, notes, false);
639 if (chosen)
640 return chosen->instr;
641
642 return NULL;
643 }
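/* Illustrative trace of the full cascade: choose_instr_prio() takes any
 * ready meta instruction first; choose_instr_dec() then looks for a pick
 * that frees registers or is at least neutral, falling back to
 * choose_instr_inc(avoid_output=true), which skips output-only nodes; only
 * the final choose_instr_inc(avoid_output=false) call will take them, so a
 * `fragcolor.w = 1.0` style mov is scheduled as late as possible.
 */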
644
645 static struct ir3_instruction *
646 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
647 {
648 struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
649 di(new_instr, "split instruction");
650 sched_node_init(ctx, new_instr);
651 return new_instr;
652 }
653
654 /* "spill" the address registers by remapping any unscheduled
655 * instructions which depend on the current address register
656 * to a clone of the instruction which wrote the address reg.
657 */
658 static struct ir3_instruction *
659 split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
660 struct ir3_instruction **users, unsigned users_count)
661 {
662 struct ir3_instruction *new_addr = NULL;
663 unsigned i;
664
665 debug_assert(*addr);
666
667 for (i = 0; i < users_count; i++) {
668 struct ir3_instruction *indirect = users[i];
669
670 if (!indirect)
671 continue;
672
673 /* skip instructions already scheduled: */
674 if (is_scheduled(indirect))
675 continue;
676
677 /* remap remaining instructions using current addr
678 * to new addr:
679 */
680 if (indirect->address == *addr) {
681 if (!new_addr) {
682 new_addr = split_instr(ctx, *addr);
683 /* original addr is scheduled, but new one isn't: */
684 new_addr->flags &= ~IR3_INSTR_MARK;
685 }
686 indirect->address = new_addr;
687 /* don't need to remove old dag edge since old addr is
688 * already scheduled:
689 */
690 sched_node_add_dep(indirect, new_addr, 0);
691 di(indirect, "new address");
692 }
693 }
694
695 /* all remaining indirects remapped to new addr: */
696 *addr = NULL;
697
698 return new_addr;
699 }
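/* Example of breaking the a0.x "critical section" deadlock described at the
 * top of the file (illustrative): when every remaining candidate either
 * reads the currently live a0.x value or wants to overwrite it,
 * check_instr() keeps setting notes->addr0_conflict and choose_instr()
 * returns NULL.  sched_block() then calls split_addr(): the unscheduled
 * indirect users are repointed at the cloned address write, ctx->addr0 is
 * cleared, and the conflicting writer becomes schedulable on the next
 * iteration.
 */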
700
701 /* "spill" the predicate register by remapping any unscheduled
702 * instructions which depend on the current predicate register
703 * to a clone of the instruction which wrote the predicate reg.
704 */
705 static struct ir3_instruction *
706 split_pred(struct ir3_sched_ctx *ctx)
707 {
708 struct ir3 *ir;
709 struct ir3_instruction *new_pred = NULL;
710 unsigned i;
711
712 debug_assert(ctx->pred);
713
714 ir = ctx->pred->block->shader;
715
716 for (i = 0; i < ir->predicates_count; i++) {
717 struct ir3_instruction *predicated = ir->predicates[i];
718
719 /* skip instructions already scheduled: */
720 if (is_scheduled(predicated))
721 continue;
722
723 /* remap remaining instructions using current pred
724 * to new pred:
725 *
726 * TODO is there ever a case when pred isn't first
727 * (and only) src?
728 */
729 if (ssa(predicated->regs[1]) == ctx->pred) {
730 if (!new_pred) {
731 new_pred = split_instr(ctx, ctx->pred);
732 /* original pred is scheduled, but new one isn't: */
733 new_pred->flags &= ~IR3_INSTR_MARK;
734 }
735 predicated->regs[1]->instr = new_pred;
736 /* don't need to remove old dag edge since old pred is
737 * already scheduled:
738 */
739 sched_node_add_dep(predicated, new_pred, 0);
740 di(predicated, "new predicate");
741 }
742 }
743
744 /* all remaining predicated remapped to new pred: */
745 ctx->pred = NULL;
746
747 return new_pred;
748 }
749
750 static void
751 sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
752 {
753 struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);
754
755 dag_init_node(ctx->dag, &n->dag);
756
757 n->instr = instr;
758 instr->data = n;
759 }
760
761 static void
762 sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i)
763 {
764 /* don't consider dependencies in other blocks: */
765 if (src->block != instr->block)
766 return;
767
768 /* we could have false-dep's that end up unused: */
769 if (src->flags & IR3_INSTR_UNUSED) {
770 debug_assert(__is_false_dep(instr, i));
771 return;
772 }
773
774 struct ir3_sched_node *n = instr->data;
775 struct ir3_sched_node *sn = src->data;
776
777 /* If src is consumed by a collect, track that to realize that once
778 * any of the collect srcs are live, we should hurry up and schedule
779 * the rest.
780 */
781 if (instr->opc == OPC_META_COLLECT)
782 sn->collect = instr;
783
784 dag_add_edge(&sn->dag, &n->dag, NULL);
785
786 unsigned d = ir3_delayslots(src, instr, i, true);
787 n->delay = MAX2(n->delay, d);
788 }
789
790 static void
791 mark_kill_path(struct ir3_instruction *instr)
792 {
793 struct ir3_sched_node *n = instr->data;
794 n->kill_path = true;
795 struct ir3_instruction *src;
796 foreach_ssa_src (src, instr) {
797 if (src->block != instr->block)
798 continue;
799 mark_kill_path(src);
800 }
801 }
802
803 /* Is it an output? */
804 static bool
805 is_output_collect(struct ir3_instruction *instr)
806 {
807 struct ir3 *ir = instr->block->shader;
808
809 for (unsigned i = 0; i < ir->outputs_count; i++) {
810 struct ir3_instruction *collect = ir->outputs[i];
811 assert(collect->opc == OPC_META_COLLECT);
812 if (instr == collect)
813 return true;
814 }
815
816 return false;
817 }
818
819 /* Is its only use as an output? */
820 static bool
821 is_output_only(struct ir3_instruction *instr)
822 {
823 if (!writes_gpr(instr))
824 return false;
825
826 if (!(instr->regs[0]->flags & IR3_REG_SSA))
827 return false;
828
829 foreach_ssa_use (use, instr)
830 if (!is_output_collect(use))
831 return false;
832
833 return true;
834 }
835
836 static void
837 sched_node_add_deps(struct ir3_instruction *instr)
838 {
839 struct ir3_instruction *src;
840
841 /* Since foreach_ssa_src() already handles false-dep's we can construct
842 * the DAG easily in a single pass.
843 */
844 foreach_ssa_src_n (src, i, instr) {
845 sched_node_add_dep(instr, src, i);
846 }
847
848 /* NOTE that all inputs must be scheduled before a kill, so
849 * mark these to be prioritized as well:
850 */
851 if (is_kill(instr) || is_input(instr)) {
852 mark_kill_path(instr);
853 }
854
855 if (is_output_only(instr)) {
856 struct ir3_sched_node *n = instr->data;
857 n->output = true;
858 }
859 }
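/* Example of the output marking above (illustrative): for the
 * `fragcolor.w = 1.0` pattern from the ir3_sched_node comment, the mov of
 * the immediate is consumed only by the output meta:collect, so
 * is_output_only() returns true, n->output is set, and
 * choose_instr_inc(avoid_output=true) passes over it until nothing else
 * remains.
 */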
860
861 static void
862 sched_dag_max_delay_cb(struct dag_node *node, void *state)
863 {
864 struct ir3_sched_node *n = (struct ir3_sched_node *)node;
865 uint32_t max_delay = 0;
866
867 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
868 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
869 max_delay = MAX2(child->max_delay, max_delay);
870 }
871
872 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
873 }
874
875 static void
876 sched_dag_init(struct ir3_sched_ctx *ctx)
877 {
878 ctx->dag = dag_create(ctx);
879
880 foreach_instr (instr, &ctx->unscheduled_list) {
881 sched_node_init(ctx, instr);
882 sched_node_add_deps(instr);
883 }
884
885 dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
886 }
887
888 static void
889 sched_dag_destroy(struct ir3_sched_ctx *ctx)
890 {
891 ralloc_free(ctx->dag);
892 ctx->dag = NULL;
893 }
894
895 static void
896 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
897 {
898 ctx->block = block;
899
900 /* addr/pred writes are per-block: */
901 ctx->addr0 = NULL;
902 ctx->addr1 = NULL;
903 ctx->pred = NULL;
904
905 /* move all instructions to the unscheduled list, and
906 * empty the block's instruction list (to which we will
907 * be inserting).
908 */
909 list_replace(&block->instr_list, &ctx->unscheduled_list);
910 list_inithead(&block->instr_list);
911
912 sched_dag_init(ctx);
913
914 ctx->remaining_kills = 0;
915 foreach_instr_safe (instr, &ctx->unscheduled_list) {
916 if (is_kill(instr))
917 ctx->remaining_kills++;
918 }
919
920 /* First schedule all meta:input instructions, followed by
921 * tex-prefetch. We want all of the instructions that load
922 * values into registers before the shader starts to go
923 * before any other instructions. But in particular we
924 * want inputs to come before prefetches. This is because
925 * a FS's bary_ij input may not actually be live in the
926 * shader, but it should not be scheduled on top of any
927 * other input (but can be overwritten by a tex prefetch)
928 */
929 foreach_instr_safe (instr, &ctx->unscheduled_list)
930 if (instr->opc == OPC_META_INPUT)
931 schedule(ctx, instr);
932
933 foreach_instr_safe (instr, &ctx->unscheduled_list)
934 if (instr->opc == OPC_META_TEX_PREFETCH)
935 schedule(ctx, instr);
936
937 while (!list_is_empty(&ctx->unscheduled_list)) {
938 struct ir3_sched_notes notes = {0};
939 struct ir3_instruction *instr;
940
941 instr = choose_instr(ctx, &notes);
942 if (instr) {
943 unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
944 d("delay=%u", delay);
945
946 /* and if we run out of instructions that can be scheduled,
947 * then it is time for nop's:
948 */
949 debug_assert(delay <= 6);
950 while (delay > 0) {
951 ir3_NOP(block);
952 delay--;
953 }
954
955 schedule(ctx, instr);
956 } else {
957 struct ir3_instruction *new_instr = NULL;
958 struct ir3 *ir = block->shader;
959
960 /* nothing available to schedule.. if we are blocked on
961 * address/predicate register conflict, then break the
962 * deadlock by cloning the instruction that wrote that
963 * reg:
964 */
965 if (notes.addr0_conflict) {
966 new_instr = split_addr(ctx, &ctx->addr0,
967 ir->a0_users, ir->a0_users_count);
968 } else if (notes.addr1_conflict) {
969 new_instr = split_addr(ctx, &ctx->addr1,
970 ir->a1_users, ir->a1_users_count);
971 } else if (notes.pred_conflict) {
972 new_instr = split_pred(ctx);
973 } else {
974 d("unscheduled_list:");
975 foreach_instr (instr, &ctx->unscheduled_list)
976 di(instr, "unscheduled: ");
977 debug_assert(0);
978 ctx->error = true;
979 return;
980 }
981
982 if (new_instr) {
983 list_delinit(&new_instr->node);
984 list_addtail(&new_instr->node, &ctx->unscheduled_list);
985 }
986 }
987 }
988
989 sched_dag_destroy(ctx);
990 }
991
992 int ir3_sched(struct ir3 *ir)
993 {
994 struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
995
996 foreach_block (block, &ir->block_list) {
997 foreach_instr (instr, &block->instr_list) {
998 instr->data = NULL;
999 }
1000 }
1001
1002 ir3_count_instructions(ir);
1003 ir3_clear_mark(ir);
1004 ir3_find_ssa_uses(ir, ctx, false);
1005
1006 foreach_block (block, &ir->block_list) {
1007 sched_block(ctx, block);
1008 }
1009
1010 int ret = ctx->error ? -1 : 0;
1011
1012 ralloc_free(ctx);
1013
1014 return ret;
1015 }
1016
1017 static unsigned
1018 get_array_id(struct ir3_instruction *instr)
1019 {
1020 /* The expectation is that there is only a single array
1021 * src or dst, ir3_cp should enforce this.
1022 */
1023
1024 for (unsigned i = 0; i < instr->regs_count; i++)
1025 if (instr->regs[i]->flags & IR3_REG_ARRAY)
1026 return instr->regs[i]->array.id;
1027
1028 unreachable("this was unexpected");
1029 }
1030
1031 /* does instruction 'prior' need to be scheduled before 'instr'? */
1032 static bool
1033 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
1034 {
1035 /* TODO for dependencies that are related to a specific object, ie
1036 * a specific SSBO/image/array, we could relax this constraint to
1037 * make accesses to unrelated objects not depend on each other (at
1038 * least as long as not declared coherent)
1039 */
1040 if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
1041 ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
1042 return true;
1043
1044 if (instr->barrier_class & prior->barrier_conflict) {
1045 if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
1046 /* if only array barrier, then we can further limit false-deps
1047 * by considering the array-id, ie reads/writes to different
1048 * arrays do not depend on each other (no aliasing)
1049 */
1050 if (get_array_id(instr) != get_array_id(prior)) {
1051 return false;
1052 }
1053 }
1054
1055 return true;
1056 }
1057
1058 return false;
1059 }
1060
1061 static void
1062 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
1063 {
1064 struct list_head *prev = instr->node.prev;
1065 struct list_head *next = instr->node.next;
1066
1067 /* add dependencies on previous instructions that must be scheduled
1068 * prior to the current instruction
1069 */
1070 while (prev != &block->instr_list) {
1071 struct ir3_instruction *pi =
1072 LIST_ENTRY(struct ir3_instruction, prev, node);
1073
1074 prev = prev->prev;
1075
1076 if (is_meta(pi))
1077 continue;
1078
1079 if (instr->barrier_class == pi->barrier_class) {
1080 ir3_instr_add_dep(instr, pi);
1081 break;
1082 }
1083
1084 if (depends_on(instr, pi))
1085 ir3_instr_add_dep(instr, pi);
1086 }
1087
1088 /* add dependencies on this instruction to following instructions
1089 * that must be scheduled after the current instruction:
1090 */
1091 while (next != &block->instr_list) {
1092 struct ir3_instruction *ni =
1093 LIST_ENTRY(struct ir3_instruction, next, node);
1094
1095 next = next->next;
1096
1097 if (is_meta(ni))
1098 continue;
1099
1100 if (instr->barrier_class == ni->barrier_class) {
1101 ir3_instr_add_dep(ni, instr);
1102 break;
1103 }
1104
1105 if (depends_on(ni, instr))
1106 ir3_instr_add_dep(ni, instr);
1107 }
1108 }
1109
1110 /* before scheduling a block, we need to add any necessary false-dependencies
1111 * to ensure that:
1112 *
1113 * (1) barriers are scheduled in the right order wrt instructions related
1114 * to the barrier
1115 *
1116 * (2) reads that come before a write actually get scheduled before the
1117 * write
1118 */
1119 static void
1120 calculate_deps(struct ir3_block *block)
1121 {
1122 foreach_instr (instr, &block->instr_list) {
1123 if (instr->barrier_class) {
1124 add_barrier_deps(block, instr);
1125 }
1126 }
1127 }
1128
1129 void
1130 ir3_sched_add_deps(struct ir3 *ir)
1131 {
1132 foreach_block (block, &ir->block_list) {
1133 calculate_deps(block);
1134 }
1135 }
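/* A minimal sketch of the expected call order from the rest of the ir3
 * compiler (illustrative only -- the real call-site and its error handling
 * live outside this file):
 *
 *    ir3_sched_add_deps(ir);      // add barrier/array false-deps first
 *    if (ir3_sched(ir) != 0)      // then run the pre-RA block scheduler
 *       return false;             // hypothetical failure path
 */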