[mesa.git] src/freedreno/ir3/ir3_sched.c
1 /*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27
28 #include "util/dag.h"
29 #include "util/u_math.h"
30
31 #include "ir3.h"
32 #include "ir3_compiler.h"
33
34 #ifdef DEBUG
35 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
36 #else
37 #define SCHED_DEBUG 0
38 #endif
39 #define d(fmt, ...) do { if (SCHED_DEBUG) { \
40 printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
41 } } while (0)
42
43 #define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
44 printf("SCHED: "fmt": ", ##__VA_ARGS__); \
45 ir3_print_instr(instr); \
46 } } while (0)
47
48 /*
49 * Instruction Scheduling:
50 *
51 * A block-level pre-RA scheduler, which works by creating a DAG of
52 * instruction dependencies, and heuristically picking a DAG head
53 * (instruction with no unscheduled dependencies).
54 *
55 * Where possible, it tries to pick instructions that avoid nop delay
56 * slots, but it will prefer to pick instructions that reduce (or do
57 * not increase) the number of live values.
58 *
59 * If the only possible choices are instructions that increase the
60 * number of live values, it will try to pick the one with the earliest
61 * consumer (based on pre-sched program order).
62 *
63 * There are a few special cases that need to be handled, since sched
64 * is currently independent of register allocation. Usages of address
65 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
66 * if you have two pairs of instructions that write the same special
67 * register and then read it, then those pairs cannot be interleaved.
68 * To solve this, when we are in such a scheduling "critical section",
69 * and we encounter a conflicting write to a special register, we try
70 * to schedule any remaining instructions that use that value first.
71 *
72 * TODO we can detect too-large live_values here.. would be a good place
73 * to "spill" cheap things, like move from uniform/immed. (Constructing
74 * list of ssa def consumers before sched pass would make this easier.
75  *    Also, in general it might be best not to re-use load_immed
76  *    across blocks.)
77 *
78 * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
79 * the # of immediates in play (or at least that would help with
80 * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
81 * do this in a nir pass that inserts fneg/etc? The cp pass should fold
82 * these into src modifiers..
83 */
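/* Editor's illustrative sketch (pseudo-IR, not from the original source)
 * of the a0.x serialization constraint described above:
 *
 *    (write a0.x)     <- writer A
 *    (write a0.x)     <- writer B: would clobber A's value too early
 *    (read  c[a0.x])  <- consumer expecting writer A's value
 *
 * Interleaving the two write/read pairs like this is invalid, so the
 * scheduler either defers the conflicting writer until the current
 * a0.x users are scheduled, or clones the writer (see split_addr())
 * to break a deadlock.
 */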
84
85 struct ir3_sched_ctx {
86 struct ir3_block *block; /* the current block */
87 struct dag *dag;
88
89 struct list_head unscheduled_list; /* unscheduled instructions */
90 struct ir3_instruction *scheduled; /* last scheduled instr */
91 struct ir3_instruction *addr0; /* current a0.x user, if any */
92 struct ir3_instruction *addr1; /* current a1.x user, if any */
93 struct ir3_instruction *pred; /* current p0.x user, if any */
94
95 int remaining_kills;
96 int remaining_tex;
97
98 bool error;
99
100 int sfu_delay;
101 int tex_delay;
102 };
103
104 struct ir3_sched_node {
105 struct dag_node dag; /* must be first for util_dynarray_foreach */
106 struct ir3_instruction *instr;
107
108 unsigned delay;
109 unsigned max_delay;
110
111 /* For instructions that are a meta:collect src, once we schedule
112 * the first src of the collect, the entire vecN is live (at least
113 * from the PoV of the first RA pass.. the 2nd scalar pass can fill
114 * in some of the gaps, but often not all). So we want to help out
115 * RA, and realize that as soon as we schedule the first collect
116 * src, there is no penalty to schedule the remainder (ie. they
117 * don't make additional values live). In fact we'd prefer to
118 * schedule the rest ASAP to minimize the live range of the vecN.
119 *
120 * For instructions that are the src of a collect, we track the
121 * corresponding collect, and mark them as partially live as soon
122 * as any one of the src's is scheduled.
123 */
124 struct ir3_instruction *collect;
125 bool partially_live;
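	/* Editor's example (illustrative): for a vec4 collect of srcs
	 * s0..s3, scheduling s0 makes the whole vec4 live from the first
	 * RA pass's point of view, so s1..s3 are marked partially_live
	 * and live_effect() treats scheduling them as free, encouraging
	 * the scheduler to finish off the vecN quickly.
	 */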
126
127 /* Is this instruction a direct or indirect dependency for a kill?
128 * If so, we should prioritize it when possible
129 */
130 bool kill_path;
131
132 /* This node represents a shader output. A semi-common pattern in
133 * shaders is something along the lines of:
134 *
135 * fragcolor.w = 1.0
136 *
137 * Which we'd prefer to schedule as late as possible, since it
138 * produces a live value that is never killed/consumed. So detect
139 	 * outputs up-front, and avoid scheduling them unless they reduce
140 	 * register pressure (or are at least neutral).
141 */
142 bool output;
143 };
144
145 #define foreach_sched_node(__n, __list) \
146 list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link)
147
148 static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
149 static void sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i);
150
151 static bool is_scheduled(struct ir3_instruction *instr)
152 {
153 return !!(instr->flags & IR3_INSTR_MARK);
154 }
155
156 static void
157 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
158 {
159 debug_assert(ctx->block == instr->block);
160
161 /* remove from depth list:
162 */
163 list_delinit(&instr->node);
164
165 if (writes_addr0(instr)) {
166 debug_assert(ctx->addr0 == NULL);
167 ctx->addr0 = instr;
168 }
169
170 if (writes_addr1(instr)) {
171 debug_assert(ctx->addr1 == NULL);
172 ctx->addr1 = instr;
173 }
174
175 if (writes_pred(instr)) {
176 debug_assert(ctx->pred == NULL);
177 ctx->pred = instr;
178 }
179
180 instr->flags |= IR3_INSTR_MARK;
181
182 di(instr, "schedule");
183
184 list_addtail(&instr->node, &instr->block->instr_list);
185 ctx->scheduled = instr;
186
187 	if (is_kill(instr)) {
188 assert(ctx->remaining_kills > 0);
189 ctx->remaining_kills--;
190 }
191
192 struct ir3_sched_node *n = instr->data;
193
194 /* If this instruction is a meta:collect src, mark the remaining
195 * collect srcs as partially live.
196 */
197 if (n->collect) {
198 foreach_ssa_src (src, n->collect) {
199 if (src->block != instr->block)
200 continue;
201 struct ir3_sched_node *sn = src->data;
202 sn->partially_live = true;
203 }
204 }
205
206 dag_prune_head(ctx->dag, &n->dag);
207
208 if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
209 return;
210
211 if (is_sfu(instr)) {
212 ctx->sfu_delay = 8;
213 } else if (check_src_cond(instr, is_sfu)) {
214 ctx->sfu_delay = 0;
215 } else if (ctx->sfu_delay > 0) {
216 ctx->sfu_delay--;
217 }
218
219 if (is_tex_or_prefetch(instr)) {
220 /* NOTE that this isn't an attempt to hide texture fetch latency,
221 * but an attempt to hide the cost of switching to another warp.
222 * If we can, we'd like to try to schedule another texture fetch
223 * before scheduling something that would sync.
224 */
225 ctx->tex_delay = 10;
226 assert(ctx->remaining_tex > 0);
227 ctx->remaining_tex--;
228 } else if (check_src_cond(instr, is_tex_or_prefetch)) {
229 ctx->tex_delay = 0;
230 } else if (ctx->tex_delay > 0) {
231 ctx->tex_delay--;
232 }
233 }
234
235 struct ir3_sched_notes {
236 /* there is at least one kill which could be scheduled, except
237 * for unscheduled bary.f's:
238 */
239 bool blocked_kill;
240 /* there is at least one instruction that could be scheduled,
241 * except for conflicting address/predicate register usage:
242 */
243 bool addr0_conflict, addr1_conflict, pred_conflict;
244 };
245
246 /* could an instruction be scheduled if the specified ssa src was scheduled? */
247 static bool
248 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
249 {
250 foreach_ssa_src (other_src, instr) {
251 /* if dependency not scheduled, we aren't ready yet: */
252 if ((src != other_src) && !is_scheduled(other_src)) {
253 return false;
254 }
255 }
256 return true;
257 }
258
259 /* Check if instruction is ok to schedule. Make sure it is not blocked
260 * by use of addr/predicate register, etc.
261 */
262 static bool
263 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
264 struct ir3_instruction *instr)
265 {
266 debug_assert(!is_scheduled(instr));
267
268 if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
269 /* avoid texture/memory access if we have unscheduled kills
270 * that could make the expensive operation unnecessary. By
271 * definition, if there are remaining kills, and this instr
272 * is not a dependency of a kill, there are other instructions
273 * that we can choose from.
274 */
275 struct ir3_sched_node *n = instr->data;
276 if (!n->kill_path)
277 return false;
278 }
279
280 	/* For instructions that write the address register, we need to
281 	 * make sure there is at least one otherwise-ready instruction
282 	 * that uses the addr value.
283 *
284 * NOTE if any instructions use pred register and have other
285 * src args, we would need to do the same for writes_pred()..
286 */
287 if (writes_addr0(instr)) {
288 struct ir3 *ir = instr->block->shader;
289 bool ready = false;
290 for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
291 struct ir3_instruction *indirect = ir->a0_users[i];
292 if (!indirect)
293 continue;
294 if (indirect->address != instr)
295 continue;
296 ready = could_sched(indirect, instr);
297 }
298
299 /* nothing could be scheduled, so keep looking: */
300 if (!ready)
301 return false;
302 }
303
304 if (writes_addr1(instr)) {
305 struct ir3 *ir = instr->block->shader;
306 bool ready = false;
307 for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
308 struct ir3_instruction *indirect = ir->a1_users[i];
309 if (!indirect)
310 continue;
311 if (indirect->address != instr)
312 continue;
313 ready = could_sched(indirect, instr);
314 }
315
316 /* nothing could be scheduled, so keep looking: */
317 if (!ready)
318 return false;
319 }
320
321 /* if this is a write to address/predicate register, and that
322 * register is currently in use, we need to defer until it is
323 * free:
324 */
325 if (writes_addr0(instr) && ctx->addr0) {
326 debug_assert(ctx->addr0 != instr);
327 notes->addr0_conflict = true;
328 return false;
329 }
330
331 if (writes_addr1(instr) && ctx->addr1) {
332 debug_assert(ctx->addr1 != instr);
333 notes->addr1_conflict = true;
334 return false;
335 }
336
337 if (writes_pred(instr) && ctx->pred) {
338 debug_assert(ctx->pred != instr);
339 notes->pred_conflict = true;
340 return false;
341 }
342
343 /* if the instruction is a kill, we need to ensure *every*
344 * bary.f is scheduled. The hw seems unhappy if the thread
345 * gets killed before the end-input (ei) flag is hit.
346 *
347 * We could do this by adding each bary.f instruction as
348 * virtual ssa src for the kill instruction. But we have
349 * fixed length instr->regs[].
350 *
351 * TODO we could handle this by false-deps now, probably.
352 */
353 if (is_kill(instr)) {
354 struct ir3 *ir = instr->block->shader;
355
356 for (unsigned i = 0; i < ir->baryfs_count; i++) {
357 struct ir3_instruction *baryf = ir->baryfs[i];
358 if (baryf->flags & IR3_INSTR_UNUSED)
359 continue;
360 if (!is_scheduled(baryf)) {
361 notes->blocked_kill = true;
362 return false;
363 }
364 }
365 }
366
367 return true;
368 }
369
370 /* Find the instr->ip of the closest use of an instruction, in
371 * pre-sched order. This isn't going to be the same as post-sched
372 * order, but it is a reasonable approximation to limit scheduling
373 * instructions *too* early. This is mostly to prevent bad behavior
374 * in cases where we have a large number of possible instructions
375  * to choose from, to avoid creating too much parallelism (ie. blowing
376  * up register pressure).
377 *
378 * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
379 */
380 static int
381 nearest_use(struct ir3_instruction *instr)
382 {
383 unsigned nearest = ~0;
384 foreach_ssa_use (use, instr)
385 if (!is_scheduled(use))
386 nearest = MIN2(nearest, use->ip);
387
388 /* slight hack.. this heuristic tends to push bary.f's to later
389 * in the shader, closer to their uses. But we actually would
390 * prefer to get these scheduled earlier, to unlock varying
391 * storage for more VS jobs:
392 */
393 if (is_input(instr))
394 nearest /= 2;
395
396 return nearest;
397 }
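/* Editor's example (illustrative): if an instruction's unscheduled uses
 * sit at pre-sched ip 12 and ip 30, nearest_use() returns 12; for an
 * input (e.g. bary.f) the result is halved to 6 so it sorts as "needed
 * sooner" and tends to be scheduled earlier, unlocking varying storage
 * for more VS jobs.
 */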
398
399 static int
400 use_count(struct ir3_instruction *instr)
401 {
402 unsigned cnt = 0;
403 foreach_ssa_use (use, instr)
404 if (!is_scheduled(use))
405 cnt++;
406 return cnt;
407 }
408
409 /* find net change to live values if instruction were scheduled: */
410 static int
411 live_effect(struct ir3_instruction *instr)
412 {
413 struct ir3_sched_node *n = instr->data;
414 int new_live = n->partially_live ? 0 : dest_regs(instr);
415 int freed_live = 0;
416
417 /* if we schedule something that causes a vecN to be live,
418 	 * then count all its other components too:
419 */
420 if (n->collect)
421 new_live *= n->collect->regs_count - 1;
422
423 foreach_ssa_src_n (src, n, instr) {
424 if (__is_false_dep(instr, n))
425 continue;
426
427 if (instr->block != src->block)
428 continue;
429
430 if (use_count(src) == 1)
431 freed_live += dest_regs(src);
432 }
433
434 return new_live - freed_live;
435 }
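/* Editor's worked example (illustrative): scheduling a single-dest ALU
 * op which is the last unscheduled user of a two-register src gives
 * live_effect = 1 (newly live) - 2 (freed) = -1, so choose_instr_dec()
 * counts it as reducing register pressure.
 */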
436
437 /* Determine if this is an instruction that we'd prefer not to schedule
438 * yet, in order to avoid an (ss)/(sy) sync. This is limited by the
439 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
440 * the last SFU/tex, the less costly a sync would be.
441 */
442 static bool
443 would_sync(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
444 {
445 if (ctx->sfu_delay) {
446 if (check_src_cond(instr, is_sfu))
447 return true;
448 }
449
450 /* We mostly just want to try to schedule another texture fetch
451 * before scheduling something that would (sy) sync, so we can
452 * limit this rule to cases where there are remaining texture
453 * fetches
454 */
455 if (ctx->tex_delay && ctx->remaining_tex) {
456 if (check_src_cond(instr, is_tex_or_prefetch))
457 return true;
458 }
459
460 return false;
461 }
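/* Editor's worked example (illustrative): after an SFU op is scheduled,
 * ctx->sfu_delay is set to 8 and decremented as further (non-meta)
 * instructions are scheduled.  While it is non-zero, would_sync()
 * reports true for any candidate sourcing an SFU result, so the
 * avoid_sync passes of choose_instr_dec()/inc() skip it; the counter is
 * cleared as soon as a consumer is scheduled anyway, or runs down to
 * zero after enough other instructions.  tex_delay works the same way,
 * starting at 10.
 */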
462
463 static struct ir3_sched_node *
464 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
465 bool avoid_sync, bool avoid_output);
466
467 /**
468 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
469 * Scheduling for Register pressure) heuristic.
470 *
471  * Only handles the case of choosing instructions that reduce register
472  * pressure or leave it unchanged.
473 */
474 static struct ir3_sched_node *
475 choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
476 bool avoid_sync)
477 {
478 const char *mode = avoid_sync ? "-as" : "";
479 struct ir3_sched_node *chosen = NULL;
480
481 /* Find a ready inst with regs freed and pick the one with max cost. */
482 foreach_sched_node (n, &ctx->dag->heads) {
483 if (avoid_sync && would_sync(ctx, n->instr))
484 continue;
485
486 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
487
488 if (d > 0)
489 continue;
490
491 if (live_effect(n->instr) > -1)
492 continue;
493
494 if (!check_instr(ctx, notes, n->instr))
495 continue;
496
497 if (!chosen || chosen->max_delay < n->max_delay) {
498 chosen = n;
499 }
500 }
501
502 if (chosen) {
503 di(chosen->instr, "dec%s: chose (freed+ready)", mode);
504 return chosen;
505 }
506
507 /* Find a leader with regs freed and pick the one with max cost. */
508 foreach_sched_node (n, &ctx->dag->heads) {
509 if (avoid_sync && would_sync(ctx, n->instr))
510 continue;
511
512 if (live_effect(n->instr) > -1)
513 continue;
514
515 if (!check_instr(ctx, notes, n->instr))
516 continue;
517
518 if (!chosen || chosen->max_delay < n->max_delay) {
519 chosen = n;
520 }
521 }
522
523 if (chosen) {
524 di(chosen->instr, "dec%s: chose (freed)", mode);
525 return chosen;
526 }
527
528 /* Contra the paper, pick a leader with no effect on used regs. This may
529 * open up new opportunities, as otherwise a single-operand instr consuming
530 	 * a value will tend to block finding the instruction that frees that
531 	 * value.  This had a massive effect on reducing spilling on V3D.
532 *
533 * XXX: Should this prioritize ready?
534 */
535 foreach_sched_node (n, &ctx->dag->heads) {
536 if (avoid_sync && would_sync(ctx, n->instr))
537 continue;
538
539 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
540
541 if (d > 0)
542 continue;
543
544 if (live_effect(n->instr) > 0)
545 continue;
546
547 if (!check_instr(ctx, notes, n->instr))
548 continue;
549
550 if (!chosen || chosen->max_delay < n->max_delay)
551 chosen = n;
552 }
553
554 if (chosen) {
555 di(chosen->instr, "dec%s: chose (neutral+ready)", mode);
556 return chosen;
557 }
558
559 foreach_sched_node (n, &ctx->dag->heads) {
560 if (avoid_sync && would_sync(ctx, n->instr))
561 continue;
562
563 if (live_effect(n->instr) > 0)
564 continue;
565
566 if (!check_instr(ctx, notes, n->instr))
567 continue;
568
569 if (!chosen || chosen->max_delay < n->max_delay)
570 chosen = n;
571 }
572
573 if (chosen) {
574 di(chosen->instr, "dec%s: chose (neutral)", mode);
575 return chosen;
576 }
577
578 return choose_instr_inc(ctx, notes, avoid_sync, true);
579 }
580
581 /**
582 * When we can't choose an instruction that reduces register pressure or
583 * is neutral, we end up here to try and pick the least bad option.
584 */
585 static struct ir3_sched_node *
586 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
587 bool avoid_sync, bool avoid_output)
588 {
589 const char *mode = avoid_sync ? "-as" : "";
590 struct ir3_sched_node *chosen = NULL;
591
592 /*
593 	 * From here on out, we are picking something that increases
594 * register pressure. So try to pick something which will
595 * be consumed soon:
596 */
597 unsigned chosen_distance = 0;
598
599 	/* Among the ready set, pick the one whose value is needed soonest (nearest use). */
600 foreach_sched_node (n, &ctx->dag->heads) {
601 if (avoid_output && n->output)
602 continue;
603
604 if (avoid_sync && would_sync(ctx, n->instr))
605 continue;
606
607 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
608
609 if (d > 0)
610 continue;
611
612 if (!check_instr(ctx, notes, n->instr))
613 continue;
614
615 unsigned distance = nearest_use(n->instr);
616
617 if (!chosen || distance < chosen_distance) {
618 chosen = n;
619 chosen_distance = distance;
620 }
621 }
622
623 if (chosen) {
624 di(chosen->instr, "inc%s: chose (distance+ready)", mode);
625 return chosen;
626 }
627
628 	/* Among the remaining leaders, pick the one whose value is needed soonest. */
629 foreach_sched_node (n, &ctx->dag->heads) {
630 if (avoid_output && n->output)
631 continue;
632
633 if (avoid_sync && would_sync(ctx, n->instr))
634 continue;
635
636 if (!check_instr(ctx, notes, n->instr))
637 continue;
638
639 unsigned distance = nearest_use(n->instr);
640
641 if (!chosen || distance < chosen_distance) {
642 chosen = n;
643 chosen_distance = distance;
644 }
645 }
646
647 if (chosen) {
648 di(chosen->instr, "inc%s: chose (distance)", mode);
649 return chosen;
650 }
651
652 return NULL;
653 }
654
655 /* Handles instruction selection for instructions we want to prioritize
656  * even if the CSR heuristic would not pick them.
657 */
658 static struct ir3_sched_node *
659 choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
660 {
661 struct ir3_sched_node *chosen = NULL;
662
663 foreach_sched_node (n, &ctx->dag->heads) {
664 if (!is_meta(n->instr))
665 continue;
666
667 if (!chosen || (chosen->max_delay < n->max_delay))
668 chosen = n;
669 }
670
671 if (chosen) {
672 di(chosen->instr, "prio: chose (meta)");
673 return chosen;
674 }
675
676 return NULL;
677 }
678
679 static void
680 dump_state(struct ir3_sched_ctx *ctx)
681 {
682 if (!SCHED_DEBUG)
683 return;
684
685 foreach_sched_node (n, &ctx->dag->heads) {
686 di(n->instr, "maxdel=%3d le=%d del=%u ",
687 n->max_delay, live_effect(n->instr),
688 ir3_delay_calc(ctx->block, n->instr, false, false));
689
690 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
691 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
692
693 di(child->instr, " -> (%d parents) ", child->dag.parent_count);
694 }
695 }
696 }
697
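/* Editor's summary (illustrative) of the selection cascade below:
 *
 *   1. choose_instr_prio():  ready meta instructions
 *   2. choose_instr_dec(avoid_sync=true):  pressure-reducing or neutral
 *      candidates that don't force an (ss)/(sy) sync
 *   3. choose_instr_dec(avoid_sync=false): same, syncs allowed
 *   4. choose_instr_inc():  fall back to something that increases
 *      pressure, preferring the earliest pre-sched consumer
 */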
698 /* find instruction to schedule: */
699 static struct ir3_instruction *
700 choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
701 {
702 struct ir3_sched_node *chosen;
703
704 dump_state(ctx);
705
706 chosen = choose_instr_prio(ctx, notes);
707 if (chosen)
708 return chosen->instr;
709
710 chosen = choose_instr_dec(ctx, notes, true);
711 if (chosen)
712 return chosen->instr;
713
714 chosen = choose_instr_dec(ctx, notes, false);
715 if (chosen)
716 return chosen->instr;
717
718 chosen = choose_instr_inc(ctx, notes, false, false);
719 if (chosen)
720 return chosen->instr;
721
722 return NULL;
723 }
724
725 static struct ir3_instruction *
726 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
727 {
728 struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
729 di(new_instr, "split instruction");
730 sched_node_init(ctx, new_instr);
731 return new_instr;
732 }
733
734 /* "spill" the address registers by remapping any unscheduled
735 * instructions which depend on the current address register
736 * to a clone of the instruction which wrote the address reg.
737 */
738 static struct ir3_instruction *
739 split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
740 struct ir3_instruction **users, unsigned users_count)
741 {
742 struct ir3_instruction *new_addr = NULL;
743 unsigned i;
744
745 debug_assert(*addr);
746
747 for (i = 0; i < users_count; i++) {
748 struct ir3_instruction *indirect = users[i];
749
750 if (!indirect)
751 continue;
752
753 /* skip instructions already scheduled: */
754 if (is_scheduled(indirect))
755 continue;
756
757 /* remap remaining instructions using current addr
758 * to new addr:
759 */
760 if (indirect->address == *addr) {
761 if (!new_addr) {
762 new_addr = split_instr(ctx, *addr);
763 /* original addr is scheduled, but new one isn't: */
764 new_addr->flags &= ~IR3_INSTR_MARK;
765 }
766 indirect->address = new_addr;
767 /* don't need to remove old dag edge since old addr is
768 * already scheduled:
769 */
770 sched_node_add_dep(indirect, new_addr, 0);
771 di(indirect, "new address");
772 }
773 }
774
775 /* all remaining indirects remapped to new addr: */
776 *addr = NULL;
777
778 return new_addr;
779 }
780
781 /* "spill" the predicate register by remapping any unscheduled
782 * instructions which depend on the current predicate register
783  * to a clone of the instruction which wrote the predicate reg.
784 */
785 static struct ir3_instruction *
786 split_pred(struct ir3_sched_ctx *ctx)
787 {
788 struct ir3 *ir;
789 struct ir3_instruction *new_pred = NULL;
790 unsigned i;
791
792 debug_assert(ctx->pred);
793
794 ir = ctx->pred->block->shader;
795
796 for (i = 0; i < ir->predicates_count; i++) {
797 struct ir3_instruction *predicated = ir->predicates[i];
798
799 /* skip instructions already scheduled: */
800 if (is_scheduled(predicated))
801 continue;
802
803 /* remap remaining instructions using current pred
804 * to new pred:
805 *
806 * TODO is there ever a case when pred isn't first
807 * (and only) src?
808 */
809 if (ssa(predicated->regs[1]) == ctx->pred) {
810 if (!new_pred) {
811 new_pred = split_instr(ctx, ctx->pred);
812 /* original pred is scheduled, but new one isn't: */
813 new_pred->flags &= ~IR3_INSTR_MARK;
814 }
815 predicated->regs[1]->instr = new_pred;
816 /* don't need to remove old dag edge since old pred is
817 * already scheduled:
818 */
819 sched_node_add_dep(predicated, new_pred, 0);
820 di(predicated, "new predicate");
821 }
822 }
823
824 	/* all remaining predicated instructions remapped to new pred: */
825 ctx->pred = NULL;
826
827 return new_pred;
828 }
829
830 static void
831 sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
832 {
833 struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);
834
835 dag_init_node(ctx->dag, &n->dag);
836
837 n->instr = instr;
838 instr->data = n;
839 }
840
841 static void
842 sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i)
843 {
844 /* don't consider dependencies in other blocks: */
845 if (src->block != instr->block)
846 return;
847
848 /* we could have false-dep's that end up unused: */
849 if (src->flags & IR3_INSTR_UNUSED) {
850 debug_assert(__is_false_dep(instr, i));
851 return;
852 }
853
854 struct ir3_sched_node *n = instr->data;
855 struct ir3_sched_node *sn = src->data;
856
857 /* If src is consumed by a collect, track that to realize that once
858 * any of the collect srcs are live, we should hurry up and schedule
859 * the rest.
860 */
861 if (instr->opc == OPC_META_COLLECT)
862 sn->collect = instr;
863
864 dag_add_edge(&sn->dag, &n->dag, NULL);
865
866 unsigned d = ir3_delayslots(src, instr, i, true);
867 n->delay = MAX2(n->delay, d);
868 }
869
870 static void
871 mark_kill_path(struct ir3_instruction *instr)
872 {
873 struct ir3_sched_node *n = instr->data;
874 n->kill_path = true;
875
876 foreach_ssa_src (src, instr) {
877 if (src->block != instr->block)
878 continue;
879 mark_kill_path(src);
880 }
881 }
882
883 /* Is this instruction one of the shader's output collects? */
884 static bool
885 is_output_collect(struct ir3_instruction *instr)
886 {
887 struct ir3 *ir = instr->block->shader;
888
889 for (unsigned i = 0; i < ir->outputs_count; i++) {
890 struct ir3_instruction *collect = ir->outputs[i];
891 assert(collect->opc == OPC_META_COLLECT);
892 if (instr == collect)
893 return true;
894 }
895
896 return false;
897 }
898
899 /* Is its only use as a shader output? */
900 static bool
901 is_output_only(struct ir3_instruction *instr)
902 {
903 if (!writes_gpr(instr))
904 return false;
905
906 if (!(instr->regs[0]->flags & IR3_REG_SSA))
907 return false;
908
909 foreach_ssa_use (use, instr)
910 if (!is_output_collect(use))
911 return false;
912
913 return true;
914 }
915
916 static void
917 sched_node_add_deps(struct ir3_instruction *instr)
918 {
919 /* Since foreach_ssa_src() already handles false-dep's we can construct
920 * the DAG easily in a single pass.
921 */
922 foreach_ssa_src_n (src, i, instr) {
923 sched_node_add_dep(instr, src, i);
924 }
925
926 /* NOTE that all inputs must be scheduled before a kill, so
927 * mark these to be prioritized as well:
928 */
929 if (is_kill(instr) || is_input(instr)) {
930 mark_kill_path(instr);
931 }
932
933 if (is_output_only(instr)) {
934 struct ir3_sched_node *n = instr->data;
935 n->output = true;
936 }
937 }
938
939 static void
940 sched_dag_max_delay_cb(struct dag_node *node, void *state)
941 {
942 struct ir3_sched_node *n = (struct ir3_sched_node *)node;
943 uint32_t max_delay = 0;
944
945 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
946 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
947 max_delay = MAX2(child->max_delay, max_delay);
948 }
949
950 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
951 }
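/* Editor's note (illustrative): with the bottom-up traversal below, a
 * node's max_delay becomes its own delay-slot requirement plus the
 * largest max_delay among its consumers, roughly the length of the most
 * delay-constrained path to the end of the block.  choose_instr_prio()
 * and choose_instr_dec() break ties in favour of larger max_delay.
 */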
952
953 static void
954 sched_dag_init(struct ir3_sched_ctx *ctx)
955 {
956 ctx->dag = dag_create(ctx);
957
958 foreach_instr (instr, &ctx->unscheduled_list) {
959 sched_node_init(ctx, instr);
960 sched_node_add_deps(instr);
961 }
962
963 dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
964 }
965
966 static void
967 sched_dag_destroy(struct ir3_sched_ctx *ctx)
968 {
969 ralloc_free(ctx->dag);
970 ctx->dag = NULL;
971 }
972
973 static void
974 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
975 {
976 ctx->block = block;
977
978 /* addr/pred writes are per-block: */
979 ctx->addr0 = NULL;
980 ctx->addr1 = NULL;
981 ctx->pred = NULL;
982 ctx->tex_delay = 0;
983 ctx->sfu_delay = 0;
984
985 /* move all instructions to the unscheduled list, and
986 * empty the block's instruction list (to which we will
987 * be inserting).
988 */
989 list_replace(&block->instr_list, &ctx->unscheduled_list);
990 list_inithead(&block->instr_list);
991
992 sched_dag_init(ctx);
993
994 ctx->remaining_kills = 0;
995 ctx->remaining_tex = 0;
996 foreach_instr_safe (instr, &ctx->unscheduled_list) {
997 if (is_kill(instr))
998 ctx->remaining_kills++;
999 if (is_tex_or_prefetch(instr))
1000 ctx->remaining_tex++;
1001 }
1002
1003 /* First schedule all meta:input instructions, followed by
1004 * tex-prefetch. We want all of the instructions that load
1005 * values into registers before the shader starts to go
1006 * before any other instructions. But in particular we
1007 * want inputs to come before prefetches. This is because
1008 	 * an FS's bary_ij input may not actually be live in the
1009 * shader, but it should not be scheduled on top of any
1010 * other input (but can be overwritten by a tex prefetch)
1011 */
1012 foreach_instr_safe (instr, &ctx->unscheduled_list)
1013 if (instr->opc == OPC_META_INPUT)
1014 schedule(ctx, instr);
1015
1016 foreach_instr_safe (instr, &ctx->unscheduled_list)
1017 if (instr->opc == OPC_META_TEX_PREFETCH)
1018 schedule(ctx, instr);
1019
1020 while (!list_is_empty(&ctx->unscheduled_list)) {
1021 struct ir3_sched_notes notes = {0};
1022 struct ir3_instruction *instr;
1023
1024 instr = choose_instr(ctx, &notes);
1025 if (instr) {
1026 unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
1027 d("delay=%u", delay);
1028
1029 /* and if we run out of instructions that can be scheduled,
1030 * then it is time for nop's:
1031 */
1032 debug_assert(delay <= 6);
1033 while (delay > 0) {
1034 ir3_NOP(block);
1035 delay--;
1036 }
1037
1038 schedule(ctx, instr);
1039 } else {
1040 struct ir3_instruction *new_instr = NULL;
1041 struct ir3 *ir = block->shader;
1042
1043 /* nothing available to schedule.. if we are blocked on
1044 * address/predicate register conflict, then break the
1045 * deadlock by cloning the instruction that wrote that
1046 * reg:
1047 */
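			/* (Editor's note, illustrative: split_addr()/split_pred() clone
			 * the current a0.x/a1.x/p0.x writer and re-point its still-
			 * unscheduled users at the clone, so the conflicting writer can
			 * be scheduled now and the clone re-establishes the old value
			 * later.)
			 */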
1048 if (notes.addr0_conflict) {
1049 new_instr = split_addr(ctx, &ctx->addr0,
1050 ir->a0_users, ir->a0_users_count);
1051 } else if (notes.addr1_conflict) {
1052 new_instr = split_addr(ctx, &ctx->addr1,
1053 ir->a1_users, ir->a1_users_count);
1054 } else if (notes.pred_conflict) {
1055 new_instr = split_pred(ctx);
1056 } else {
1057 d("unscheduled_list:");
1058 foreach_instr (instr, &ctx->unscheduled_list)
1059 di(instr, "unscheduled: ");
1060 debug_assert(0);
1061 ctx->error = true;
1062 return;
1063 }
1064
1065 if (new_instr) {
1066 list_delinit(&new_instr->node);
1067 list_addtail(&new_instr->node, &ctx->unscheduled_list);
1068 }
1069 }
1070 }
1071
1072 sched_dag_destroy(ctx);
1073 }
1074
1075 int ir3_sched(struct ir3 *ir)
1076 {
1077 struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
1078
1079 foreach_block (block, &ir->block_list) {
1080 foreach_instr (instr, &block->instr_list) {
1081 instr->data = NULL;
1082 }
1083 }
1084
1085 ir3_count_instructions(ir);
1086 ir3_clear_mark(ir);
1087 ir3_find_ssa_uses(ir, ctx, false);
1088
1089 foreach_block (block, &ir->block_list) {
1090 sched_block(ctx, block);
1091 }
1092
1093 int ret = ctx->error ? -1 : 0;
1094
1095 ralloc_free(ctx);
1096
1097 return ret;
1098 }
1099
1100 static unsigned
1101 get_array_id(struct ir3_instruction *instr)
1102 {
1103 /* The expectation is that there is only a single array
1104 * src or dst, ir3_cp should enforce this.
1105 */
1106
1107 for (unsigned i = 0; i < instr->regs_count; i++)
1108 if (instr->regs[i]->flags & IR3_REG_ARRAY)
1109 return instr->regs[i]->array.id;
1110
1111 unreachable("this was unexpected");
1112 }
1113
1114 /* does instruction 'prior' need to be scheduled before 'instr'? */
1115 static bool
1116 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
1117 {
1118 /* TODO for dependencies that are related to a specific object, ie
1119 * a specific SSBO/image/array, we could relax this constraint to
1120 * make accesses to unrelated objects not depend on each other (at
1121 * least as long as not declared coherent)
1122 */
1123 if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
1124 ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
1125 return true;
1126
1127 if (instr->barrier_class & prior->barrier_conflict) {
1128 if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
1129 /* if only array barrier, then we can further limit false-deps
1130 * by considering the array-id, ie reads/writes to different
1131 * arrays do not depend on each other (no aliasing)
1132 */
1133 if (get_array_id(instr) != get_array_id(prior)) {
1134 return false;
1135 }
1136 }
1137
1138 return true;
1139 }
1140
1141 return false;
1142 }
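/* Editor's example (illustrative): if a later load's barrier_class
 * intersects a prior store's barrier_conflict mask, depends_on(load,
 * store) returns true and add_barrier_deps() records a false
 * dependency, so the load cannot be scheduled ahead of the store.  When
 * the later access is array-only (IR3_BARRIER_ARRAY_R/W), the array ids
 * are compared first, so accesses to different ir3 arrays do not create
 * a dep.
 */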
1143
1144 static void
1145 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
1146 {
1147 struct list_head *prev = instr->node.prev;
1148 struct list_head *next = instr->node.next;
1149
1150 /* add dependencies on previous instructions that must be scheduled
1151 * prior to the current instruction
1152 */
1153 while (prev != &block->instr_list) {
1154 struct ir3_instruction *pi =
1155 LIST_ENTRY(struct ir3_instruction, prev, node);
1156
1157 prev = prev->prev;
1158
1159 if (is_meta(pi))
1160 continue;
1161
1162 if (instr->barrier_class == pi->barrier_class) {
1163 ir3_instr_add_dep(instr, pi);
1164 break;
1165 }
1166
1167 if (depends_on(instr, pi))
1168 ir3_instr_add_dep(instr, pi);
1169 }
1170
1171 /* add dependencies on this instruction to following instructions
1172 * that must be scheduled after the current instruction:
1173 */
1174 while (next != &block->instr_list) {
1175 struct ir3_instruction *ni =
1176 LIST_ENTRY(struct ir3_instruction, next, node);
1177
1178 next = next->next;
1179
1180 if (is_meta(ni))
1181 continue;
1182
1183 if (instr->barrier_class == ni->barrier_class) {
1184 ir3_instr_add_dep(ni, instr);
1185 break;
1186 }
1187
1188 if (depends_on(ni, instr))
1189 ir3_instr_add_dep(ni, instr);
1190 }
1191 }
1192
1193 /* before scheduling a block, we need to add any necessary false-dependencies
1194 * to ensure that:
1195 *
1196 * (1) barriers are scheduled in the right order wrt instructions related
1197 * to the barrier
1198 *
1199 * (2) reads that come before a write actually get scheduled before the
1200 * write
1201 */
1202 bool
1203 ir3_sched_add_deps(struct ir3 *ir)
1204 {
1205 bool progress = false;
1206
1207 foreach_block (block, &ir->block_list) {
1208 foreach_instr (instr, &block->instr_list) {
1209 if (instr->barrier_class) {
1210 add_barrier_deps(block, instr);
1211 progress = true;
1212 }
1213 }
1214 }
1215
1216 return progress;
1217 }