freedreno/ir3/sched: try to avoid syncs
[mesa.git] / src / freedreno / ir3 / ir3_sched.c
1 /*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27
28 #include "util/dag.h"
29 #include "util/u_math.h"
30
31 #include "ir3.h"
32 #include "ir3_compiler.h"
33
34 #ifdef DEBUG
35 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
36 #else
37 #define SCHED_DEBUG 0
38 #endif
39 #define d(fmt, ...) do { if (SCHED_DEBUG) { \
40 printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
41 } } while (0)
42
43 #define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
44 printf("SCHED: "fmt": ", ##__VA_ARGS__); \
45 ir3_print_instr(instr); \
46 } } while (0)
47
48 /*
49 * Instruction Scheduling:
50 *
51 * A block-level pre-RA scheduler, which works by creating a DAG of
52 * instruction dependencies, and heuristically picking a DAG head
53 * (instruction with no unscheduled dependencies).
54 *
55 * Where possible, it tries to pick instructions that avoid nop delay
56 * slots, but it will prefer to pick instructions that reduce (or do
57 * not increase) the number of live values.
58 *
59 * If the only possible choices are instructions that increase the
60 * number of live values, it will try to pick the one with the earliest
61 * consumer (based on pre-sched program order).
62 *
63 * There are a few special cases that need to be handled, since sched
64 * is currently independent of register allocation. Usages of address
65 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
66 * if you have two pairs of instructions that write the same special
67 * register and then read it, then those pairs cannot be interleaved.
68 * To solve this, when we are in such a scheduling "critical section",
69 * and we encounter a conflicting write to a special register, we try
70 * to schedule any remaining instructions that use that value first.
71 *
72 * TODO we can detect too-large live_values here.. would be a good place
73 * to "spill" cheap things, like move from uniform/immed. (Constructing
74 * list of ssa def consumers before sched pass would make this easier.
75 * Also, in general it might be best not to re-use load_immed
76 * across blocks.)
77 *
78 * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
79 * the # of immediates in play (or at least that would help with
80 * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
81 * do this in a nir pass that inserts fneg/etc? The cp pass should fold
82 * these into src modifiers..
83 */
84
85 struct ir3_sched_ctx {
86 struct ir3_block *block; /* the current block */
87 struct dag *dag;
88
89 struct list_head unscheduled_list; /* unscheduled instructions */
90 struct ir3_instruction *scheduled; /* last scheduled instr */
91 struct ir3_instruction *addr0; /* current a0.x user, if any */
92 struct ir3_instruction *addr1; /* current a1.x user, if any */
93 struct ir3_instruction *pred; /* current p0.x user, if any */
94
95 int remaining_kills;
96 int remaining_tex;
97
98 bool error;
99
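	/* counters that count down from the most recently scheduled SFU / tex
	 * instruction; while non-zero, would_sync() tries to avoid picking an
	 * instruction that would consume that result (and hence stall on an
	 * (ss)/(sy) sync too early):
	 */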
100 int sfu_delay;
101 int tex_delay;
102 };
103
104 struct ir3_sched_node {
105 struct dag_node dag; /* must be first for util_dynarray_foreach */
106 struct ir3_instruction *instr;
107
108 unsigned delay;
109 unsigned max_delay;
110
111 /* For instructions that are a meta:collect src, once we schedule
112 * the first src of the collect, the entire vecN is live (at least
113 * from the PoV of the first RA pass.. the 2nd scalar pass can fill
114 * in some of the gaps, but often not all). So we want to help out
115 * RA, and realize that as soon as we schedule the first collect
116 * src, there is no penalty to schedule the remainder (ie. they
117 * don't make additional values live). In fact we'd prefer to
118 * schedule the rest ASAP to minimize the live range of the vecN.
119 *
120 * For instructions that are the src of a collect, we track the
121 * corresponding collect, and mark them as partially live as soon
122 * as any one of the src's is scheduled.
123 */
124 struct ir3_instruction *collect;
125 bool partially_live;
126
127 /* Is this instruction a direct or indirect dependency for a kill?
128 * If so, we should prioritize it when possible
129 */
130 bool kill_path;
131
132 /* This node represents a shader output. A semi-common pattern in
133 * shaders is something along the lines of:
134 *
135 * fragcolor.w = 1.0
136 *
137 * Which we'd prefer to schedule as late as possible, since it
138 * produces a live value that is never killed/consumed. So detect
139 * outputs up-front, and avoid scheduling them unless they reduce
140 * register pressure (or at least are neutral).
141 */
142 bool output;
143 };
144
145 #define foreach_sched_node(__n, __list) \
146 list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link)
147
148 static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
149 static void sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i);
150
151 static bool is_scheduled(struct ir3_instruction *instr)
152 {
153 return !!(instr->flags & IR3_INSTR_MARK);
154 }
155
156 static void
157 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
158 {
159 debug_assert(ctx->block == instr->block);
160
161 /* remove from the unscheduled list:
162 */
163 list_delinit(&instr->node);
164
165 if (writes_addr0(instr)) {
166 debug_assert(ctx->addr0 == NULL);
167 ctx->addr0 = instr;
168 }
169
170 if (writes_addr1(instr)) {
171 debug_assert(ctx->addr1 == NULL);
172 ctx->addr1 = instr;
173 }
174
175 if (writes_pred(instr)) {
176 debug_assert(ctx->pred == NULL);
177 ctx->pred = instr;
178 }
179
180 instr->flags |= IR3_INSTR_MARK;
181
182 di(instr, "schedule");
183
184 list_addtail(&instr->node, &instr->block->instr_list);
185 ctx->scheduled = instr;
186
187 if (is_kill(instr)) {
188 assert(ctx->remaining_kills > 0);
189 ctx->remaining_kills--;
190 }
191
192 struct ir3_sched_node *n = instr->data;
193
194 /* If this instruction is a meta:collect src, mark the remaining
195 * collect srcs as partially live.
196 */
197 if (n->collect) {
198 struct ir3_instruction *src;
199 foreach_ssa_src (src, n->collect) {
200 if (src->block != instr->block)
201 continue;
202 struct ir3_sched_node *sn = src->data;
203 sn->partially_live = true;
204 }
205 }
206
207 dag_prune_head(ctx->dag, &n->dag);
208
209 if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
210 return;
211
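	/* Track roughly how long ago the last SFU was scheduled: a fresh SFU
	 * resets the countdown to 8 slots, scheduling a consumer of an SFU
	 * result means the (ss) sync is being paid now anyway so the countdown
	 * is cleared, and everything else just ticks it down.
	 */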
212 if (is_sfu(instr)) {
213 ctx->sfu_delay = 8;
214 } else if (check_src_cond(instr, is_sfu)) {
215 ctx->sfu_delay = 0;
216 } else if (ctx->sfu_delay > 0) {
217 ctx->sfu_delay--;
218 }
219
220 if (is_tex_or_prefetch(instr)) {
221 /* NOTE that this isn't an attempt to hide texture fetch latency,
222 * but an attempt to hide the cost of switching to another warp.
223 * If we can, we'd like to try to schedule another texture fetch
224 * before scheduling something that would sync.
225 */
226 ctx->tex_delay = 10;
227 assert(ctx->remaining_tex > 0);
228 ctx->remaining_tex--;
229 } else if (check_src_cond(instr, is_tex_or_prefetch)) {
230 ctx->tex_delay = 0;
231 } else if (ctx->tex_delay > 0) {
232 ctx->tex_delay--;
233 }
234 }
235
236 struct ir3_sched_notes {
237 /* there is at least one kill which could be scheduled, except
238 * for unscheduled bary.f's:
239 */
240 bool blocked_kill;
241 /* there is at least one instruction that could be scheduled,
242 * except for conflicting address/predicate register usage:
243 */
244 bool addr0_conflict, addr1_conflict, pred_conflict;
245 };
246
247 /* could an instruction be scheduled if specified ssa src was scheduled? */
248 static bool
249 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
250 {
251 struct ir3_instruction *other_src;
252 foreach_ssa_src (other_src, instr) {
253 /* if dependency not scheduled, we aren't ready yet: */
254 if ((src != other_src) && !is_scheduled(other_src)) {
255 return false;
256 }
257 }
258 return true;
259 }
260
261 /* Check if instruction is ok to schedule. Make sure it is not blocked
262 * by use of addr/predicate register, etc.
263 */
264 static bool
265 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
266 struct ir3_instruction *instr)
267 {
268 debug_assert(!is_scheduled(instr));
269
270 if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
271 /* avoid texture/memory access if we have unscheduled kills
272 * that could make the expensive operation unnecessary. By
273 * definition, if there are remaining kills, and this instr
274 * is not a dependency of a kill, there are other instructions
275 * that we can choose from.
276 */
277 struct ir3_sched_node *n = instr->data;
278 if (!n->kill_path)
279 return false;
280 }
281
282 /* For instructions that write the address register, we need to
283 * make sure there is at least one otherwise-ready instruction
284 * that uses the addr value.
285 *
286 * NOTE if any instructions use pred register and have other
287 * src args, we would need to do the same for writes_pred()..
288 */
289 if (writes_addr0(instr)) {
290 struct ir3 *ir = instr->block->shader;
291 bool ready = false;
292 for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
293 struct ir3_instruction *indirect = ir->a0_users[i];
294 if (!indirect)
295 continue;
296 if (indirect->address != instr)
297 continue;
298 ready = could_sched(indirect, instr);
299 }
300
301 /* nothing could be scheduled, so keep looking: */
302 if (!ready)
303 return false;
304 }
305
306 if (writes_addr1(instr)) {
307 struct ir3 *ir = instr->block->shader;
308 bool ready = false;
309 for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
310 struct ir3_instruction *indirect = ir->a1_users[i];
311 if (!indirect)
312 continue;
313 if (indirect->address != instr)
314 continue;
315 ready = could_sched(indirect, instr);
316 }
317
318 /* nothing could be scheduled, so keep looking: */
319 if (!ready)
320 return false;
321 }
322
323 /* if this is a write to address/predicate register, and that
324 * register is currently in use, we need to defer until it is
325 * free:
326 */
327 if (writes_addr0(instr) && ctx->addr0) {
328 debug_assert(ctx->addr0 != instr);
329 notes->addr0_conflict = true;
330 return false;
331 }
332
333 if (writes_addr1(instr) && ctx->addr1) {
334 debug_assert(ctx->addr1 != instr);
335 notes->addr1_conflict = true;
336 return false;
337 }
338
339 if (writes_pred(instr) && ctx->pred) {
340 debug_assert(ctx->pred != instr);
341 notes->pred_conflict = true;
342 return false;
343 }
344
345 /* if the instruction is a kill, we need to ensure *every*
346 * bary.f is scheduled. The hw seems unhappy if the thread
347 * gets killed before the end-input (ei) flag is hit.
348 *
349 * We could do this by adding each bary.f instruction as
350 * virtual ssa src for the kill instruction. But we have
351 * fixed length instr->regs[].
352 *
353 * TODO we could probably handle this with false-deps now.
354 */
355 if (is_kill(instr)) {
356 struct ir3 *ir = instr->block->shader;
357
358 for (unsigned i = 0; i < ir->baryfs_count; i++) {
359 struct ir3_instruction *baryf = ir->baryfs[i];
360 if (baryf->flags & IR3_INSTR_UNUSED)
361 continue;
362 if (!is_scheduled(baryf)) {
363 notes->blocked_kill = true;
364 return false;
365 }
366 }
367 }
368
369 return true;
370 }
371
372 /* Find the instr->ip of the closest use of an instruction, in
373 * pre-sched order. This isn't going to be the same as post-sched
374 * order, but it is a reasonable approximation to avoid scheduling
375 * instructions *too* early. This is mostly to prevent bad behavior
376 * in cases where we have a large number of possible instructions
377 * to choose from, to avoid creating too much parallelism (ie. blowing
378 * up register pressure).
379 *
380 * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
381 */
382 static int
383 nearest_use(struct ir3_instruction *instr)
384 {
385 unsigned nearest = ~0;
386 foreach_ssa_use (use, instr)
387 if (!is_scheduled(use))
388 nearest = MIN2(nearest, use->ip);
389
390 /* slight hack.. this heuristic tends to push bary.f's to later
391 * in the shader, closer to their uses. But we actually would
392 * prefer to get these scheduled earlier, to unlock varying
393 * storage for more VS jobs:
394 */
395 if (is_input(instr))
396 nearest /= 2;
397
398 return nearest;
399 }
400
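/* number of still-unscheduled uses of an ssa value: */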
401 static int
402 use_count(struct ir3_instruction *instr)
403 {
404 unsigned cnt = 0;
405 foreach_ssa_use (use, instr)
406 if (!is_scheduled(use))
407 cnt++;
408 return cnt;
409 }
410
411 /* find net change to live values if instruction were scheduled: */
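/* For example: scheduling a single-dest ALU op whose two scalar srcs each
 * have only this one use left makes one new value live and frees two, for
 * a net effect of -1 -- which is what the "freed" passes in
 * choose_instr_dec() look for (live_effect < 0).
 */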
412 static int
413 live_effect(struct ir3_instruction *instr)
414 {
415 struct ir3_sched_node *n = instr->data;
416 struct ir3_instruction *src;
417 int new_live = n->partially_live ? 0 : dest_regs(instr);
418 int freed_live = 0;
419
420 /* if we schedule something that causes a vecN to be live,
421 * then count all its other components too:
422 */
423 if (n->collect)
424 new_live *= n->collect->regs_count - 1;
425
426 foreach_ssa_src_n (src, n, instr) {
427 if (__is_false_dep(instr, n))
428 continue;
429
430 if (instr->block != src->block)
431 continue;
432
433 if (use_count(src) == 1)
434 freed_live += dest_regs(src);
435 }
436
437 return new_live - freed_live;
438 }
439
440 /* Determine if this is an instruction that we'd prefer not to schedule
441 * yet, in order to avoid an (ss)/(sy) sync. This is limited by the
442 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
443 * the last SFU/tex, the less costly a sync would be.
444 */
445 static bool
446 would_sync(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
447 {
448 if (ctx->sfu_delay) {
449 if (check_src_cond(instr, is_sfu))
450 return true;
451 }
452
453 /* We mostly just want to try to schedule another texture fetch
454 * before scheduling something that would (sy) sync, so we can
455 * limit this rule to cases where there are remaining texture
456 * fetches
457 */
458 if (ctx->tex_delay && ctx->remaining_tex) {
459 if (check_src_cond(instr, is_tex_or_prefetch))
460 return true;
461 }
462
463 return false;
464 }
465
466 static struct ir3_sched_node *
467 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
468 bool avoid_sync, bool avoid_output);
469
470 /**
471 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
472 * Scheduling for Register pressure) heuristic.
473 *
474 * Only handles the case of choosing instructions that reduce register pressure
475 * or are neutral.
476 */
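/* The passes below try, in order: instructions that free registers and are
 * ready (no delay slots), then freeing instructions that aren't ready yet,
 * then pressure-neutral ready instructions, then pressure-neutral ones that
 * aren't ready, falling back to choose_instr_inc() if none of those exist.
 * Within each pass, the candidate with the largest max_delay (longest
 * critical path) wins.
 */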
477 static struct ir3_sched_node *
478 choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
479 bool avoid_sync)
480 {
481 const char *mode = avoid_sync ? "-as" : "";
482 struct ir3_sched_node *chosen = NULL;
483
484 /* Find a ready inst with regs freed and pick the one with max cost. */
485 foreach_sched_node (n, &ctx->dag->heads) {
486 if (avoid_sync && would_sync(ctx, n->instr))
487 continue;
488
489 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
490
491 if (d > 0)
492 continue;
493
494 if (live_effect(n->instr) > -1)
495 continue;
496
497 if (!check_instr(ctx, notes, n->instr))
498 continue;
499
500 if (!chosen || chosen->max_delay < n->max_delay) {
501 chosen = n;
502 }
503 }
504
505 if (chosen) {
506 di(chosen->instr, "dec%s: chose (freed+ready)", mode);
507 return chosen;
508 }
509
510 /* Find a leader with regs freed and pick the one with max cost. */
511 foreach_sched_node (n, &ctx->dag->heads) {
512 if (avoid_sync && would_sync(ctx, n->instr))
513 continue;
514
515 if (live_effect(n->instr) > -1)
516 continue;
517
518 if (!check_instr(ctx, notes, n->instr))
519 continue;
520
521 if (!chosen || chosen->max_delay < n->max_delay) {
522 chosen = n;
523 }
524 }
525
526 if (chosen) {
527 di(chosen->instr, "dec%s: chose (freed)", mode);
528 return chosen;
529 }
530
531 /* Contra the paper, pick a leader with no effect on used regs. This may
532 * open up new opportunities, as otherwise a single-operand instr consuming
533 * a value will tend to block us from freeing that value. This had a
534 * massive effect on reducing spilling on V3D.
535 *
536 * XXX: Should this prioritize ready?
537 */
538 foreach_sched_node (n, &ctx->dag->heads) {
539 if (avoid_sync && would_sync(ctx, n->instr))
540 continue;
541
542 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
543
544 if (d > 0)
545 continue;
546
547 if (live_effect(n->instr) > 0)
548 continue;
549
550 if (!check_instr(ctx, notes, n->instr))
551 continue;
552
553 if (!chosen || chosen->max_delay < n->max_delay)
554 chosen = n;
555 }
556
557 if (chosen) {
558 di(chosen->instr, "dec%s: chose (neutral+ready)", mode);
559 return chosen;
560 }
561
562 foreach_sched_node (n, &ctx->dag->heads) {
563 if (avoid_sync && would_sync(ctx, n->instr))
564 continue;
565
566 if (live_effect(n->instr) > 0)
567 continue;
568
569 if (!check_instr(ctx, notes, n->instr))
570 continue;
571
572 if (!chosen || chosen->max_delay < n->max_delay)
573 chosen = n;
574 }
575
576 if (chosen) {
577 di(chosen->instr, "dec%s: chose (neutral)", mode);
578 return chosen;
579 }
580
581 return choose_instr_inc(ctx, notes, avoid_sync, true);
582 }
583
584 /**
585 * When we can't choose an instruction that reduces register pressure or
586 * is neutral, we end up here to try and pick the least bad option.
587 */
588 static struct ir3_sched_node *
589 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
590 bool avoid_sync, bool avoid_output)
591 {
592 const char *mode = avoid_sync ? "-as" : "";
593 struct ir3_sched_node *chosen = NULL;
594
595 /*
596 * From here on out, we are picking something that increases
597 * register pressure. So try to pick something which will
598 * be consumed soon:
599 */
600 unsigned chosen_distance = 0;
601
602 /* Among the ready instructions, pick the one whose result is consumed soonest (pre-sched order): */
603 foreach_sched_node (n, &ctx->dag->heads) {
604 if (avoid_output && n->output)
605 continue;
606
607 if (avoid_sync && would_sync(ctx, n->instr))
608 continue;
609
610 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
611
612 if (d > 0)
613 continue;
614
615 if (!check_instr(ctx, notes, n->instr))
616 continue;
617
618 unsigned distance = nearest_use(n->instr);
619
620 if (!chosen || distance < chosen_distance) {
621 chosen = n;
622 chosen_distance = distance;
623 }
624 }
625
626 if (chosen) {
627 di(chosen->instr, "inc%s: chose (distance+ready)", mode);
628 return chosen;
629 }
630
631 /* Same, but consider all remaining leaders, even ones that aren't ready yet: */
632 foreach_sched_node (n, &ctx->dag->heads) {
633 if (avoid_output && n->output)
634 continue;
635
636 if (avoid_sync && would_sync(ctx, n->instr))
637 continue;
638
639 if (!check_instr(ctx, notes, n->instr))
640 continue;
641
642 unsigned distance = nearest_use(n->instr);
643
644 if (!chosen || distance < chosen_distance) {
645 chosen = n;
646 chosen_distance = distance;
647 }
648 }
649
650 if (chosen) {
651 di(chosen->instr, "inc%s: chose (distance)", mode);
652 return chosen;
653 }
654
655 return NULL;
656 }
657
658 /* Handles selection of instructions that we want to prioritize even if
659 * CSP/CSR would not pick them.
660 */
661 static struct ir3_sched_node *
662 choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
663 {
664 struct ir3_sched_node *chosen = NULL;
665
666 foreach_sched_node (n, &ctx->dag->heads) {
667 if (!is_meta(n->instr))
668 continue;
669
670 if (!chosen || (chosen->max_delay < n->max_delay))
671 chosen = n;
672 }
673
674 if (chosen) {
675 di(chosen->instr, "prio: chose (meta)");
676 return chosen;
677 }
678
679 return NULL;
680 }
681
682 static void
683 dump_state(struct ir3_sched_ctx *ctx)
684 {
685 if (!SCHED_DEBUG)
686 return;
687
688 foreach_sched_node (n, &ctx->dag->heads) {
689 di(n->instr, "maxdel=%3d le=%d del=%u ",
690 n->max_delay, live_effect(n->instr),
691 ir3_delay_calc(ctx->block, n->instr, false, false));
692
693 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
694 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
695
696 di(child->instr, " -> (%d parents) ", child->dag.parent_count);
697 }
698 }
699 }
700
701 /* find instruction to schedule: */
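/* The overall priority order: certain meta instructions first, then the
 * register-pressure heuristics with (ss)/(sy) sync avoidance enabled, then
 * the same heuristics allowing syncs, and as a last resort
 * pressure-increasing instructions with no restrictions at all.
 */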
702 static struct ir3_instruction *
703 choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
704 {
705 struct ir3_sched_node *chosen;
706
707 dump_state(ctx);
708
709 chosen = choose_instr_prio(ctx, notes);
710 if (chosen)
711 return chosen->instr;
712
713 chosen = choose_instr_dec(ctx, notes, true);
714 if (chosen)
715 return chosen->instr;
716
717 chosen = choose_instr_dec(ctx, notes, false);
718 if (chosen)
719 return chosen->instr;
720
721 chosen = choose_instr_inc(ctx, notes, false, false);
722 if (chosen)
723 return chosen->instr;
724
725 return NULL;
726 }
727
728 static struct ir3_instruction *
729 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
730 {
731 struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
732 di(new_instr, "split instruction");
733 sched_node_init(ctx, new_instr);
734 return new_instr;
735 }
736
737 /* "spill" the address registers by remapping any unscheduled
738 * instructions which depend on the current address register
739 * to a clone of the instruction which wrote the address reg.
740 */
741 static struct ir3_instruction *
742 split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
743 struct ir3_instruction **users, unsigned users_count)
744 {
745 struct ir3_instruction *new_addr = NULL;
746 unsigned i;
747
748 debug_assert(*addr);
749
750 for (i = 0; i < users_count; i++) {
751 struct ir3_instruction *indirect = users[i];
752
753 if (!indirect)
754 continue;
755
756 /* skip instructions already scheduled: */
757 if (is_scheduled(indirect))
758 continue;
759
760 /* remap remaining instructions using current addr
761 * to new addr:
762 */
763 if (indirect->address == *addr) {
764 if (!new_addr) {
765 new_addr = split_instr(ctx, *addr);
766 /* original addr is scheduled, but new one isn't: */
767 new_addr->flags &= ~IR3_INSTR_MARK;
768 }
769 indirect->address = new_addr;
770 /* don't need to remove old dag edge since old addr is
771 * already scheduled:
772 */
773 sched_node_add_dep(indirect, new_addr, 0);
774 di(indirect, "new address");
775 }
776 }
777
778 /* all remaining indirects remapped to new addr: */
779 *addr = NULL;
780
781 return new_addr;
782 }
783
784 /* "spill" the predicate register by remapping any unscheduled
785 * instructions which depend on the current predicate register
786 * to a clone of the instruction which wrote the predicate reg.
787 */
788 static struct ir3_instruction *
789 split_pred(struct ir3_sched_ctx *ctx)
790 {
791 struct ir3 *ir;
792 struct ir3_instruction *new_pred = NULL;
793 unsigned i;
794
795 debug_assert(ctx->pred);
796
797 ir = ctx->pred->block->shader;
798
799 for (i = 0; i < ir->predicates_count; i++) {
800 struct ir3_instruction *predicated = ir->predicates[i];
801
802 /* skip instructions already scheduled: */
803 if (is_scheduled(predicated))
804 continue;
805
806 /* remap remaining instructions using current pred
807 * to new pred:
808 *
809 * TODO is there ever a case when pred isn't first
810 * (and only) src?
811 */
812 if (ssa(predicated->regs[1]) == ctx->pred) {
813 if (!new_pred) {
814 new_pred = split_instr(ctx, ctx->pred);
815 /* original pred is scheduled, but new one isn't: */
816 new_pred->flags &= ~IR3_INSTR_MARK;
817 }
818 predicated->regs[1]->instr = new_pred;
819 /* don't need to remove old dag edge since old pred is
820 * already scheduled:
821 */
822 sched_node_add_dep(predicated, new_pred, 0);
823 di(predicated, "new predicate");
824 }
825 }
826
827 /* all remaining predicated remapped to new pred: */
828 ctx->pred = NULL;
829
830 return new_pred;
831 }
832
833 static void
834 sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
835 {
836 struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);
837
838 dag_init_node(ctx->dag, &n->dag);
839
840 n->instr = instr;
841 instr->data = n;
842 }
843
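/* add a DAG edge from src's node to instr's node, and accumulate the
 * worst-case number of delay slots instr would need after its srcs:
 */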
844 static void
845 sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i)
846 {
847 /* don't consider dependencies in other blocks: */
848 if (src->block != instr->block)
849 return;
850
851 /* we could have false-dep's that end up unused: */
852 if (src->flags & IR3_INSTR_UNUSED) {
853 debug_assert(__is_false_dep(instr, i));
854 return;
855 }
856
857 struct ir3_sched_node *n = instr->data;
858 struct ir3_sched_node *sn = src->data;
859
860 /* If src is consumed by a collect, track that to realize that once
861 * any of the collect srcs are live, we should hurry up and schedule
862 * the rest.
863 */
864 if (instr->opc == OPC_META_COLLECT)
865 sn->collect = instr;
866
867 dag_add_edge(&sn->dag, &n->dag, NULL);
868
869 unsigned d = ir3_delayslots(src, instr, i, true);
870 n->delay = MAX2(n->delay, d);
871 }
872
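/* recursively mark an instruction and its (same-block) ssa srcs as being
 * on the path to a kill, so check_instr() will still consider them while
 * kills remain unscheduled:
 */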
873 static void
874 mark_kill_path(struct ir3_instruction *instr)
875 {
876 struct ir3_sched_node *n = instr->data;
877 n->kill_path = true;
878 struct ir3_instruction *src;
879 foreach_ssa_src (src, instr) {
880 if (src->block != instr->block)
881 continue;
882 mark_kill_path(src);
883 }
884 }
885
886 /* Is it an output? */
887 static bool
888 is_output_collect(struct ir3_instruction *instr)
889 {
890 struct ir3 *ir = instr->block->shader;
891
892 for (unsigned i = 0; i < ir->outputs_count; i++) {
893 struct ir3_instruction *collect = ir->outputs[i];
894 assert(collect->opc == OPC_META_COLLECT);
895 if (instr == collect)
896 return true;
897 }
898
899 return false;
900 }
901
902 /* Is its only use as an output? */
903 static bool
904 is_output_only(struct ir3_instruction *instr)
905 {
906 if (!writes_gpr(instr))
907 return false;
908
909 if (!(instr->regs[0]->flags & IR3_REG_SSA))
910 return false;
911
912 foreach_ssa_use (use, instr)
913 if (!is_output_collect(use))
914 return false;
915
916 return true;
917 }
918
919 static void
920 sched_node_add_deps(struct ir3_instruction *instr)
921 {
922 struct ir3_instruction *src;
923
924 /* Since foreach_ssa_src_n() already handles false-dep's we can construct
925 * the DAG easily in a single pass.
926 */
927 foreach_ssa_src_n (src, i, instr) {
928 sched_node_add_dep(instr, src, i);
929 }
930
931 /* NOTE that all inputs must be scheduled before a kill, so
932 * mark these to be prioritized as well:
933 */
934 if (is_kill(instr) || is_input(instr)) {
935 mark_kill_path(instr);
936 }
937
938 if (is_output_only(instr)) {
939 struct ir3_sched_node *n = instr->data;
940 n->output = true;
941 }
942 }
943
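/* bottom-up DAG traversal callback: a node's max_delay is its own delay
 * plus the largest max_delay of any of its children, ie. the length of the
 * critical path below it. Used as the tie-breaker in choose_instr_dec()
 * and choose_instr_prio().
 */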
944 static void
945 sched_dag_max_delay_cb(struct dag_node *node, void *state)
946 {
947 struct ir3_sched_node *n = (struct ir3_sched_node *)node;
948 uint32_t max_delay = 0;
949
950 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
951 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
952 max_delay = MAX2(child->max_delay, max_delay);
953 }
954
955 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
956 }
957
958 static void
959 sched_dag_init(struct ir3_sched_ctx *ctx)
960 {
961 ctx->dag = dag_create(ctx);
962
963 foreach_instr (instr, &ctx->unscheduled_list) {
964 sched_node_init(ctx, instr);
965 sched_node_add_deps(instr);
966 }
967
968 dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
969 }
970
971 static void
972 sched_dag_destroy(struct ir3_sched_ctx *ctx)
973 {
974 ralloc_free(ctx->dag);
975 ctx->dag = NULL;
976 }
977
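/* schedule a single block: build the DAG of unscheduled instructions,
 * schedule meta:input and tex-prefetch instructions up front, then
 * repeatedly pick instructions until the unscheduled list is empty,
 * breaking a0.x/a1.x/p0.x deadlocks by cloning the register writer:
 */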
978 static void
979 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
980 {
981 ctx->block = block;
982
983 /* addr/pred writes are per-block: */
984 ctx->addr0 = NULL;
985 ctx->addr1 = NULL;
986 ctx->pred = NULL;
987
988 /* move all instructions to the unscheduled list, and
989 * empty the block's instruction list (to which we will
990 * be inserting).
991 */
992 list_replace(&block->instr_list, &ctx->unscheduled_list);
993 list_inithead(&block->instr_list);
994
995 sched_dag_init(ctx);
996
997 ctx->remaining_kills = 0;
998 ctx->remaining_tex = 0;
999 foreach_instr_safe (instr, &ctx->unscheduled_list) {
1000 if (is_kill(instr))
1001 ctx->remaining_kills++;
1002 if (is_tex_or_prefetch(instr))
1003 ctx->remaining_tex++;
1004 }
1005
1006 /* First schedule all meta:input instructions, followed by
1007 * tex-prefetch. We want all of the instructions that load
1008 * values into registers before the shader starts to go
1009 * before any other instructions. But in particular we
1010 * want inputs to come before prefetches. This is because
1011 * a FS's bary_ij input may not actually be live in the
1012 * shader, but it should not be scheduled on top of any
1013 * other input (but can be overwritten by a tex prefetch)
1014 */
1015 foreach_instr_safe (instr, &ctx->unscheduled_list)
1016 if (instr->opc == OPC_META_INPUT)
1017 schedule(ctx, instr);
1018
1019 foreach_instr_safe (instr, &ctx->unscheduled_list)
1020 if (instr->opc == OPC_META_TEX_PREFETCH)
1021 schedule(ctx, instr);
1022
1023 while (!list_is_empty(&ctx->unscheduled_list)) {
1024 struct ir3_sched_notes notes = {0};
1025 struct ir3_instruction *instr;
1026
1027 instr = choose_instr(ctx, &notes);
1028 if (instr) {
1029 unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
1030 d("delay=%u", delay);
1031
1032 /* if the chosen instruction still needs delay slots and we've run
1033 * out of instructions to fill them with, then it is time for nop's:
1034 */
1035 debug_assert(delay <= 6);
1036 while (delay > 0) {
1037 ir3_NOP(block);
1038 delay--;
1039 }
1040
1041 schedule(ctx, instr);
1042 } else {
1043 struct ir3_instruction *new_instr = NULL;
1044 struct ir3 *ir = block->shader;
1045
1046 /* nothing available to schedule.. if we are blocked on
1047 * address/predicate register conflict, then break the
1048 * deadlock by cloning the instruction that wrote that
1049 * reg:
1050 */
1051 if (notes.addr0_conflict) {
1052 new_instr = split_addr(ctx, &ctx->addr0,
1053 ir->a0_users, ir->a0_users_count);
1054 } else if (notes.addr1_conflict) {
1055 new_instr = split_addr(ctx, &ctx->addr1,
1056 ir->a1_users, ir->a1_users_count);
1057 } else if (notes.pred_conflict) {
1058 new_instr = split_pred(ctx);
1059 } else {
1060 d("unscheduled_list:");
1061 foreach_instr (instr, &ctx->unscheduled_list)
1062 di(instr, "unscheduled: ");
1063 debug_assert(0);
1064 ctx->error = true;
1065 return;
1066 }
1067
1068 if (new_instr) {
1069 list_delinit(&new_instr->node);
1070 list_addtail(&new_instr->node, &ctx->unscheduled_list);
1071 }
1072 }
1073 }
1074
1075 sched_dag_destroy(ctx);
1076 }
1077
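/* Pre-RA scheduling entry point. Returns 0 on success, or -1 if we got
 * stuck and could not schedule a block (which should not normally happen).
 */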
1078 int ir3_sched(struct ir3 *ir)
1079 {
1080 struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
1081
1082 foreach_block (block, &ir->block_list) {
1083 foreach_instr (instr, &block->instr_list) {
1084 instr->data = NULL;
1085 }
1086 }
1087
1088 ir3_count_instructions(ir);
1089 ir3_clear_mark(ir);
1090 ir3_find_ssa_uses(ir, ctx, false);
1091
1092 foreach_block (block, &ir->block_list) {
1093 sched_block(ctx, block);
1094 }
1095
1096 int ret = ctx->error ? -1 : 0;
1097
1098 ralloc_free(ctx);
1099
1100 return ret;
1101 }
1102
1103 static unsigned
1104 get_array_id(struct ir3_instruction *instr)
1105 {
1106 /* The expectation is that there is only a single array
1107 * src or dst, ir3_cp should enforce this.
1108 */
1109
1110 for (unsigned i = 0; i < instr->regs_count; i++)
1111 if (instr->regs[i]->flags & IR3_REG_ARRAY)
1112 return instr->regs[i]->array.id;
1113
1114 unreachable("this was unexpected");
1115 }
1116
1117 /* does instruction 'prior' need to be scheduled before 'instr'? */
1118 static bool
1119 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
1120 {
1121 /* TODO for dependencies that are related to a specific object, ie
1122 * a specific SSBO/image/array, we could relax this constraint to
1123 * make accesses to unrelated objects not depend on each other (at
1124 * least as long as not declared coherent)
1125 */
1126 if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
1127 ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
1128 return true;
1129
1130 if (instr->barrier_class & prior->barrier_conflict) {
1131 if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
1132 /* if only array barrier, then we can further limit false-deps
1133 * by considering the array-id, ie reads/writes to different
1134 * arrays do not depend on each other (no aliasing)
1135 */
1136 if (get_array_id(instr) != get_array_id(prior)) {
1137 return false;
1138 }
1139 }
1140
1141 return true;
1142 }
1143
1144 return false;
1145 }
1146
1147 static void
1148 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
1149 {
1150 struct list_head *prev = instr->node.prev;
1151 struct list_head *next = instr->node.next;
1152
1153 /* add dependencies on previous instructions that must be scheduled
1154 * prior to the current instruction
1155 */
1156 while (prev != &block->instr_list) {
1157 struct ir3_instruction *pi =
1158 LIST_ENTRY(struct ir3_instruction, prev, node);
1159
1160 prev = prev->prev;
1161
1162 if (is_meta(pi))
1163 continue;
1164
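		/* instructions of the same barrier class are (transitively)
		 * ordered amongst themselves already, so a single dep on the
		 * closest one should be enough to keep the rest ordered:
		 */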
1165 if (instr->barrier_class == pi->barrier_class) {
1166 ir3_instr_add_dep(instr, pi);
1167 break;
1168 }
1169
1170 if (depends_on(instr, pi))
1171 ir3_instr_add_dep(instr, pi);
1172 }
1173
1174 /* add dependencies on this instruction to following instructions
1175 * that must be scheduled after the current instruction:
1176 */
1177 while (next != &block->instr_list) {
1178 struct ir3_instruction *ni =
1179 LIST_ENTRY(struct ir3_instruction, next, node);
1180
1181 next = next->next;
1182
1183 if (is_meta(ni))
1184 continue;
1185
1186 if (instr->barrier_class == ni->barrier_class) {
1187 ir3_instr_add_dep(ni, instr);
1188 break;
1189 }
1190
1191 if (depends_on(ni, instr))
1192 ir3_instr_add_dep(ni, instr);
1193 }
1194 }
1195
1196 /* before scheduling a block, we need to add any necessary false-dependencies
1197 * to ensure that:
1198 *
1199 * (1) barriers are scheduled in the right order wrt instructions related
1200 * to the barrier
1201 *
1202 * (2) reads that come before a write actually get scheduled before the
1203 * write
1204 */
1205 static void
1206 calculate_deps(struct ir3_block *block)
1207 {
1208 foreach_instr (instr, &block->instr_list) {
1209 if (instr->barrier_class) {
1210 add_barrier_deps(block, instr);
1211 }
1212 }
1213 }
1214
1215 void
1216 ir3_sched_add_deps(struct ir3 *ir)
1217 {
1218 foreach_block (block, &ir->block_list) {
1219 calculate_deps(block);
1220 }
1221 }