nir/scheduler: Handle nir_intrinsic_load_per_vertex_input
[mesa.git] src/compiler/nir/nir_schedule.c
1 /*
2 * Copyright © 2019 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir.h"
25 #include "util/dag.h"
26 #include "util/u_dynarray.h"
27
28 /** @file
29 *
30 * Implements basic-block-level prepass instruction scheduling in NIR to
31 * manage register pressure.
32 *
33 * This is based on the Goodman/Hsu paper (1988, cached copy at
34 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
35 * make up the DDG for NIR (which can be mostly done using the NIR def/use
36 * chains for SSA instructions, plus some edges for ordering register writes
37 * vs reads, and some more for ordering intrinsics). Then we pick heads off
38 * of the DDG using their heuristic to emit the NIR instructions back into the
39 * block in their new order.
40 *
41 * The hard case for prepass scheduling on GPUs seems to always be consuming
42 * texture/ubo results. The register pressure heuristic doesn't want to pick
43 * an instr that starts consuming texture results because it usually won't be
44 * the only usage, so that instruction will increase pressure.
45 *
46 * If you try to force consumption of tex results always, then in a case where
47 * a single sample is used for many outputs, you'll end up picking every other
48 * user and expanding register pressure. The partially_evaluated_path flag
49 * helps tremendously, in that if you happen for whatever reason to pick a
50 * texture sample's output, then you'll try to finish off that sample. Future
51 * work may include doing some local search before locking in a choice, to try
52 * to more reliably find the case where just a few choices going against the
53 * heuristic can manage to free the whole vector.
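*
* As a small illustration of the pressure problem (not tied to any one
* shader): if a 4-channel texture result has three separate ALU users,
* scheduling just one of them keeps all four channels live while adding that
* user's own destination, so pressure goes up; only once every user has been
* scheduled does the texture result die and pressure drop by 4.
* partially_evaluated_path is what nudges the pressure heuristic toward
* scheduling the remaining users at that point.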
54 */
55
56 static bool debug;
57
58 /**
59 * Represents a node in the DDG for a NIR instruction.
60 */
61 typedef struct {
62 struct dag_node dag; /* must be first for our u_dynarray_foreach */
63 nir_instr *instr;
64 bool partially_evaluated_path;
65
66 /* Rough estimate of the delay between starting this instruction and
67 * its results being available.
68 *
69 * Accuracy is not too important, given that we're prepass scheduling here
70 * and just trying to reduce excess dependencies introduced by a register
71 * allocator by stretching out the live intervals of expensive
72 * instructions.
73 */
74 uint32_t delay;
75
76 /* Cost of the maximum-delay path from this node to the leaves. */
77 uint32_t max_delay;
78
79 /* scoreboard->time value when this instruction can be scheduled without
80 * any stalls expected.
81 */
82 uint32_t ready_time;
83 } nir_schedule_node;
84
85 typedef struct {
86 struct dag *dag;
87
88 nir_shader *shader;
89
90 /* Mapping from nir_register * or nir_ssa_def * to a struct set of
91 * instructions remaining to be scheduled using the register.
92 */
93 struct hash_table *remaining_uses;
94
95 /* Map from nir_instr to nir_schedule_node * */
96 struct hash_table *instr_map;
97
98 /* Set of nir_register * or nir_ssa_def * that have had any instruction
99 * scheduled on them.
100 */
101 struct set *live_values;
102
103 /* An abstract approximation of the number of nir_schedule_node->delay
104 * units since the start of the shader.
105 */
106 uint32_t time;
107
108 /* Number of channels currently used by the NIR instructions that have been
109 * scheduled.
110 */
111 int pressure;
112
113 /* Number of channels that may be in use before we switch to the
114 * pressure-prioritizing scheduling heuristic.
115 */
116 int threshold;
117 } nir_schedule_scoreboard;
118
119 /* When walking the instructions in reverse, we use this flag to swap
120 * before/after in add_dep().
121 */
122 enum direction { F, R };
123
124 typedef struct {
125 nir_shader *shader;
126
127 /* Map from nir_instr to nir_schedule_node * */
128 struct hash_table *instr_map;
129 /* Map from nir_register to nir_schedule_node * */
130 struct hash_table *reg_map;
131
132 /* Scheduler nodes for the last instruction involved in each class of dependency.
133 */
134 nir_schedule_node *load_input;
135 nir_schedule_node *store_shared;
136 nir_schedule_node *unknown_intrinsic;
137 nir_schedule_node *discard;
138 nir_schedule_node *jump;
139
140 enum direction dir;
141 } nir_deps_state;
142
143 static void *
144 _mesa_hash_table_search_data(struct hash_table *ht, void *key)
145 {
146 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
147 if (!entry)
148 return NULL;
149 return entry->data;
150 }
151
152 static nir_schedule_node *
153 nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
154 {
155 return _mesa_hash_table_search_data(instr_map, instr);
156 }
157
158 static struct set *
159 nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
160 {
161 if (src->is_ssa) {
162 return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
163 } else {
164 return _mesa_hash_table_search_data(scoreboard->remaining_uses,
165 src->reg.reg);
166 }
167 }
168
169 static int
170 nir_schedule_def_pressure(nir_ssa_def *def)
171 {
172 return def->num_components;
173 }
174
175 static int
176 nir_schedule_src_pressure(nir_src *src)
177 {
178 if (src->is_ssa)
179 return nir_schedule_def_pressure(src->ssa);
180 else
181 return src->reg.reg->num_components;
182 }
183
184 static int
185 nir_schedule_dest_pressure(nir_dest *dest)
186 {
187 if (dest->is_ssa)
188 return nir_schedule_def_pressure(&dest->ssa);
189 else
190 return dest->reg.reg->num_components;
191 }
192
193 /**
194 * Adds a dependency such that @after must appear in the final program after
195 * @before.
196 *
197 * We add @after as a child of @before, so that DAG heads are instructions
198 * with no remaining dependencies and scheduling proceeds from the top down.
199 */
200 static void
201 add_dep(nir_deps_state *state,
202 nir_schedule_node *before,
203 nir_schedule_node *after)
204 {
205 if (!before || !after)
206 return;
207
208 assert(before != after);
209
210 if (state->dir == F)
211 dag_add_edge(&before->dag, &after->dag, NULL);
212 else
213 dag_add_edge(&after->dag, &before->dag, NULL);
214 }
215
216
217 static void
218 add_read_dep(nir_deps_state *state,
219 nir_schedule_node *before,
220 nir_schedule_node *after)
221 {
222 add_dep(state, before, after);
223 }
224
225 static void
226 add_write_dep(nir_deps_state *state,
227 nir_schedule_node **before,
228 nir_schedule_node *after)
229 {
230 add_dep(state, *before, after);
231 *before = after;
232 }
233
234 static bool
235 nir_schedule_reg_src_deps(nir_src *src, void *in_state)
236 {
237 nir_deps_state *state = in_state;
238
239 if (src->is_ssa)
240 return true;
241
242 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
243 src->reg.reg);
244 if (!entry)
245 return true;
246 nir_schedule_node *dst_n = entry->data;
247
248 nir_schedule_node *src_n = nir_schedule_get_node(state->instr_map,
249 src->parent_instr);
250
251 add_dep(state, dst_n, src_n);
252
253 return true;
254 }
255
256 static bool
257 nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
258 {
259 nir_deps_state *state = in_state;
260
261 if (dest->is_ssa)
262 return true;
263
264 nir_schedule_node *dest_n = nir_schedule_get_node(state->instr_map,
265 dest->reg.parent_instr);
266
267 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
268 dest->reg.reg);
269 if (!entry) {
270 _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
271 return true;
272 }
273 nir_schedule_node **before = (nir_schedule_node **)&entry->data;
274
275 add_write_dep(state, before, dest_n);
276
277 return true;
278 }
279
280 static bool
281 nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
282 {
283 nir_deps_state *state = in_state;
284 nir_schedule_node *def_n = nir_schedule_get_node(state->instr_map, def->parent_instr);
285
286 nir_foreach_use(src, def) {
287 nir_schedule_node *use_n = nir_schedule_get_node(state->instr_map,
288 src->parent_instr);
289
290 add_read_dep(state, def_n, use_n);
291 }
292
293 return true;
294 }
295
296 static void
297 nir_schedule_intrinsic_deps(nir_deps_state *state,
298 nir_intrinsic_instr *instr)
299 {
300 nir_schedule_node *n = nir_schedule_get_node(state->instr_map, &instr->instr);
301
302 switch (instr->intrinsic) {
303 case nir_intrinsic_load_uniform:
304 case nir_intrinsic_load_ubo:
305 case nir_intrinsic_load_front_face:
306 break;
307
308 case nir_intrinsic_discard:
309 case nir_intrinsic_discard_if:
310 /* We are adding two dependencies:
311 *
312 * * An individual one that we could use to add a read_dep while handling
313 * nir_instr_type_tex
314 *
315 * * One on the unknown-intrinsic set, as we want discards to be
316 * serialized in the same order relative to intervening stores or
317 * atomic accesses to SSBOs and images
318 */
319 add_write_dep(state, &state->discard, n);
320 add_write_dep(state, &state->unknown_intrinsic, n);
321 break;
322
323 case nir_intrinsic_store_output:
324 /* For some non-FS shader stages, or for some hardware, output stores
325 * affect the same shared memory as input loads.
326 */
327 if (state->shader->info.stage != MESA_SHADER_FRAGMENT)
328 add_write_dep(state, &state->load_input, n);
329
330 /* Make sure that preceding discards stay before the store_output */
331 add_read_dep(state, state->discard, n);
332
333 break;
334
335 case nir_intrinsic_load_input:
336 case nir_intrinsic_load_per_vertex_input:
337 add_read_dep(state, state->load_input, n);
338 break;
339
340 case nir_intrinsic_load_shared:
341 /* Don't move a load_shared beyond a following store_shared, as that could
342 * change its value.
343 */
344 add_read_dep(state, state->store_shared, n);
345 break;
346
347 case nir_intrinsic_store_shared:
348 add_write_dep(state, &state->store_shared, n);
349 break;
350
351 case nir_intrinsic_control_barrier:
352 case nir_intrinsic_memory_barrier_shared:
353 add_write_dep(state, &state->store_shared, n);
354
355 /* Serialize against ssbos/atomics/etc. */
356 add_write_dep(state, &state->unknown_intrinsic, n);
357 break;
358
359 default:
360 /* Attempt to handle other intrinsics that we haven't individually
361 * categorized by serializing them in the same order relative to each
362 * other.
363 */
364 add_write_dep(state, &state->unknown_intrinsic, n);
365 break;
366 }
367 }
368
369 /**
370 * Common code for dependencies that need to be tracked both forward and
371 * backward.
372 *
373 * This is for things like "all reads of r4 have to happen between the r4
374 * writes that surround them".
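*
* For instance, given "r4 = ...; use(r4); use(r4); r4 = ...", the forward
* pass makes both reads depend on the first write (and the second write on
* the first), while the reverse pass makes the second write depend on both
* reads, so the reads may swap with each other but never move across either
* write.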
375 */
376 static void
377 nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
378 {
379 nir_instr *instr = n->instr;
380
381 /* For NIR SSA defs, we only need to do a single pass of making the uses
382 * depend on the def.
383 */
384 if (state->dir == F)
385 nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
386
387 /* For NIR regs, track the last writer in the scheduler state so that we
388 * can keep the writes in order and let reads be reordered only between
389 * the writes that surround them.
390 */
391 nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
392
393 nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
394
395 /* Make sure any other instructions keep their positions relative to
396 * jumps.
397 */
398 if (instr->type != nir_instr_type_jump)
399 add_read_dep(state, state->jump, n);
400
401 switch (instr->type) {
402 case nir_instr_type_ssa_undef:
403 case nir_instr_type_load_const:
404 case nir_instr_type_alu:
405 case nir_instr_type_deref:
406 break;
407
408 case nir_instr_type_tex:
409 /* Don't move texture ops before a discard, as that could increase
410 * memory bandwidth for reading the discarded samples.
411 */
412 add_read_dep(state, state->discard, n);
413 break;
414
415 case nir_instr_type_jump:
416 add_write_dep(state, &state->jump, n);
417 break;
418
419 case nir_instr_type_call:
420 unreachable("Calls should have been lowered");
421 break;
422
423 case nir_instr_type_parallel_copy:
424 unreachable("Parallel copies should have been lowered");
425 break;
426
427 case nir_instr_type_phi:
428 unreachable("nir_schedule() should be called after lowering from SSA");
429 break;
430
431 case nir_instr_type_intrinsic:
432 nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
433 break;
434 }
435 }
436
437 static void
438 calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
439 {
440 nir_deps_state state = {
441 .shader = scoreboard->shader,
442 .dir = F,
443 .instr_map = scoreboard->instr_map,
444 .reg_map = _mesa_pointer_hash_table_create(NULL),
445 };
446
447 nir_foreach_instr(instr, block) {
448 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
449 instr);
450 nir_schedule_calculate_deps(&state, node);
451 }
452
453 ralloc_free(state.reg_map);
454 }
455
456 static void
457 calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
458 {
459 nir_deps_state state = {
460 .shader = scoreboard->shader,
461 .dir = R,
462 .instr_map = scoreboard->instr_map,
463 .reg_map = _mesa_pointer_hash_table_create(NULL),
464 };
465
466 nir_foreach_instr_reverse(instr, block) {
467 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
468 instr);
469 nir_schedule_calculate_deps(&state, node);
470 }
471
472 ralloc_free(state.reg_map);
473 }
474
475 typedef struct {
476 nir_schedule_scoreboard *scoreboard;
477 int regs_freed;
478 } nir_schedule_regs_freed_state;
479
480 static bool
481 nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
482 {
483 nir_schedule_regs_freed_state *state = in_state;
484 nir_schedule_scoreboard *scoreboard = state->scoreboard;
485 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
486
487 if (remaining_uses->entries == 1 &&
488 _mesa_set_search(remaining_uses, src->parent_instr)) {
489 state->regs_freed += nir_schedule_src_pressure(src);
490 }
491
492 return true;
493 }
494
495 static bool
496 nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
497 {
498 nir_schedule_regs_freed_state *state = in_state;
499
500 state->regs_freed -= nir_schedule_def_pressure(def);
501
502 return true;
503 }
504
505 static bool
506 nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
507 {
508 nir_schedule_regs_freed_state *state = in_state;
509 nir_schedule_scoreboard *scoreboard = state->scoreboard;
510
511 if (dest->is_ssa)
512 return true;
513
514 nir_register *reg = dest->reg.reg;
515
516 /* Only the first def of a reg counts against register pressure. */
517 if (!_mesa_set_search(scoreboard->live_values, reg))
518 state->regs_freed -= nir_schedule_dest_pressure(dest);
519
520 return true;
521 }
522
523 static int
524 nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
525 {
526 nir_schedule_regs_freed_state state = {
527 .scoreboard = scoreboard,
528 };
529
530 nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
531
532 nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
533
534 nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
535
536 return state.regs_freed;
537 }
538
539 /**
540 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
541 * Scheduling for Parallelism) heuristic.
542 *
543 * Picks the highest-cost instruction that's ready to execute without
544 * stalls, if possible; otherwise picks the instruction on the critical path.
545 */
546 static nir_schedule_node *
547 nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
548 {
549 nir_schedule_node *chosen = NULL;
550
551 /* Find the leader in the ready (shouldn't-stall) set with the maximum
552 * cost.
553 */
554 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
555 if (scoreboard->time < n->ready_time)
556 continue;
557
558 if (!chosen || chosen->max_delay < n->max_delay)
559 chosen = n;
560 }
561 if (chosen) {
562 if (debug) {
563 fprintf(stderr, "chose (ready): ");
564 nir_print_instr(chosen->instr, stderr);
565 fprintf(stderr, "\n");
566 }
567
568 return chosen;
569 }
570
571 /* Otherwise, choose the leader with the maximum cost. */
572 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
573 if (!chosen || chosen->max_delay < n->max_delay)
574 chosen = n;
575 }
576 if (debug) {
577 fprintf(stderr, "chose (leader): ");
578 nir_print_instr(chosen->instr, stderr);
579 fprintf(stderr, "\n");
580 }
581
582 return chosen;
583 }
584
585 /**
586 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
587 * Scheduling for Register pressure) heuristic.
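*
* The fallback order below is roughly: a ready instruction that frees
* registers, then any leader that frees registers, then a leader on a
* partially evaluated path, then a leader with no net effect on register
* pressure, then the ready instruction with the maximum delay, and finally
* the leader with the maximum delay.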
588 */
589 static nir_schedule_node *
590 nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
591 {
592 nir_schedule_node *chosen = NULL;
593
594 /* Find a ready inst with regs freed and pick the one with max cost. */
595 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
596 if (n->ready_time > scoreboard->time)
597 continue;
598
599 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
600
601 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
602 chosen = n;
603 }
604 }
605 if (chosen) {
606 if (debug) {
607 fprintf(stderr, "chose (freed+ready): ");
608 nir_print_instr(chosen->instr, stderr);
609 fprintf(stderr, "\n");
610 }
611
612 return chosen;
613 }
614
615 /* Find a leader with regs freed and pick the one with max cost. */
616 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
617 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
618
619 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
620 chosen = n;
621 }
622 }
623 if (chosen) {
624 if (debug) {
625 fprintf(stderr, "chose (regs freed): ");
626 nir_print_instr(chosen->instr, stderr);
627 fprintf(stderr, "\n");
628 }
629
630 return chosen;
631 }
632
633 /* Find a partially evaluated path and try to finish it off */
634 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
635 if (n->partially_evaluated_path &&
636 (!chosen || chosen->max_delay < n->max_delay)) {
637 chosen = n;
638 }
639 }
640 if (chosen) {
641 if (debug) {
642 fprintf(stderr, "chose (partial path): ");
643 nir_print_instr(chosen->instr, stderr);
644 fprintf(stderr, "\n");
645 }
646
647 return chosen;
648 }
649
650 /* Contra the paper, pick a leader with no effect on used regs. This may
651 * open up new opportunities, as otherwise a single-operand instr consuming
652 * a value will tend to keep us from ever freeing that value. This had a
653 * massive effect on reducing spilling on V3D.
654 *
655 * XXX: Should this prioritize ready?
656 */
657 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
658 if (nir_schedule_regs_freed(scoreboard, n) != 0)
659 continue;
660
661 if (!chosen || chosen->max_delay < n->max_delay)
662 chosen = n;
663 }
664 if (chosen) {
665 if (debug) {
666 fprintf(stderr, "chose (regs no-op): ");
667 nir_print_instr(chosen->instr, stderr);
668 fprintf(stderr, "\n");
669 }
670
671 return chosen;
672 }
673
674 /* Pick the max delay of the remaining ready set. */
675 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
676 if (n->ready_time > scoreboard->time)
677 continue;
678 if (!chosen || chosen->max_delay < n->max_delay)
679 chosen = n;
680 }
681 if (chosen) {
682 if (debug) {
683 fprintf(stderr, "chose (ready max delay): ");
684 nir_print_instr(chosen->instr, stderr);
685 fprintf(stderr, "\n");
686 }
687 return chosen;
688 }
689
690 /* Pick the max delay of the remaining leaders. */
691 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
692 if (!chosen || chosen->max_delay < n->max_delay)
693 chosen = n;
694 }
695
696 if (debug) {
697 fprintf(stderr, "chose (max delay): ");
698 nir_print_instr(chosen->instr, stderr);
699 fprintf(stderr, "\n");
700 }
701
702 return chosen;
703 }
704
705 static void
706 dump_state(nir_schedule_scoreboard *scoreboard)
707 {
708 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
709 fprintf(stderr, "maxdel %5d ", n->max_delay);
710 nir_print_instr(n->instr, stderr);
711 fprintf(stderr, "\n");
712
713 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
714 nir_schedule_node *child = (nir_schedule_node *)edge->child;
715
716 fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
717 nir_print_instr(child->instr, stderr);
718 fprintf(stderr, "\n");
719 }
720 }
721 }
722
723 static void
724 nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
725 void *reg_or_def,
726 nir_instr *reg_or_def_parent,
727 int pressure)
728 {
729 /* Make the value live if it's the first time it's been used. */
730 if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
731 _mesa_set_add(scoreboard->live_values, reg_or_def);
732 scoreboard->pressure += pressure;
733 }
734
735 /* Make the value dead if it's the last remaining use. Be careful when one
736 * instruction uses a value twice to not decrement pressure twice.
737 */
738 struct set *remaining_uses =
739 _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
740 struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
741 if (entry) {
742 _mesa_set_remove(remaining_uses, entry);
743
744 if (remaining_uses->entries == 0)
745 scoreboard->pressure -= pressure;
746 }
747 }
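
/* A sketch of how the bookkeeping above plays out for an SSA value:
 * scheduling the defining instruction reaches nir_schedule_mark_use() via
 * nir_schedule_mark_def_scheduled(), which makes the value live and adds
 * its channels to the pressure (the def's own instruction is seeded into
 * remaining_uses by the scoreboard setup). Each scheduled use then removes
 * itself from remaining_uses, and once the set empties the channels are
 * subtracted again.
 */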
748
749 static bool
750 nir_schedule_mark_src_scheduled(nir_src *src, void *state)
751 {
752 nir_schedule_scoreboard *scoreboard = state;
753 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
754
755 struct set_entry *entry = _mesa_set_search(remaining_uses,
756 src->parent_instr);
757 if (entry) {
758 /* Once we've used an SSA value in one instruction, bump the priority of
759 * the other uses so the SSA value can get fully consumed.
760 *
761 * We don't do this for registers, as it would be a hassle and it's
762 * unclear whether it would help. Also, skip it for constants, as
763 * they're often folded as immediates into backend instructions and have
764 * many unrelated instructions all referencing the same value (0).
765 */
766 if (src->is_ssa &&
767 src->ssa->parent_instr->type != nir_instr_type_load_const) {
768 nir_foreach_use(other_src, src->ssa) {
769 if (other_src->parent_instr == src->parent_instr)
770 continue;
771
772 nir_schedule_node *n =
773 nir_schedule_get_node(scoreboard->instr_map,
774 other_src->parent_instr);
775
776 if (n && !n->partially_evaluated_path) {
777 if (debug) {
778 fprintf(stderr, " New partially evaluated path: ");
779 nir_print_instr(n->instr, stderr);
780 fprintf(stderr, "\n");
781 }
782
783 n->partially_evaluated_path = true;
784 }
785 }
786 }
787 }
788
789 nir_schedule_mark_use(scoreboard,
790 src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
791 src->parent_instr,
792 nir_schedule_src_pressure(src));
793
794 return true;
795 }
796
797 static bool
798 nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
799 {
800 nir_schedule_scoreboard *scoreboard = state;
801
802 nir_schedule_mark_use(scoreboard, def, def->parent_instr,
803 nir_schedule_def_pressure(def));
804
805 return true;
806 }
807
808 static bool
809 nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
810 {
811 nir_schedule_scoreboard *scoreboard = state;
812
813 /* SSA defs were handled in nir_schedule_mark_def_scheduled()
814 */
815 if (dest->is_ssa)
816 return true;
817
818 /* XXX: This is not actually accurate for regs -- the last use of a reg may
819 * have a live interval that extends across control flow. We should
820 * calculate the live ranges of regs, and have scheduler nodes for the CF
821 * nodes that also "use" the reg.
822 */
823 nir_schedule_mark_use(scoreboard, dest->reg.reg,
824 dest->reg.parent_instr,
825 nir_schedule_dest_pressure(dest));
826
827 return true;
828 }
829
830 static void
831 nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
832 nir_schedule_node *n)
833 {
834 nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
835 nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
836 nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
837
838 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
839 nir_schedule_node *child = (nir_schedule_node *)edge->child;
840
841 child->ready_time = MAX2(child->ready_time,
842 scoreboard->time + n->delay);
843
844 if (child->dag.parent_count == 1) {
845 if (debug) {
846 fprintf(stderr, " New DAG head: ");
847 nir_print_instr(child->instr, stderr);
848 fprintf(stderr, "\n");
849 }
850 }
851 }
852
853 dag_prune_head(scoreboard->dag, &n->dag);
854
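/* Advance the clock: jump forward if the chosen instruction had to wait
 * for a result, and charge at least one time unit per scheduled
 * instruction.
 */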
855 scoreboard->time = MAX2(n->ready_time, scoreboard->time);
856 scoreboard->time++;
857 }
858
859 static void
860 nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
861 {
862 while (!list_is_empty(&scoreboard->dag->heads)) {
863 if (debug) {
864 fprintf(stderr, "current list:\n");
865 dump_state(scoreboard);
866 }
867
868 nir_schedule_node *chosen;
869 if (scoreboard->pressure < scoreboard->threshold)
870 chosen = nir_schedule_choose_instruction_csp(scoreboard);
871 else
872 chosen = nir_schedule_choose_instruction_csr(scoreboard);
873
874 /* Now that we've scheduled a new instruction, some of its children may
875 * be promoted to the list of instructions ready to be scheduled.
876 */
877 nir_schedule_mark_node_scheduled(scoreboard, chosen);
878
879 /* Move the instruction to the end (so our first chosen instructions are
880 * the start of the program).
881 */
882 exec_node_remove(&chosen->instr->node);
883 exec_list_push_tail(&block->instr_list, &chosen->instr->node);
884
885 if (debug)
886 fprintf(stderr, "\n");
887 }
888 }
889
890 static uint32_t
891 nir_schedule_get_delay(nir_instr *instr)
892 {
893 switch (instr->type) {
894 case nir_instr_type_ssa_undef:
895 case nir_instr_type_load_const:
896 case nir_instr_type_alu:
897 case nir_instr_type_deref:
898 case nir_instr_type_jump:
899 case nir_instr_type_parallel_copy:
900 case nir_instr_type_call:
901 case nir_instr_type_phi:
902 return 1;
903
904 case nir_instr_type_intrinsic:
905 /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
906 return 1;
907
908 case nir_instr_type_tex:
909 /* Pick some large number to try to issue texture fetches early and
910 * consume their results late.
911 */
912 return 100;
913 }
914
915 return 0;
916 }
917
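/* Called bottom-up over the DAG, so every consumer has its max_delay set
 * before its producers are visited. As an example using the delays above:
 * for "tex (delay 100) -> fmul (delay 1) -> store_output (delay 1)", the
 * store ends up with max_delay 1, the fmul with 2, and the tex with 102,
 * which is what puts texture fetches on the critical path so the CSP
 * heuristic picks them early.
 */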
918 static void
919 nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
920 {
921 nir_schedule_node *n = (nir_schedule_node *)node;
922 uint32_t max_delay = 0;
923
924 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
925 nir_schedule_node *child = (nir_schedule_node *)edge->child;
926 max_delay = MAX2(child->max_delay, max_delay);
927 }
928
929 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
930 }
931
932 static void
933 nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
934 {
935 void *mem_ctx = ralloc_context(NULL);
936 scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
937
938 scoreboard->dag = dag_create(mem_ctx);
939
940 nir_foreach_instr(instr, block) {
941 nir_schedule_node *n =
942 rzalloc(mem_ctx, nir_schedule_node);
943
944 n->instr = instr;
945 n->delay = nir_schedule_get_delay(instr);
946 dag_init_node(scoreboard->dag, &n->dag);
947
948 _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
949 }
950
951 calculate_forward_deps(scoreboard, block);
952 calculate_reverse_deps(scoreboard, block);
953
954 dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
955
956 nir_schedule_instructions(scoreboard, block);
957
958 ralloc_free(mem_ctx);
959 scoreboard->instr_map = NULL;
960 }
961
962 static bool
963 nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
964 {
965 nir_schedule_scoreboard *scoreboard = state;
966 struct set *def_uses = _mesa_pointer_set_create(scoreboard);
967
968 _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
969
970 _mesa_set_add(def_uses, def->parent_instr);
971
972 nir_foreach_use(src, def) {
973 _mesa_set_add(def_uses, src->parent_instr);
974 }
975
976 /* XXX: Handle if uses */
977
978 return true;
979 }
980
981 static nir_schedule_scoreboard *
982 nir_schedule_get_scoreboard(nir_shader *shader, int threshold)
983 {
984 nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
985
986 scoreboard->shader = shader;
987 scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
988 scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
989 scoreboard->threshold = threshold;
990 scoreboard->pressure = 0;
991
992 nir_foreach_function(function, shader) {
993 nir_foreach_register(reg, &function->impl->registers) {
994 struct set *register_uses =
995 _mesa_pointer_set_create(scoreboard);
996
997 _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
998
999 nir_foreach_use(src, reg) {
1000 _mesa_set_add(register_uses, src->parent_instr);
1001 }
1002
1003 /* XXX: Handle if uses */
1004
1005 nir_foreach_def(dest, reg) {
1006 _mesa_set_add(register_uses, dest->reg.parent_instr);
1007 }
1008 }
1009
1010 nir_foreach_block(block, function->impl) {
1011 nir_foreach_instr(instr, block) {
1012 nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
1013 scoreboard);
1014 }
1015
1016 /* XXX: We're ignoring if uses, which may prioritize scheduling other
1017 * uses of the if src even when it doesn't help. That's not many
1018 * values, though, so meh.
1019 */
1020 }
1021 }
1022
1023 return scoreboard;
1024 }
1025
1026 static void
1027 nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
1028 {
1029 #ifdef NDEBUG
1030 return;
1031 #endif
1032
1033 bool any_uses = false;
1034
1035 hash_table_foreach(scoreboard->remaining_uses, entry) {
1036 struct set *remaining_uses = entry->data;
1037
1038 set_foreach(remaining_uses, instr_entry) {
1039 if (!any_uses) {
1040 fprintf(stderr, "Tracked uses remain after scheduling. "
1041 "Affected instructions: \n");
1042 any_uses = true;
1043 }
1044 nir_print_instr(instr_entry->key, stderr);
1045 fprintf(stderr, "\n");
1046 }
1047 }
1048
1049 assert(!any_uses);
1050 }
1051
1052 /**
1053 * Schedules the NIR instructions to try to decrease stalls (for example,
1054 * delaying the consumption of texture results) while managing register pressure.
1055 *
1056 * The threshold represents "number of NIR register/SSA def channels live
1057 * before switching the scheduling heuristic to reduce register pressure",
1058 * since most of our GPU architectures are scalar (extending to vector with a
1059 * flag wouldn't be hard). This number should be a bit below the number of
1060 * registers available (counting any that may be occupied by system value
1061 * payload values, for example), since the heuristic may not always be able to
1062 * free a register immediately. The amount below the limit is up to you to
1063 * tune.
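*
* For example, a backend with 64 temporary channels available per thread
* might call nir_schedule(shader, 48), leaving some slack for values the
* heuristic cannot free right away; the numbers here are illustrative
* rather than taken from any particular driver.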
1064 */
1065 void
1066 nir_schedule(nir_shader *shader, int threshold)
1067 {
1068 nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
1069 threshold);
1070
1071 if (debug) {
1072 fprintf(stderr, "NIR shader before scheduling:\n");
1073 nir_print_shader(shader, stderr);
1074 }
1075
1076 nir_foreach_function(function, shader) {
1077 if (!function->impl)
1078 continue;
1079
1080 nir_foreach_block(block, function->impl) {
1081 nir_schedule_block(scoreboard, block);
1082 }
1083 }
1084
1085 nir_schedule_validate_uses(scoreboard);
1086
1087 ralloc_free(scoreboard);
1088 }