[mesa.git] / src / compiler / nir / nir_schedule.c
1 /*
2 * Copyright © 2019 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir_schedule.h"
25 #include "util/dag.h"
26 #include "util/u_dynarray.h"
27
28 /** @file
29 *
30 * Implements basic-block-level prepass instruction scheduling in NIR to
31 * manage register pressure.
32 *
33 * This is based on the Goodman/Hsu paper (1988, cached copy at
34 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
35 * make up the DDG for NIR (which can be mostly done using the NIR def/use
36 * chains for SSA instructions, plus some edges for ordering register writes
37 * vs reads, and some more for ordering intrinsics). Then we pick heads off
38 * of the DDG using their heuristic to emit the NIR instructions back into the
39 * block in their new order.
40 *
41 * The hard case for prepass scheduling on GPUs seems to always be consuming
42 * texture/ubo results. The register pressure heuristic doesn't want to pick
43 * an instr that starts consuming texture results because it usually won't be
44 * the only usage, so that instruction will increase pressure.
45 *
46 * If you try to force consumption of tex results always, then in a case where
47  * a single sample is used for many outputs, you'll end up picking every other
48 * user and expanding register pressure. The partially_evaluated_path flag
49 * helps tremendously, in that if you happen for whatever reason to pick a
50 * texture sample's output, then you'll try to finish off that sample. Future
51 * work may include doing some local search before locking in a choice, to try
52 * to more reliably find the case where just a few choices going against the
53 * heuristic can manage to free the whole vector.
54 */
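 *
 * A made-up illustration of the texture case above (not from any real
 * shader):
 *
 *    vec4 32 ssa_1 = tex ...
 *    vec1 32 ssa_2 = fadd ssa_1.x, ssa_0
 *    vec1 32 ssa_3 = fadd ssa_1.y, ssa_0
 *    vec1 32 ssa_4 = fadd ssa_1.z, ssa_0
 *    vec1 32 ssa_5 = fadd ssa_1.w, ssa_0
 *
 * Scheduling any one fadd raises pressure (ssa_1 stays live and a new def
 * appears), so the pressure heuristic is reluctant to start.  Once one fadd
 * does get picked, partially_evaluated_path is set on the other three, and
 * we prefer to finish them off so ssa_1's four channels can die.
 */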
55
56 static bool debug;
57
58 /**
59 * Represents a node in the DDG for a NIR instruction.
60 */
61 typedef struct {
62 struct dag_node dag; /* must be first for our u_dynarray_foreach */
63 nir_instr *instr;
64 bool partially_evaluated_path;
65
66 /* Approximate estimate of the delay between starting this instruction and
67 * its results being available.
68 *
69 * Accuracy is not too important, given that we're prepass scheduling here
70 * and just trying to reduce excess dependencies introduced by a register
71 * allocator by stretching out the live intervals of expensive
72 * instructions.
73 */
74 uint32_t delay;
75
76 /* Cost of the maximum-delay path from this node to the leaves. */
77 uint32_t max_delay;
78
79 /* scoreboard->time value when this instruction can be scheduled without
80 * any stalls expected.
81 */
82 uint32_t ready_time;
83 } nir_schedule_node;
84
85 typedef struct {
86 struct dag *dag;
87
88 nir_shader *shader;
89
90 /* Mapping from nir_register * or nir_ssa_def * to a struct set of
91  * instructions remaining to be scheduled that use the value.
92 */
93 struct hash_table *remaining_uses;
94
95 /* Map from nir_instr to nir_schedule_node * */
96 struct hash_table *instr_map;
97
98 /* Set of nir_register * or nir_ssa_def * that have had any instruction
99 * scheduled on them.
100 */
101 struct set *live_values;
102
103  /* An abstract approximation of the number of nir_schedule_node->delay
104 * units since the start of the shader.
105 */
106 uint32_t time;
107
108 /* Number of channels currently used by the NIR instructions that have been
109 * scheduled.
110 */
111 int pressure;
112
113 /* Number of channels that may be in use before we switch to the
114 * pressure-prioritizing scheduling heuristic.
115 */
116 int threshold;
117
118 /* Mask of stages that share memory for inputs and outputs */
119 unsigned stages_with_shared_io_memory;
120 } nir_schedule_scoreboard;
121
122 /* When walking the instructions in reverse, we use this flag to swap
123 * before/after in add_dep().
124 */
125 enum direction { F, R };
126
127 typedef struct {
128 nir_schedule_scoreboard *scoreboard;
129
130 /* Map from nir_register to nir_schedule_node * */
131 struct hash_table *reg_map;
132
133  /* Scheduler nodes for the last instruction involved in each class of dependency.
134 */
135 nir_schedule_node *load_input;
136 nir_schedule_node *store_shared;
137 nir_schedule_node *unknown_intrinsic;
138 nir_schedule_node *discard;
139 nir_schedule_node *jump;
140
141 enum direction dir;
142 } nir_deps_state;
143
144 static void *
145 _mesa_hash_table_search_data(struct hash_table *ht, void *key)
146 {
147 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
148 if (!entry)
149 return NULL;
150 return entry->data;
151 }
152
153 static nir_schedule_node *
154 nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
155 {
156 return _mesa_hash_table_search_data(instr_map, instr);
157 }
158
159 static struct set *
160 nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
161 {
162 if (src->is_ssa) {
163 return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
164 } else {
165 return _mesa_hash_table_search_data(scoreboard->remaining_uses,
166 src->reg.reg);
167 }
168 }
169
170 static int
171 nir_schedule_def_pressure(nir_ssa_def *def)
172 {
173 return def->num_components;
174 }
175
176 static int
177 nir_schedule_src_pressure(nir_src *src)
178 {
179 if (src->is_ssa)
180 return nir_schedule_def_pressure(src->ssa);
181 else
182 return src->reg.reg->num_components;
183 }
184
185 static int
186 nir_schedule_dest_pressure(nir_dest *dest)
187 {
188 if (dest->is_ssa)
189 return nir_schedule_def_pressure(&dest->ssa);
190 else
191 return dest->reg.reg->num_components;
192 }
193
194 /**
195 * Adds a dependency such that @after must appear in the final program after
196 * @before.
197 *
198  * We add @after as a child of @before, so that the DAG heads are the
199  * instructions with no outstanding dependencies and we schedule top to bottom.
200 */
201 static void
202 add_dep(nir_deps_state *state,
203 nir_schedule_node *before,
204 nir_schedule_node *after)
205 {
206 if (!before || !after)
207 return;
208
209 assert(before != after);
210
211 if (state->dir == F)
212 dag_add_edge(&before->dag, &after->dag, NULL);
213 else
214 dag_add_edge(&after->dag, &before->dag, NULL);
215 }
216
217
218 static void
219 add_read_dep(nir_deps_state *state,
220 nir_schedule_node *before,
221 nir_schedule_node *after)
222 {
223 add_dep(state, before, after);
224 }
225
226 static void
227 add_write_dep(nir_deps_state *state,
228 nir_schedule_node **before,
229 nir_schedule_node *after)
230 {
231 add_dep(state, *before, after);
232 *before = after;
233 }
234
235 static bool
236 nir_schedule_reg_src_deps(nir_src *src, void *in_state)
237 {
238 nir_deps_state *state = in_state;
239
240 if (src->is_ssa)
241 return true;
242
243 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
244 src->reg.reg);
245 if (!entry)
246 return true;
247 nir_schedule_node *dst_n = entry->data;
248
249 nir_schedule_node *src_n =
250 nir_schedule_get_node(state->scoreboard->instr_map,
251 src->parent_instr);
252
253 add_dep(state, dst_n, src_n);
254
255 return true;
256 }
257
258 static bool
259 nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
260 {
261 nir_deps_state *state = in_state;
262
263 if (dest->is_ssa)
264 return true;
265
266 nir_schedule_node *dest_n =
267 nir_schedule_get_node(state->scoreboard->instr_map,
268 dest->reg.parent_instr);
269
270 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
271 dest->reg.reg);
272 if (!entry) {
273 _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
274 return true;
275 }
276 nir_schedule_node **before = (nir_schedule_node **)&entry->data;
277
278 add_write_dep(state, before, dest_n);
279
280 return true;
281 }
282
283 static bool
284 nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
285 {
286 nir_deps_state *state = in_state;
287 struct hash_table *instr_map = state->scoreboard->instr_map;
288 nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);
289
290 nir_foreach_use(src, def) {
291 nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
292 src->parent_instr);
293
294 add_read_dep(state, def_n, use_n);
295 }
296
297 return true;
298 }
299
300 static void
301 nir_schedule_intrinsic_deps(nir_deps_state *state,
302 nir_intrinsic_instr *instr)
303 {
304 nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
305 &instr->instr);
306
307 switch (instr->intrinsic) {
308 case nir_intrinsic_load_uniform:
309 case nir_intrinsic_load_ubo:
310 case nir_intrinsic_load_front_face:
311 break;
312
313 case nir_intrinsic_discard:
314 case nir_intrinsic_discard_if:
315 /* We are adding two dependencies:
316 *
317  *  * An individual one that we could use to add a read_dep while handling
318 * nir_instr_type_tex
319 *
320  *  * One on the unknown intrinsic set, as we want discards to be
321  *    serialized in the same order relative to intervening stores or
322 * atomic accesses to SSBOs and images
323 */
324 add_write_dep(state, &state->discard, n);
325 add_write_dep(state, &state->unknown_intrinsic, n);
326 break;
327
328 case nir_intrinsic_store_output:
329 /* For some hardware and stages, output stores affect the same shared
330 * memory as input loads.
331 */
332 if ((state->scoreboard->stages_with_shared_io_memory &
333 (1 << state->scoreboard->shader->info.stage)))
334 add_write_dep(state, &state->load_input, n);
335
336 /* Make sure that preceding discards stay before the store_output */
337 add_read_dep(state, state->discard, n);
338
339 break;
340
341 case nir_intrinsic_load_input:
342 case nir_intrinsic_load_per_vertex_input:
343 add_read_dep(state, state->load_input, n);
344 break;
345
346 case nir_intrinsic_load_shared:
347 /* Don't move load_shared beyond a following store_shared, as it could
348  * change the value loaded
349 */
350 add_read_dep(state, state->store_shared, n);
351 break;
352
353 case nir_intrinsic_store_shared:
354 add_write_dep(state, &state->store_shared, n);
355 break;
356
357 case nir_intrinsic_control_barrier:
358 case nir_intrinsic_memory_barrier_shared:
359 add_write_dep(state, &state->store_shared, n);
360
361 /* Serialize against ssbos/atomics/etc. */
362 add_write_dep(state, &state->unknown_intrinsic, n);
363 break;
364
365 default:
366 /* Attempt to handle other intrinsics that we haven't individually
367 * categorized by serializing them in the same order relative to each
368 * other.
369 */
370 add_write_dep(state, &state->unknown_intrinsic, n);
371 break;
372 }
373 }
374
375 /**
376 * Common code for dependencies that need to be tracked both forward and
377 * backward.
378 *
379 * This is for things like "all reads of r4 have to happen between the r4
380 * writes that surround them".
381 */
382 static void
383 nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
384 {
385 nir_instr *instr = n->instr;
386
387 /* For NIR SSA defs, we only need to do a single pass of making the uses
388 * depend on the def.
389 */
390 if (state->dir == F)
391 nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
392
393 /* For NIR regs, track the last writer in the scheduler state so that we
394 * can keep the writes in order and let reads get reordered only between
395 * each write.
396 */
397 nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
398
399 nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
400
401 /* Make sure any other instructions keep their positions relative to
402 * jumps.
403 */
404 if (instr->type != nir_instr_type_jump)
405 add_read_dep(state, state->jump, n);
406
407 switch (instr->type) {
408 case nir_instr_type_ssa_undef:
409 case nir_instr_type_load_const:
410 case nir_instr_type_alu:
411 case nir_instr_type_deref:
412 break;
413
414 case nir_instr_type_tex:
415 /* Don't move texture ops before a discard, as that could increase
416 * memory bandwidth for reading the discarded samples.
417 */
418 add_read_dep(state, state->discard, n);
419 break;
420
421 case nir_instr_type_jump:
422 add_write_dep(state, &state->jump, n);
423 break;
424
425 case nir_instr_type_call:
426 unreachable("Calls should have been lowered");
427 break;
428
429 case nir_instr_type_parallel_copy:
430 unreachable("Parallel copies should have been lowered");
431 break;
432
433 case nir_instr_type_phi:
434 unreachable("nir_schedule() should be called after lowering from SSA");
435 break;
436
437 case nir_instr_type_intrinsic:
438 nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
439 break;
440 }
441 }
442
443 static void
444 calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
445 {
446 nir_deps_state state = {
447 .scoreboard = scoreboard,
448 .dir = F,
449 .reg_map = _mesa_pointer_hash_table_create(NULL),
450 };
451
452 nir_foreach_instr(instr, block) {
453 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
454 instr);
455 nir_schedule_calculate_deps(&state, node);
456 }
457
458 ralloc_free(state.reg_map);
459 }
460
461 static void
462 calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
463 {
464 nir_deps_state state = {
465 .scoreboard = scoreboard,
466 .dir = R,
467 .reg_map = _mesa_pointer_hash_table_create(NULL),
468 };
469
470 nir_foreach_instr_reverse(instr, block) {
471 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
472 instr);
473 nir_schedule_calculate_deps(&state, node);
474 }
475
476 ralloc_free(state.reg_map);
477 }
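
/* A sketch of what the forward and reverse walks above produce for a non-SSA
 * register (made-up pseudo-instructions, not real NIR syntax):
 *
 *    r4 = mov a        forward: r4's last writer is now mov#1
 *    b  = fadd r4, c   forward: edge mov#1 -> fadd   (read after write)
 *    r4 = mov d        forward: edge mov#1 -> mov#2  (writes stay ordered)
 *                      reverse: edge fadd  -> mov#2  (read before next write)
 *
 * so the read of r4 can only be reordered between the two writes that
 * surround it, as described in the comment above nir_schedule_calculate_deps().
 */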
478
479 typedef struct {
480 nir_schedule_scoreboard *scoreboard;
481 int regs_freed;
482 } nir_schedule_regs_freed_state;
483
484 static bool
485 nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
486 {
487 nir_schedule_regs_freed_state *state = in_state;
488 nir_schedule_scoreboard *scoreboard = state->scoreboard;
489 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
490
491 if (remaining_uses->entries == 1 &&
492 _mesa_set_search(remaining_uses, src->parent_instr)) {
493 state->regs_freed += nir_schedule_src_pressure(src);
494 }
495
496 return true;
497 }
498
499 static bool
500 nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
501 {
502 nir_schedule_regs_freed_state *state = in_state;
503
504 state->regs_freed -= nir_schedule_def_pressure(def);
505
506 return true;
507 }
508
509 static bool
510 nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
511 {
512 nir_schedule_regs_freed_state *state = in_state;
513 nir_schedule_scoreboard *scoreboard = state->scoreboard;
514
515 if (dest->is_ssa)
516 return true;
517
518 nir_register *reg = dest->reg.reg;
519
520 /* Only the first def of a reg counts against register pressure. */
521 if (!_mesa_set_search(scoreboard->live_values, reg))
522 state->regs_freed -= nir_schedule_dest_pressure(dest);
523
524 return true;
525 }
526
527 static int
528 nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
529 {
530 nir_schedule_regs_freed_state state = {
531 .scoreboard = scoreboard,
532 };
533
534 nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
535
536 nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
537
538 nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
539
540 return state.regs_freed;
541 }
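
/* A small worked example of the accounting above (made up): if
 *
 *    vec1 32 ssa_3 = fmul ssa_1, ssa_2
 *
 * is the last remaining use of both one-channel values ssa_1 and ssa_2, then
 * regs_freed is +1 +1 for the sources and -1 for the new def, a net gain of
 * one channel, which makes it an attractive pick for the CSR heuristic below.
 */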
542
543 /**
544 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
545 * Scheduling for Parallelism) heuristic.
546 *
547  * Picks an instruction on the critical path that's ready to execute without
548  * stalls, if possible; otherwise picks the instruction on the critical path.
549 */
550 static nir_schedule_node *
551 nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
552 {
553 nir_schedule_node *chosen = NULL;
554
555 /* Find the leader in the ready (shouldn't-stall) set with the maximum
556 * cost.
557 */
558 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
559 if (scoreboard->time < n->ready_time)
560 continue;
561
562 if (!chosen || chosen->max_delay < n->max_delay)
563 chosen = n;
564 }
565 if (chosen) {
566 if (debug) {
567 fprintf(stderr, "chose (ready): ");
568 nir_print_instr(chosen->instr, stderr);
569 fprintf(stderr, "\n");
570 }
571
572 return chosen;
573 }
574
575 /* Otherwise, choose the leader with the maximum cost. */
576 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
577 if (!chosen || chosen->max_delay < n->max_delay)
578 chosen = n;
579 }
580 if (debug) {
581 fprintf(stderr, "chose (leader): ");
582 nir_print_instr(chosen->instr, stderr);
583 fprintf(stderr, "\n");
584 }
585
586 return chosen;
587 }
588
589 /**
590 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
591 * Scheduling for Register pressure) heuristic.
592 */
593 static nir_schedule_node *
594 nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
595 {
596 nir_schedule_node *chosen = NULL;
597
598 /* Find a ready inst with regs freed and pick the one with max cost. */
599 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
600 if (n->ready_time > scoreboard->time)
601 continue;
602
603 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
604
605 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
606 chosen = n;
607 }
608 }
609 if (chosen) {
610 if (debug) {
611 fprintf(stderr, "chose (freed+ready): ");
612 nir_print_instr(chosen->instr, stderr);
613 fprintf(stderr, "\n");
614 }
615
616 return chosen;
617 }
618
619 /* Find a leader with regs freed and pick the one with max cost. */
620 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
621 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
622
623 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
624 chosen = n;
625 }
626 }
627 if (chosen) {
628 if (debug) {
629 fprintf(stderr, "chose (regs freed): ");
630 nir_print_instr(chosen->instr, stderr);
631 fprintf(stderr, "\n");
632 }
633
634 return chosen;
635 }
636
637 /* Find a partially evaluated path and try to finish it off */
638 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
639 if (n->partially_evaluated_path &&
640 (!chosen || chosen->max_delay < n->max_delay)) {
641 chosen = n;
642 }
643 }
644 if (chosen) {
645 if (debug) {
646 fprintf(stderr, "chose (partial path): ");
647 nir_print_instr(chosen->instr, stderr);
648 fprintf(stderr, "\n");
649 }
650
651 return chosen;
652 }
653
654 /* Contra the paper, pick a leader with no effect on used regs. This may
655 * open up new opportunities, as otherwise a single-operand instr consuming
656  * a value will tend to keep us from finding what frees that value. This had a
657 * massive effect on reducing spilling on V3D.
658 *
659 * XXX: Should this prioritize ready?
660 */
661 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
662 if (nir_schedule_regs_freed(scoreboard, n) != 0)
663 continue;
664
665 if (!chosen || chosen->max_delay < n->max_delay)
666 chosen = n;
667 }
668 if (chosen) {
669 if (debug) {
670 fprintf(stderr, "chose (regs no-op): ");
671 nir_print_instr(chosen->instr, stderr);
672 fprintf(stderr, "\n");
673 }
674
675 return chosen;
676 }
677
678 /* Pick the max delay of the remaining ready set. */
679 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
680 if (n->ready_time > scoreboard->time)
681 continue;
682 if (!chosen || chosen->max_delay < n->max_delay)
683 chosen = n;
684 }
685 if (chosen) {
686 if (debug) {
687 fprintf(stderr, "chose (ready max delay): ");
688 nir_print_instr(chosen->instr, stderr);
689 fprintf(stderr, "\n");
690 }
691 return chosen;
692 }
693
694 /* Pick the max delay of the remaining leaders. */
695 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
696 if (!chosen || chosen->max_delay < n->max_delay)
697 chosen = n;
698 }
699
700 if (debug) {
701 fprintf(stderr, "chose (max delay): ");
702 nir_print_instr(chosen->instr, stderr);
703 fprintf(stderr, "\n");
704 }
705
706 return chosen;
707 }
708
709 static void
710 dump_state(nir_schedule_scoreboard *scoreboard)
711 {
712 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
713 fprintf(stderr, "maxdel %5d ", n->max_delay);
714 nir_print_instr(n->instr, stderr);
715 fprintf(stderr, "\n");
716
717 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
718 nir_schedule_node *child = (nir_schedule_node *)edge->child;
719
720 fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
721 nir_print_instr(child->instr, stderr);
722 fprintf(stderr, "\n");
723 }
724 }
725 }
726
727 static void
728 nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
729 void *reg_or_def,
730 nir_instr *reg_or_def_parent,
731 int pressure)
732 {
733 /* Make the value live if it's the first time it's been used. */
734 if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
735 _mesa_set_add(scoreboard->live_values, reg_or_def);
736 scoreboard->pressure += pressure;
737 }
738
739  /* Make the value dead if this is its last remaining use. Be careful not to
740  * decrement pressure twice when one instruction uses a value twice.
741 */
742 struct set *remaining_uses =
743 _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
744 struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
745 if (entry) {
746 _mesa_set_remove(remaining_uses, entry);
747
748 if (remaining_uses->entries == 0)
749 scoreboard->pressure -= pressure;
750 }
751 }
752
753 static bool
754 nir_schedule_mark_src_scheduled(nir_src *src, void *state)
755 {
756 nir_schedule_scoreboard *scoreboard = state;
757 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
758
759 struct set_entry *entry = _mesa_set_search(remaining_uses,
760 src->parent_instr);
761 if (entry) {
762 /* Once we've used an SSA value in one instruction, bump the priority of
763 * the other uses so the SSA value can get fully consumed.
764 *
765  * We don't do this for registers, as it would be a hassle and it's
766 * unclear if that would help or not. Also, skip it for constants, as
767 * they're often folded as immediates into backend instructions and have
768 * many unrelated instructions all referencing the same value (0).
769 */
770 if (src->is_ssa &&
771 src->ssa->parent_instr->type != nir_instr_type_load_const) {
772 nir_foreach_use(other_src, src->ssa) {
773 if (other_src->parent_instr == src->parent_instr)
774 continue;
775
776 nir_schedule_node *n =
777 nir_schedule_get_node(scoreboard->instr_map,
778 other_src->parent_instr);
779
780 if (n && !n->partially_evaluated_path) {
781 if (debug) {
782 fprintf(stderr, " New partially evaluated path: ");
783 nir_print_instr(n->instr, stderr);
784 fprintf(stderr, "\n");
785 }
786
787 n->partially_evaluated_path = true;
788 }
789 }
790 }
791 }
792
793 nir_schedule_mark_use(scoreboard,
794 src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
795 src->parent_instr,
796 nir_schedule_src_pressure(src));
797
798 return true;
799 }
800
801 static bool
802 nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
803 {
804 nir_schedule_scoreboard *scoreboard = state;
805
806 nir_schedule_mark_use(scoreboard, def, def->parent_instr,
807 nir_schedule_def_pressure(def));
808
809 return true;
810 }
811
812 static bool
813 nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
814 {
815 nir_schedule_scoreboard *scoreboard = state;
816
817 /* SSA defs were handled in nir_schedule_mark_def_scheduled()
818 */
819 if (dest->is_ssa)
820 return true;
821
822 /* XXX: This is not actually accurate for regs -- the last use of a reg may
823 * have a live interval that extends across control flow. We should
824 * calculate the live ranges of regs, and have scheduler nodes for the CF
825 * nodes that also "use" the reg.
826 */
827 nir_schedule_mark_use(scoreboard, dest->reg.reg,
828 dest->reg.parent_instr,
829 nir_schedule_dest_pressure(dest));
830
831 return true;
832 }
833
834 static void
835 nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
836 nir_schedule_node *n)
837 {
838 nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
839 nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
840 nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
841
842 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
843 nir_schedule_node *child = (nir_schedule_node *)edge->child;
844
845 child->ready_time = MAX2(child->ready_time,
846 scoreboard->time + n->delay);
847
848 if (child->dag.parent_count == 1) {
849 if (debug) {
850 fprintf(stderr, " New DAG head: ");
851 nir_print_instr(child->instr, stderr);
852 fprintf(stderr, "\n");
853 }
854 }
855 }
856
857 dag_prune_head(scoreboard->dag, &n->dag);
858
859 scoreboard->time = MAX2(n->ready_time, scoreboard->time);
860 scoreboard->time++;
861 }
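
/* A numeric sketch of the bookkeeping above (values made up): scheduling a
 * tex node with delay 100 at time 10 bumps each consumer's ready_time to at
 * least 110 and advances the scoreboard to time 11, so the ready checks in
 * the choosers treat those consumers as stalling until time 110.
 */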
862
863 static void
864 nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
865 {
866 while (!list_is_empty(&scoreboard->dag->heads)) {
867 if (debug) {
868 fprintf(stderr, "current list:\n");
869 dump_state(scoreboard);
870 }
871
872 nir_schedule_node *chosen;
873 if (scoreboard->pressure < scoreboard->threshold)
874 chosen = nir_schedule_choose_instruction_csp(scoreboard);
875 else
876 chosen = nir_schedule_choose_instruction_csr(scoreboard);
877
878 /* Now that we've scheduled a new instruction, some of its children may
879 * be promoted to the list of instructions ready to be scheduled.
880 */
881 nir_schedule_mark_node_scheduled(scoreboard, chosen);
882
883 /* Move the instruction to the end (so our first chosen instructions are
884 * the start of the program).
885 */
886 exec_node_remove(&chosen->instr->node);
887 exec_list_push_tail(&block->instr_list, &chosen->instr->node);
888
889 if (debug)
890 fprintf(stderr, "\n");
891 }
892 }
893
894 static uint32_t
895 nir_schedule_get_delay(nir_instr *instr)
896 {
897 switch (instr->type) {
898 case nir_instr_type_ssa_undef:
899 case nir_instr_type_load_const:
900 case nir_instr_type_alu:
901 case nir_instr_type_deref:
902 case nir_instr_type_jump:
903 case nir_instr_type_parallel_copy:
904 case nir_instr_type_call:
905 case nir_instr_type_phi:
906 return 1;
907
908 case nir_instr_type_intrinsic:
909 /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
910 return 1;
911
912 case nir_instr_type_tex:
913  /* Pick some large number so that texture fetches get started early and
914  * their results consumed late.
915 */
916 return 100;
917 }
918
919 return 0;
920 }
921
922 static void
923 nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
924 {
925 nir_schedule_node *n = (nir_schedule_node *)node;
926 uint32_t max_delay = 0;
927
928 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
929 nir_schedule_node *child = (nir_schedule_node *)edge->child;
930 max_delay = MAX2(child->max_delay, max_delay);
931 }
932
933 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
934 }
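
/* For example, with the delays from nir_schedule_get_delay() above, a chain
 * tex -> fadd -> store_output gets max_delay 1 for the store, 1 + 1 = 2 for
 * the fadd and 2 + 100 = 102 for the tex, so the critical-path heuristics
 * favor starting the tex as early as possible.
 */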
935
936 static void
937 nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
938 {
939 void *mem_ctx = ralloc_context(NULL);
940 scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
941
942 scoreboard->dag = dag_create(mem_ctx);
943
944 nir_foreach_instr(instr, block) {
945 nir_schedule_node *n =
946 rzalloc(mem_ctx, nir_schedule_node);
947
948 n->instr = instr;
949 n->delay = nir_schedule_get_delay(instr);
950 dag_init_node(scoreboard->dag, &n->dag);
951
952 _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
953 }
954
955 calculate_forward_deps(scoreboard, block);
956 calculate_reverse_deps(scoreboard, block);
957
958 dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
959
960 nir_schedule_instructions(scoreboard, block);
961
962 ralloc_free(mem_ctx);
963 scoreboard->instr_map = NULL;
964 }
965
966 static bool
967 nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
968 {
969 nir_schedule_scoreboard *scoreboard = state;
970 struct set *def_uses = _mesa_pointer_set_create(scoreboard);
971
972 _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
973
974 _mesa_set_add(def_uses, def->parent_instr);
975
976 nir_foreach_use(src, def) {
977 _mesa_set_add(def_uses, src->parent_instr);
978 }
979
980 /* XXX: Handle if uses */
981
982 return true;
983 }
984
985 static nir_schedule_scoreboard *
986 nir_schedule_get_scoreboard(nir_shader *shader,
987 const nir_schedule_options *options)
988 {
989 nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
990
991 scoreboard->shader = shader;
992 scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
993 scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
994 scoreboard->threshold = options->threshold;
995 scoreboard->stages_with_shared_io_memory =
996 options->stages_with_shared_io_memory;
997 scoreboard->pressure = 0;
998
999 nir_foreach_function(function, shader) {
1000 nir_foreach_register(reg, &function->impl->registers) {
1001 struct set *register_uses =
1002 _mesa_pointer_set_create(scoreboard);
1003
1004 _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
1005
1006 nir_foreach_use(src, reg) {
1007 _mesa_set_add(register_uses, src->parent_instr);
1008 }
1009
1010 /* XXX: Handle if uses */
1011
1012 nir_foreach_def(dest, reg) {
1013 _mesa_set_add(register_uses, dest->reg.parent_instr);
1014 }
1015 }
1016
1017 nir_foreach_block(block, function->impl) {
1018 nir_foreach_instr(instr, block) {
1019 nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
1020 scoreboard);
1021 }
1022
1023 /* XXX: We're ignoring if uses, which may prioritize scheduling other
1024 * uses of the if src even when it doesn't help. That's not many
1025 * values, though, so meh.
1026 */
1027 }
1028 }
1029
1030 return scoreboard;
1031 }
1032
1033 static void
1034 nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
1035 {
1036 #ifdef NDEBUG
1037 return;
1038 #endif
1039
1040 bool any_uses = false;
1041
1042 hash_table_foreach(scoreboard->remaining_uses, entry) {
1043 struct set *remaining_uses = entry->data;
1044
1045 set_foreach(remaining_uses, instr_entry) {
1046 if (!any_uses) {
1047 fprintf(stderr, "Tracked uses remain after scheduling. "
1048 "Affected instructions: \n");
1049 any_uses = true;
1050 }
1051 nir_print_instr(instr_entry->key, stderr);
1052 fprintf(stderr, "\n");
1053 }
1054 }
1055
1056 assert(!any_uses);
1057 }
1058
1059 /**
1060 * Schedules the NIR instructions to try to decrease stalls (for example,
1061 * delaying texture reads) while managing register pressure.
1062 *
1063 * The threshold represents "number of NIR register/SSA def channels live
1064 * before switching the scheduling heuristic to reduce register pressure",
1065 * since most of our GPU architectures are scalar (extending to vector with a
1066 * flag wouldn't be hard). This number should be a bit below the number of
1067 * registers available (counting any that may be occupied by system value
1068 * payload values, for example), since the heuristic may not always be able to
1069 * free a register immediately. The amount below the limit is up to you to
1070 * tune.
1071 */
1072 void
1073 nir_schedule(nir_shader *shader,
1074 const nir_schedule_options *options)
1075 {
1076 nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
1077 options);
1078
1079 if (debug) {
1080 fprintf(stderr, "NIR shader before scheduling:\n");
1081 nir_print_shader(shader, stderr);
1082 }
1083
1084 nir_foreach_function(function, shader) {
1085 if (!function->impl)
1086 continue;
1087
1088 nir_foreach_block(block, function->impl) {
1089 nir_schedule_block(scoreboard, block);
1090 }
1091 }
1092
1093 nir_schedule_validate_uses(scoreboard);
1094
1095 ralloc_free(scoreboard);
1096 }
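
/* A minimal sketch of how a backend might invoke this pass; the threshold
 * and stage mask below are made-up example values, not recommendations:
 *
 *    nir_schedule_options options = {
 *       .threshold = 64,
 *       .stages_with_shared_io_memory = (1 << MESA_SHADER_VERTEX),
 *    };
 *    nir_schedule(shader, &options);
 */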