[mesa.git] / src / compiler / nir / nir_schedule.c
1 /*
2 * Copyright © 2019 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir_schedule.h"
25 #include "util/dag.h"
26 #include "util/u_dynarray.h"
27
28 /** @file
29 *
30 * Implements basic-block-level prepass instruction scheduling in NIR to
31 * manage register pressure.
32 *
33 * This is based on the Goodman/Hsu paper (1988, cached copy at
34 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
35 * make up the DDG for NIR (which can be mostly done using the NIR def/use
36 * chains for SSA instructions, plus some edges for ordering register writes
37 * vs reads, and some more for ordering intrinsics). Then we pick heads off
38 * of the DDG using their heuristic to emit the NIR instructions back into the
39 * block in their new order.
40 *
41 * The hard case for prepass scheduling on GPUs seems to always be consuming
42 * texture/ubo results. The register pressure heuristic doesn't want to pick
43 * an instr that starts consuming texture results because it usually won't be
44 * the only usage, so that instruction will increase pressure.
45 *
46 * If you always try to force consumption of tex results, then in a case where
47 * a single sample is used for many outputs, you'll end up picking every other
48 * user and expanding register pressure. The partially_evaluated_path flag
49 * helps tremendously, in that if you happen for whatever reason to pick a
50 * texture sample's output, then you'll try to finish off that sample. Future
51 * work may include doing some local search before locking in a choice, to try
52 * to more reliably find the case where just a few choices going against the
53 * heuristic can manage to free the whole vector.
54 */
55
56 static bool debug;
57
58 /**
59 * Represents a node in the DDG for a NIR instruction.
60 */
61 typedef struct {
62 struct dag_node dag; /* must be first for our u_dynarray_foreach */
63 nir_instr *instr;
64 bool partially_evaluated_path;
65
66 /* Approximate estimate of the delay between starting this instruction and
67 * its results being available.
68 *
69 * Accuracy is not too important, given that we're prepass scheduling here
70 * and just trying to reduce excess dependencies introduced by a register
71 * allocator by stretching out the live intervals of expensive
72 * instructions.
73 */
74 uint32_t delay;
75
76 /* Cost of the maximum-delay path from this node to the leaves. */
77 uint32_t max_delay;
78
79 /* scoreboard->time value when this instruction can be scheduled without
80 * any stalls expected.
81 */
82 uint32_t ready_time;
83 } nir_schedule_node;
84
85 typedef struct {
86 struct dag *dag;
87
88 nir_shader *shader;
89
90 /* Mapping from nir_register * or nir_ssa_def * to a struct set of
91 * instructions remaining to be scheduled using the register.
92 */
93 struct hash_table *remaining_uses;
94
95 /* Map from nir_instr to nir_schedule_node * */
96 struct hash_table *instr_map;
97
98 /* Set of nir_register * or nir_ssa_def * that have had any instruction
99 * scheduled on them.
100 */
101 struct set *live_values;
102
103 /* An abstract approximation of the number of nir_schedule_node->delay
104 * units since the start of the shader.
105 */
106 uint32_t time;
107
108 /* Number of channels currently used by the NIR instructions that have been
109 * scheduled.
110 */
111 int pressure;
112
113 /* Options specified by the backend */
114 const nir_schedule_options *options;
115 } nir_schedule_scoreboard;
116
117 /* When walking the instructions in reverse, we use this flag to swap
118 * before/after in add_dep().
119 */
120 enum direction { F, R };
121
122 typedef struct {
123 nir_schedule_scoreboard *scoreboard;
124
125 /* Map from nir_register to nir_schedule_node * */
126 struct hash_table *reg_map;
127
128 /* Scheduler nodes for the last instruction involved in each class of dependency.
129 */
130 nir_schedule_node *load_input;
131 nir_schedule_node *store_shared;
132 nir_schedule_node *unknown_intrinsic;
133 nir_schedule_node *discard;
134 nir_schedule_node *jump;
135
136 enum direction dir;
137 } nir_deps_state;
138
139 static void *
140 _mesa_hash_table_search_data(struct hash_table *ht, void *key)
141 {
142 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
143 if (!entry)
144 return NULL;
145 return entry->data;
146 }
147
148 static nir_schedule_node *
149 nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
150 {
151 return _mesa_hash_table_search_data(instr_map, instr);
152 }
153
154 static struct set *
155 nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
156 {
157 if (src->is_ssa) {
158 return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
159 } else {
160 return _mesa_hash_table_search_data(scoreboard->remaining_uses,
161 src->reg.reg);
162 }
163 }
164
165 static int
166 nir_schedule_def_pressure(nir_ssa_def *def)
167 {
168 return def->num_components;
169 }
170
171 static int
172 nir_schedule_src_pressure(nir_src *src)
173 {
174 if (src->is_ssa)
175 return nir_schedule_def_pressure(src->ssa);
176 else
177 return src->reg.reg->num_components;
178 }
179
180 static int
181 nir_schedule_dest_pressure(nir_dest *dest)
182 {
183 if (dest->is_ssa)
184 return nir_schedule_def_pressure(&dest->ssa);
185 else
186 return dest->reg.reg->num_components;
187 }
188
189 /**
190 * Adds a dependency such that @after must appear in the final program after
191 * @before.
192 *
193 * We add @before as a child of @after, so that DAG heads are the outputs of
194 * the program and we make our scheduling decisions bottom to top.
195 */
196 static void
197 add_dep(nir_deps_state *state,
198 nir_schedule_node *before,
199 nir_schedule_node *after)
200 {
201 if (!before || !after)
202 return;
203
204 assert(before != after);
205
206 if (state->dir == F)
207 dag_add_edge(&before->dag, &after->dag, NULL);
208 else
209 dag_add_edge(&after->dag, &before->dag, NULL);
210 }
211
212
213 static void
214 add_read_dep(nir_deps_state *state,
215 nir_schedule_node *before,
216 nir_schedule_node *after)
217 {
218 add_dep(state, before, after);
219 }
220
221 static void
222 add_write_dep(nir_deps_state *state,
223 nir_schedule_node **before,
224 nir_schedule_node *after)
225 {
226 add_dep(state, *before, after);
227 *before = after;
228 }
229
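/* Makes each NIR register read depend on the most recent write of that
 * register seen in the current walk direction, so reads only get reordered
 * between the writes that surround them.
 */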
230 static bool
231 nir_schedule_reg_src_deps(nir_src *src, void *in_state)
232 {
233 nir_deps_state *state = in_state;
234
235 if (src->is_ssa)
236 return true;
237
238 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
239 src->reg.reg);
240 if (!entry)
241 return true;
242 nir_schedule_node *dst_n = entry->data;
243
244 nir_schedule_node *src_n =
245 nir_schedule_get_node(state->scoreboard->instr_map,
246 src->parent_instr);
247
248 add_dep(state, dst_n, src_n);
249
250 return true;
251 }
252
253 static bool
254 nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
255 {
256 nir_deps_state *state = in_state;
257
258 if (dest->is_ssa)
259 return true;
260
261 nir_schedule_node *dest_n =
262 nir_schedule_get_node(state->scoreboard->instr_map,
263 dest->reg.parent_instr);
264
265 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
266 dest->reg.reg);
267 if (!entry) {
268 _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
269 return true;
270 }
271 nir_schedule_node **before = (nir_schedule_node **)&entry->data;
272
273 add_write_dep(state, before, dest_n);
274
275 return true;
276 }
277
278 static bool
279 nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
280 {
281 nir_deps_state *state = in_state;
282 struct hash_table *instr_map = state->scoreboard->instr_map;
283 nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);
284
285 nir_foreach_use(src, def) {
286 nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
287 src->parent_instr);
288
289 add_read_dep(state, def_n, use_n);
290 }
291
292 return true;
293 }
294
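/* Adds ordering dependencies for intrinsics: memory-access and side-effecting
 * intrinsics are kept in order relative to the last instruction tracked for
 * their dependency class in nir_deps_state (inputs, shared memory, discards,
 * or the catch-all unknown_intrinsic).
 */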
295 static void
296 nir_schedule_intrinsic_deps(nir_deps_state *state,
297 nir_intrinsic_instr *instr)
298 {
299 nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
300 &instr->instr);
301
302 switch (instr->intrinsic) {
303 case nir_intrinsic_load_uniform:
304 case nir_intrinsic_load_ubo:
305 case nir_intrinsic_load_front_face:
306 break;
307
308 case nir_intrinsic_discard:
309 case nir_intrinsic_discard_if:
310 /* We are adding two dependencies:
311 *
312 * * An individual one that we could use to add a read_dep while handling
313 * nir_instr_type_tex
314 *
315 * * Include it on the unknown intrinsic set, as we want discard to be
316 * serialized in the same order relative to intervening stores or
317 * atomic accesses to SSBOs and images
318 */
319 add_write_dep(state, &state->discard, n);
320 add_write_dep(state, &state->unknown_intrinsic, n);
321 break;
322
323 case nir_intrinsic_store_output:
324 /* For some hardware and stages, output stores affect the same shared
325 * memory as input loads.
326 */
327 if ((state->scoreboard->options->stages_with_shared_io_memory &
328 (1 << state->scoreboard->shader->info.stage)))
329 add_write_dep(state, &state->load_input, n);
330
331 /* Make sure that preceding discards stay before the store_output */
332 add_read_dep(state, state->discard, n);
333
334 break;
335
336 case nir_intrinsic_load_input:
337 case nir_intrinsic_load_per_vertex_input:
338 add_read_dep(state, state->load_input, n);
339 break;
340
341 case nir_intrinsic_load_shared:
342 /* Don't move load_shared beyond a following store_shared, as the store
343 * could change the loaded value.
344 */
345 add_read_dep(state, state->store_shared, n);
346 break;
347
348 case nir_intrinsic_store_shared:
349 add_write_dep(state, &state->store_shared, n);
350 break;
351
352 case nir_intrinsic_control_barrier:
353 case nir_intrinsic_memory_barrier_shared:
354 add_write_dep(state, &state->store_shared, n);
355
356 /* Serialize against ssbos/atomics/etc. */
357 add_write_dep(state, &state->unknown_intrinsic, n);
358 break;
359
360 default:
361 /* Attempt to handle other intrinsics that we haven't individually
362 * categorized by serializing them in the same order relative to each
363 * other.
364 */
365 add_write_dep(state, &state->unknown_intrinsic, n);
366 break;
367 }
368 }
369
370 /**
371 * Common code for dependencies that need to be tracked both forward and
372 * backward.
373 *
374 * This is for things like "all reads of r4 have to happen between the r4
375 * writes that surround them".
376 */
377 static void
378 nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
379 {
380 nir_instr *instr = n->instr;
381
382 /* For NIR SSA defs, we only need to do a single pass of making the uses
383 * depend on the def.
384 */
385 if (state->dir == F)
386 nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
387
388 /* For NIR regs, track the last writer in the scheduler state so that we
389 * can keep the writes in order and let reads get reordered only between
390 * each write.
391 */
392 nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
393
394 nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
395
396 /* Make sure any other instructions keep their positions relative to
397 * jumps.
398 */
399 if (instr->type != nir_instr_type_jump)
400 add_read_dep(state, state->jump, n);
401
402 switch (instr->type) {
403 case nir_instr_type_ssa_undef:
404 case nir_instr_type_load_const:
405 case nir_instr_type_alu:
406 case nir_instr_type_deref:
407 break;
408
409 case nir_instr_type_tex:
410 /* Don't move texture ops before a discard, as that could increase
411 * memory bandwidth for reading the discarded samples.
412 */
413 add_read_dep(state, state->discard, n);
414 break;
415
416 case nir_instr_type_jump:
417 add_write_dep(state, &state->jump, n);
418 break;
419
420 case nir_instr_type_call:
421 unreachable("Calls should have been lowered");
422 break;
423
424 case nir_instr_type_parallel_copy:
425 unreachable("Parallel copies should have been lowered");
426 break;
427
428 case nir_instr_type_phi:
429 unreachable("nir_schedule() should be called after lowering from SSA");
430 break;
431
432 case nir_instr_type_intrinsic:
433 nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
434 break;
435 }
436 }
437
438 static void
439 calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
440 {
441 nir_deps_state state = {
442 .scoreboard = scoreboard,
443 .dir = F,
444 .reg_map = _mesa_pointer_hash_table_create(NULL),
445 };
446
447 nir_foreach_instr(instr, block) {
448 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
449 instr);
450 nir_schedule_calculate_deps(&state, node);
451 }
452
453 ralloc_free(state.reg_map);
454 }
455
456 static void
457 calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
458 {
459 nir_deps_state state = {
460 .scoreboard = scoreboard,
461 .dir = R,
462 .reg_map = _mesa_pointer_hash_table_create(NULL),
463 };
464
465 nir_foreach_instr_reverse(instr, block) {
466 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
467 instr);
468 nir_schedule_calculate_deps(&state, node);
469 }
470
471 ralloc_free(state.reg_map);
472 }
473
474 typedef struct {
475 nir_schedule_scoreboard *scoreboard;
476 int regs_freed;
477 } nir_schedule_regs_freed_state;
478
479 static bool
480 nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
481 {
482 nir_schedule_regs_freed_state *state = in_state;
483 nir_schedule_scoreboard *scoreboard = state->scoreboard;
484 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
485
486 if (remaining_uses->entries == 1 &&
487 _mesa_set_search(remaining_uses, src->parent_instr)) {
488 state->regs_freed += nir_schedule_src_pressure(src);
489 }
490
491 return true;
492 }
493
494 static bool
495 nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
496 {
497 nir_schedule_regs_freed_state *state = in_state;
498
499 state->regs_freed -= nir_schedule_def_pressure(def);
500
501 return true;
502 }
503
504 static bool
505 nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
506 {
507 nir_schedule_regs_freed_state *state = in_state;
508 nir_schedule_scoreboard *scoreboard = state->scoreboard;
509
510 if (dest->is_ssa)
511 return true;
512
513 nir_register *reg = dest->reg.reg;
514
515 /* Only the first def of a reg counts against register pressure. */
516 if (!_mesa_set_search(scoreboard->live_values, reg))
517 state->regs_freed -= nir_schedule_dest_pressure(dest);
518
519 return true;
520 }
521
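/* Estimates the net change in register pressure from scheduling this
 * instruction now: channels freed by consuming the last remaining use of a
 * value, minus channels made live by the values it defines.
 */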
522 static int
523 nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
524 {
525 nir_schedule_regs_freed_state state = {
526 .scoreboard = scoreboard,
527 };
528
529 nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
530
531 nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
532
533 nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
534
535 return state.regs_freed;
536 }
537
538 /**
539 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
540 * Scheduling for Parallelism) heuristic.
541 *
542 * Picks an instruction on the critical path that's ready to execute without
543 * stalls, if possible, otherwise picks the instruction on the critical path.
544 */
545 static nir_schedule_node *
546 nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
547 {
548 nir_schedule_node *chosen = NULL;
549
550 /* Find the leader in the ready (shouldn't-stall) set with the maximum
551 * cost.
552 */
553 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
554 if (scoreboard->time < n->ready_time)
555 continue;
556
557 if (!chosen || chosen->max_delay < n->max_delay)
558 chosen = n;
559 }
560 if (chosen) {
561 if (debug) {
562 fprintf(stderr, "chose (ready): ");
563 nir_print_instr(chosen->instr, stderr);
564 fprintf(stderr, "\n");
565 }
566
567 return chosen;
568 }
569
570 /* Otherwise, choose the leader with the maximum cost. */
571 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
572 if (!chosen || chosen->max_delay < n->max_delay)
573 chosen = n;
574 }
575 if (debug) {
576 fprintf(stderr, "chose (leader): ");
577 nir_print_instr(chosen->instr, stderr);
578 fprintf(stderr, "\n");
579 }
580
581 return chosen;
582 }
583
584 /**
585 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
586 * Scheduling for Register pressure) heuristic.
587 */
588 static nir_schedule_node *
589 nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
590 {
591 nir_schedule_node *chosen = NULL;
592
593 /* Find a ready inst with regs freed and pick the one with max cost. */
594 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
595 if (n->ready_time > scoreboard->time)
596 continue;
597
598 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
599
600 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
601 chosen = n;
602 }
603 }
604 if (chosen) {
605 if (debug) {
606 fprintf(stderr, "chose (freed+ready): ");
607 nir_print_instr(chosen->instr, stderr);
608 fprintf(stderr, "\n");
609 }
610
611 return chosen;
612 }
613
614 /* Find a leader with regs freed and pick the one with max cost. */
615 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
616 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
617
618 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
619 chosen = n;
620 }
621 }
622 if (chosen) {
623 if (debug) {
624 fprintf(stderr, "chose (regs freed): ");
625 nir_print_instr(chosen->instr, stderr);
626 fprintf(stderr, "\n");
627 }
628
629 return chosen;
630 }
631
632 /* Find a partially evaluated path and try to finish it off */
633 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
634 if (n->partially_evaluated_path &&
635 (!chosen || chosen->max_delay < n->max_delay)) {
636 chosen = n;
637 }
638 }
639 if (chosen) {
640 if (debug) {
641 fprintf(stderr, "chose (partial path): ");
642 nir_print_instr(chosen->instr, stderr);
643 fprintf(stderr, "\n");
644 }
645
646 return chosen;
647 }
648
649 /* Contra the paper, pick a leader with no effect on used regs. This may
650 * open up new opportunities, as otherwise a single-operand instr consuming
651 * a value will tend to keep us from ever freeing that value. This had a
652 * massive effect on reducing spilling on V3D.
653 *
654 * XXX: Should this prioritize ready?
655 */
656 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
657 if (nir_schedule_regs_freed(scoreboard, n) != 0)
658 continue;
659
660 if (!chosen || chosen->max_delay < n->max_delay)
661 chosen = n;
662 }
663 if (chosen) {
664 if (debug) {
665 fprintf(stderr, "chose (regs no-op): ");
666 nir_print_instr(chosen->instr, stderr);
667 fprintf(stderr, "\n");
668 }
669
670 return chosen;
671 }
672
673 /* Pick the max delay of the remaining ready set. */
674 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
675 if (n->ready_time > scoreboard->time)
676 continue;
677 if (!chosen || chosen->max_delay < n->max_delay)
678 chosen = n;
679 }
680 if (chosen) {
681 if (debug) {
682 fprintf(stderr, "chose (ready max delay): ");
683 nir_print_instr(chosen->instr, stderr);
684 fprintf(stderr, "\n");
685 }
686 return chosen;
687 }
688
689 /* Pick the max delay of the remaining leaders. */
690 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
691 if (!chosen || chosen->max_delay < n->max_delay)
692 chosen = n;
693 }
694
695 if (debug) {
696 fprintf(stderr, "chose (max delay): ");
697 nir_print_instr(chosen->instr, stderr);
698 fprintf(stderr, "\n");
699 }
700
701 return chosen;
702 }
703
704 static void
705 dump_state(nir_schedule_scoreboard *scoreboard)
706 {
707 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
708 fprintf(stderr, "maxdel %5d ", n->max_delay);
709 nir_print_instr(n->instr, stderr);
710 fprintf(stderr, "\n");
711
712 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
713 nir_schedule_node *child = (nir_schedule_node *)edge->child;
714
715 fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
716 nir_print_instr(child->instr, stderr);
717 fprintf(stderr, "\n");
718 }
719 }
720 }
721
722 static void
723 nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
724 void *reg_or_def,
725 nir_instr *reg_or_def_parent,
726 int pressure)
727 {
728 /* Make the value live if it's the first time it's been used. */
729 if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
730 _mesa_set_add(scoreboard->live_values, reg_or_def);
731 scoreboard->pressure += pressure;
732 }
733
734 /* Make the value dead if this was its last remaining use. Be careful not to
735 * decrement pressure twice when one instruction uses a value twice.
736 */
737 struct set *remaining_uses =
738 _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
739 struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
740 if (entry) {
741 _mesa_set_remove(remaining_uses, entry);
742
743 if (remaining_uses->entries == 0)
744 scoreboard->pressure -= pressure;
745 }
746 }
747
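/* Updates liveness for one source of a just-scheduled instruction, and marks
 * the other users of the same SSA value as being on a partially evaluated
 * path so the CSR heuristic will try to finish consuming that value.
 */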
748 static bool
749 nir_schedule_mark_src_scheduled(nir_src *src, void *state)
750 {
751 nir_schedule_scoreboard *scoreboard = state;
752 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
753
754 struct set_entry *entry = _mesa_set_search(remaining_uses,
755 src->parent_instr);
756 if (entry) {
757 /* Once we've used an SSA value in one instruction, bump the priority of
758 * the other uses so the SSA value can get fully consumed.
759 *
760 * We don't do this for registers, as it would be a hassle and it's
761 * unclear whether it would help. Also, skip it for constants, as
762 * they're often folded as immediates into backend instructions and have
763 * many unrelated instructions all referencing the same value (0).
764 */
765 if (src->is_ssa &&
766 src->ssa->parent_instr->type != nir_instr_type_load_const) {
767 nir_foreach_use(other_src, src->ssa) {
768 if (other_src->parent_instr == src->parent_instr)
769 continue;
770
771 nir_schedule_node *n =
772 nir_schedule_get_node(scoreboard->instr_map,
773 other_src->parent_instr);
774
775 if (n && !n->partially_evaluated_path) {
776 if (debug) {
777 fprintf(stderr, " New partially evaluated path: ");
778 nir_print_instr(n->instr, stderr);
779 fprintf(stderr, "\n");
780 }
781
782 n->partially_evaluated_path = true;
783 }
784 }
785 }
786 }
787
788 nir_schedule_mark_use(scoreboard,
789 src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
790 src->parent_instr,
791 nir_schedule_src_pressure(src));
792
793 return true;
794 }
795
796 static bool
797 nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
798 {
799 nir_schedule_scoreboard *scoreboard = state;
800
801 nir_schedule_mark_use(scoreboard, def, def->parent_instr,
802 nir_schedule_def_pressure(def));
803
804 return true;
805 }
806
807 static bool
808 nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
809 {
810 nir_schedule_scoreboard *scoreboard = state;
811
812 /* SSA defs were handled in nir_schedule_mark_def_scheduled()
813 */
814 if (dest->is_ssa)
815 return true;
816
817 /* XXX: This is not actually accurate for regs -- the last use of a reg may
818 * have a live interval that extends across control flow. We should
819 * calculate the live ranges of regs, and have scheduler nodes for the CF
820 * nodes that also "use" the reg.
821 */
822 nir_schedule_mark_use(scoreboard, dest->reg.reg,
823 dest->reg.parent_instr,
824 nir_schedule_dest_pressure(dest));
825
826 return true;
827 }
828
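/* Accounts for a chosen instruction in the scoreboard: updates pressure and
 * liveness for its sources and defs, propagates ready times to its children,
 * prunes it from the DAG, and advances the scoreboard's time.
 */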
829 static void
830 nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
831 nir_schedule_node *n)
832 {
833 nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
834 nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
835 nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
836
837 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
838 nir_schedule_node *child = (nir_schedule_node *)edge->child;
839
840 child->ready_time = MAX2(child->ready_time,
841 scoreboard->time + n->delay);
842
843 if (child->dag.parent_count == 1) {
844 if (debug) {
845 fprintf(stderr, " New DAG head: ");
846 nir_print_instr(child->instr, stderr);
847 fprintf(stderr, "\n");
848 }
849 }
850 }
851
852 dag_prune_head(scoreboard->dag, &n->dag);
853
854 scoreboard->time = MAX2(n->ready_time, scoreboard->time);
855 scoreboard->time++;
856 }
857
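/* Main scheduling loop for a block: while DAG heads remain, picks one with
 * the CSP heuristic when pressure is below the backend's threshold and with
 * the CSR heuristic otherwise, then moves the chosen instruction to the end
 * of the block's instruction list.
 */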
858 static void
859 nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
860 {
861 while (!list_is_empty(&scoreboard->dag->heads)) {
862 if (debug) {
863 fprintf(stderr, "current list:\n");
864 dump_state(scoreboard);
865 }
866
867 nir_schedule_node *chosen;
868 if (scoreboard->pressure < scoreboard->options->threshold)
869 chosen = nir_schedule_choose_instruction_csp(scoreboard);
870 else
871 chosen = nir_schedule_choose_instruction_csr(scoreboard);
872
873 /* Now that we've scheduled a new instruction, some of its children may
874 * be promoted to the list of instructions ready to be scheduled.
875 */
876 nir_schedule_mark_node_scheduled(scoreboard, chosen);
877
878 /* Move the instruction to the end (so our first chosen instructions are
879 * the start of the program).
880 */
881 exec_node_remove(&chosen->instr->node);
882 exec_list_push_tail(&block->instr_list, &chosen->instr->node);
883
884 if (debug)
885 fprintf(stderr, "\n");
886 }
887 }
888
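/* Rough per-instruction latency estimate used to stretch out the live ranges
 * of long-latency operations; only texture ops are currently given a large
 * delay.
 */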
889 static uint32_t
890 nir_schedule_get_delay(nir_instr *instr)
891 {
892 switch (instr->type) {
893 case nir_instr_type_ssa_undef:
894 case nir_instr_type_load_const:
895 case nir_instr_type_alu:
896 case nir_instr_type_deref:
897 case nir_instr_type_jump:
898 case nir_instr_type_parallel_copy:
899 case nir_instr_type_call:
900 case nir_instr_type_phi:
901 return 1;
902
903 case nir_instr_type_intrinsic:
904 /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
905 return 1;
906
907 case nir_instr_type_tex:
908 /* Pick some large number to try to fetch textures early and sample them
909 * late.
910 */
911 return 100;
912 }
913
914 return 0;
915 }
916
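/* Bottom-up DAG callback that computes each node's max_delay as the cost of
 * the maximum-delay path from that node to the leaves.
 */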
917 static void
918 nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
919 {
920 nir_schedule_node *n = (nir_schedule_node *)node;
921 uint32_t max_delay = 0;
922
923 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
924 nir_schedule_node *child = (nir_schedule_node *)edge->child;
925 max_delay = MAX2(child->max_delay, max_delay);
926 }
927
928 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
929 }
930
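/* Schedules a single basic block: builds a DAG node per instruction,
 * calculates forward and reverse dependencies, computes max_delay bottom-up,
 * and then re-emits the instructions in the chosen order.
 */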
931 static void
932 nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
933 {
934 void *mem_ctx = ralloc_context(NULL);
935 scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
936
937 scoreboard->dag = dag_create(mem_ctx);
938
939 nir_foreach_instr(instr, block) {
940 nir_schedule_node *n =
941 rzalloc(mem_ctx, nir_schedule_node);
942
943 n->instr = instr;
944 n->delay = nir_schedule_get_delay(instr);
945 dag_init_node(scoreboard->dag, &n->dag);
946
947 _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
948 }
949
950 calculate_forward_deps(scoreboard, block);
951 calculate_reverse_deps(scoreboard, block);
952
953 dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
954
955 nir_schedule_instructions(scoreboard, block);
956
957 ralloc_free(mem_ctx);
958 scoreboard->instr_map = NULL;
959 }
960
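/* Seeds the remaining_uses set for an SSA def with its defining instruction
 * and all of its users.
 */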
961 static bool
962 nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
963 {
964 nir_schedule_scoreboard *scoreboard = state;
965 struct set *def_uses = _mesa_pointer_set_create(scoreboard);
966
967 _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
968
969 _mesa_set_add(def_uses, def->parent_instr);
970
971 nir_foreach_use(src, def) {
972 _mesa_set_add(def_uses, src->parent_instr);
973 }
974
975 /* XXX: Handle if uses */
976
977 return true;
978 }
979
980 static nir_schedule_scoreboard *
981 nir_schedule_get_scoreboard(nir_shader *shader,
982 const nir_schedule_options *options)
983 {
984 nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
985
986 scoreboard->shader = shader;
987 scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
988 scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
989 scoreboard->options = options;
990 scoreboard->pressure = 0;
991
992 nir_foreach_function(function, shader) {
993 nir_foreach_register(reg, &function->impl->registers) {
994 struct set *register_uses =
995 _mesa_pointer_set_create(scoreboard);
996
997 _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
998
999 nir_foreach_use(src, reg) {
1000 _mesa_set_add(register_uses, src->parent_instr);
1001 }
1002
1003 /* XXX: Handle if uses */
1004
1005 nir_foreach_def(dest, reg) {
1006 _mesa_set_add(register_uses, dest->reg.parent_instr);
1007 }
1008 }
1009
1010 nir_foreach_block(block, function->impl) {
1011 nir_foreach_instr(instr, block) {
1012 nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
1013 scoreboard);
1014 }
1015
1016 /* XXX: We're ignoring if uses, which may prioritize scheduling other
1017 * uses of the if src even when it doesn't help. That's not many
1018 * values, though, so meh.
1019 */
1020 }
1021 }
1022
1023 return scoreboard;
1024 }
1025
1026 static void
1027 nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
1028 {
1029 #ifdef NDEBUG
1030 return;
1031 #endif
1032
1033 bool any_uses = false;
1034
1035 hash_table_foreach(scoreboard->remaining_uses, entry) {
1036 struct set *remaining_uses = entry->data;
1037
1038 set_foreach(remaining_uses, instr_entry) {
1039 if (!any_uses) {
1040 fprintf(stderr, "Tracked uses remain after scheduling. "
1041 "Affected instructions: \n");
1042 any_uses = true;
1043 }
1044 nir_print_instr(instr_entry->key, stderr);
1045 fprintf(stderr, "\n");
1046 }
1047 }
1048
1049 assert(!any_uses);
1050 }
1051
1052 /**
1053 * Schedules the NIR instructions to try to decrease stalls (for example,
1054 * delaying texture reads) while managing register pressure.
1055 *
1056 * The threshold represents "number of NIR register/SSA def channels live
1057 * before switching the scheduling heuristic to reduce register pressure",
1058 * since most of our GPU architectures are scalar (extending to vector with a
1059 * flag wouldn't be hard). This number should be a bit below the number of
1060 * registers available (counting any that may be occupied by system value
1061 * payload values, for example), since the heuristic may not always be able to
1062 * free a register immediately. The amount below the limit is up to you to
1063 * tune.
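 *
 * A hypothetical invocation from a backend might look like the following
 * (the threshold and stage mask are made-up illustrative values, not
 * recommendations):
 *
 *    nir_schedule_options options = {
 *       .threshold = 64,
 *       .stages_with_shared_io_memory = (1 << MESA_SHADER_GEOMETRY),
 *    };
 *    nir_schedule(shader, &options);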
1064 */
1065 void
1066 nir_schedule(nir_shader *shader,
1067 const nir_schedule_options *options)
1068 {
1069 nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
1070 options);
1071
1072 if (debug) {
1073 fprintf(stderr, "NIR shader before scheduling:\n");
1074 nir_print_shader(shader, stderr);
1075 }
1076
1077 nir_foreach_function(function, shader) {
1078 if (!function->impl)
1079 continue;
1080
1081 nir_foreach_block(block, function->impl) {
1082 nir_schedule_block(scoreboard, block);
1083 }
1084 }
1085
1086 nir_schedule_validate_uses(scoreboard);
1087
1088 ralloc_free(scoreboard);
1089 }