/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"
/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf).  We
 * make up the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics).  Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results.  The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results, because that instr usually
 * won't be the only user of the result, so scheduling it increases pressure.
 *
 * If you always force consumption of tex results, then in a case where a
 * single sample is used for many outputs you'll end up picking every other
 * user and expanding register pressure anyway.  The partially_evaluated_path
 * flag helps tremendously: if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample.  Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */
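
/* For illustration, a sketch of the problem case above (hypothetical NIR,
 * not taken from any real shader): a single tex result feeding several
 * independent users.
 *
 *    vec4 32 ssa_1 = tex ...
 *    vec1 32 ssa_2 = fmul ssa_1.x, ...
 *    vec1 32 ssa_3 = fmul ssa_1.y, ...
 *    vec1 32 ssa_4 = fmul ssa_1.z, ...
 *    vec1 32 ssa_5 = fmul ssa_1.w, ...
 *
 * Scheduling any one fmul leaves ssa_1's four channels live (it's not the
 * last use), so the pressure heuristic would avoid all of them.  Once one is
 * picked anyway, the remaining users get flagged partially_evaluated_path so
 * the scheduler finishes consuming ssa_1 and frees the whole vector.
 */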
static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;
typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled using the register.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_scheduler_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;
/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };
struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};
typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for the last instruction involved in some class of
    * dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;
static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}
static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}
static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}
static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}

static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}
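
/* Note that the pressure charged for a source is the full num_components of
 * the SSA def or register it reads: e.g. reading just the .x of a vec4 still
 * keeps all four channels live.
 */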
/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * We add @before as a child of @after, so that DAG heads are the outputs of
 * the program and we make our scheduling decisions bottom to top.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, NULL);
   else
      dag_add_edge(&after->dag, &before->dag, NULL);
}
static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}
static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}
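
/* A sketch of how these combine for a register r4 (hypothetical
 * instructions):
 *
 *    write r4   <- becomes the tracked last writer (*before)
 *    read  r4   <- add_read_dep() against the last writer
 *    read  r4   <- likewise; the two reads stay unordered w.r.t. each other
 *    write r4   <- add_write_dep(): ordered after the previous writer (and,
 *                  via the reverse pass, after the reads), then replaces it
 *
 * so reads can reorder freely between the writes that surround them, but
 * never across them.
 */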
static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;

   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}
static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }

   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}
static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}
static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}
static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while
       *   handling texture instructions.
       *
       * * Include it on the unknown intrinsic set, as we want discard to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images.
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
      /* Don't move load_shared beyond a following store_shared, as it could
       * change their value.
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}
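
/* A minimal sketch of the backend hook handled above, assuming a backend
 * that wants all of its tile-buffer loads serialized against each other.
 * The callback and intrinsic names are made up for illustration; only the
 * nir_schedule_options fields and the dependency fields used above are taken
 * from this file.
 *
 *    static bool
 *    my_backend_intrinsic_cb(nir_intrinsic_instr *intr,
 *                            nir_schedule_dependency *dep, void *data)
 *    {
 *       if (intr->intrinsic != nir_intrinsic_load_tile_buffer)  [hypothetical]
 *          return false;
 *
 *       dep->klass = 0;
 *       dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
 *       return true;
 *    }
 */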
/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}
static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}
static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}
typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;
static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}
static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}
static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;

   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}
static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);

   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);

   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}
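
/* Worked example (hypothetical): for "ssa_3 = fadd ssa_1, ssa_2", where this
 * is the last remaining use of the scalar values ssa_1 and ssa_2, the two
 * src callbacks contribute +1 each and the def callback contributes -1, so
 * regs_freed is +1 and the CSR heuristic below will favor the instruction.
 */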
/**
 * Chooses an instruction that will minimise the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}
/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible, otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}
/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs.  This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block freeing that value.  This had a massive
    * effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (max delay): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}
static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}
static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use.  Be careful not to
    * decrement pressure twice when one instruction uses a value twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}
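
/* E.g. for a (hypothetical) "ssa_2 = fmul ssa_1, ssa_1", both src callbacks
 * pass the same parent instruction: the first call removes the set entry,
 * the second finds no entry, and pressure is only decremented once.
 */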
static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear if that would help or not.  Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, "  New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}
static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}
static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled() */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow.  We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}
static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, "  New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}
static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}
static uint32_t
nir_schedule_get_delay(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
      return 1;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}
static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
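
/* E.g. for a (hypothetical) chain tex -> fmul -> fadd with delays 100, 1 and
 * 1: traversed bottom-up, the fadd gets max_delay 1, the fmul 2, and the tex
 * 102, making the tex the critical path for the choosers above.
 */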
static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}
static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}
static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }
      }

      /* XXX: We're ignoring if uses, which may prioritize scheduling other
       * uses of the if src even when it doesn't help.  That's not many
       * values, though, so meh.
       */
   }

   return scoreboard;
}
static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling.  "
                    "Affected instructions:\n");
            any_uses = true;
         }

         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}
/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents the "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard).  This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately.  The amount below the limit is up to you to
 * tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}
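
/* Example backend usage (a sketch; the threshold and stage mask are made-up
 * values that a backend would tune for its register file):
 *
 *    const nir_schedule_options schedule_options = {
 *       .threshold = 64,
 *       .stages_with_shared_io_memory = (1 << MESA_SHADER_VERTEX),
 *    };
 *
 *    nir_schedule(s, &schedule_options);
 */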