/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
26 #include "aco_builder.h"
27 #include <unordered_set>
30 #include "vulkan/radv_shader.h" // for radv_nir_compiler_options
31 #include "amdgfxregs.h"
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
namespace aco {

enum MoveResult {
   move_success,
   move_fail_ssa,
   move_fail_rar,
   move_fail_pressure,
};

struct MoveState {
   RegisterDemand max_registers;

   Block *block;
   Instruction *current;
   RegisterDemand *register_demand;
   bool improved_rar;

   std::vector<bool> depends_on;
   /* Two are needed because, for downwards VMEM scheduling, one needs to
    * exclude the instructions in the clause, since new instructions in the
    * clause are not moved past any other instructions in the clause. */
   std::vector<bool> RAR_dependencies;
   std::vector<bool> RAR_dependencies_clause;

   int source_idx;
   int insert_idx, insert_idx_clause;
   RegisterDemand total_demand, total_demand_clause;

   /* for moving instructions before the current instruction to after it */
   void downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
   MoveResult downwards_move(bool clause);
   void downwards_skip();

   /* for moving instructions after the first use of the current instruction upwards */
   void upwards_init(int source_idx, bool improved_rar);
   bool upwards_check_deps();
   void upwards_set_insert_idx(int before);
   MoveResult upwards_move();
   void upwards_skip();

private:
   void downwards_advance_helper();
};

struct sched_ctx {
   MoveState mv;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};
/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, it tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler cares about the current register pressure:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */
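/* The bound is MoveState::max_registers, which schedule_program() derives
 * from the wave count it targets: if, say, get_addr_vgpr_from_waves() yields
 * 36 addressable VGPRs for that wave count (an illustrative value, not a
 * fixed one), moves are rejected once VGPR demand would exceed 34, because
 * two VGPRs are kept as slack. */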
template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(begin_it, idx);
      auto end = std::next(begin_it, before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(begin_it, before);
      auto end = std::next(begin_it, idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
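/* Illustration of move_element()'s rotate semantics on a plain vector (the
 * values are hypothetical):
 *
 *    std::vector<int> v = {10, 11, 12, 13, 14, 15};
 *    move_element(v.begin(), 4, 1); // moves index 4 upwards: {10, 14, 11, 12, 13, 15}
 *    move_element(v.begin(), 1, 5); // moves index 1 downwards: {10, 11, 12, 13, 14, 15}
 *
 * For idx < before, the element lands at index before - 1, i.e. before acts
 * as an exclusive bound, which is how insert_idx is used below. */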
void MoveState::downwards_advance_helper()
{
   source_idx--;
   total_demand.update(register_demand[source_idx]);
}
void MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
   improved_rar = improved_rar_;
   source_idx = current_idx;

   insert_idx = current_idx + 1;
   insert_idx_clause = current_idx;

   total_demand = total_demand_clause = register_demand[current_idx];

   std::fill(depends_on.begin(), depends_on.end(), false);
   if (improved_rar) {
      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
      if (may_form_clauses)
         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
   }

   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill())
            RAR_dependencies[op.tempId()] = true;
      }
   }

   /* update total_demand/source_idx */
   downwards_advance_helper();
}
MoveResult MoveState::downwards_move(bool clause)
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Definition& def : instr->definitions)
      if (def.isTemp() && depends_on[def.tempId()])
         return move_fail_ssa;

   /* check if one of candidate's operands is killed by a depending instruction */
   std::vector<bool>& RAR_deps = improved_rar ? (clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && RAR_deps[op.tempId()]) {
         // FIXME: account for difference in register pressure
         return move_fail_rar;
      }
   }

   if (clause) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp()) {
            depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               RAR_dependencies[op.tempId()] = true;
         }
      }
   }

   int dest_insert_idx = clause ? insert_idx_clause : insert_idx;
   RegisterDemand register_pressure = clause ? total_demand_clause : total_demand;

   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate below the memory load */
   move_element(block->instructions.begin(), source_idx, dest_insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, dest_insert_idx);
   for (int i = source_idx; i < dest_insert_idx - 1; i++)
      register_demand[i] -= candidate_diff;
   register_demand[dest_insert_idx - 1] = new_demand;
   total_demand_clause -= candidate_diff;
   insert_idx_clause--;
   if (!clause) {
      total_demand -= candidate_diff;
      insert_idx--;
   }

   downwards_advance_helper();

   return move_success;
}
void MoveState::downwards_skip()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Operand& op : instr->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill()) {
            RAR_dependencies[op.tempId()] = true;
            RAR_dependencies_clause[op.tempId()] = true;
         }
      }
   }
   total_demand_clause.update(register_demand[source_idx]);

   downwards_advance_helper();
}
void MoveState::upwards_init(int source_idx_, bool improved_rar_)
{
   source_idx = source_idx_;
   improved_rar = improved_rar_;

   insert_idx = -1;

   std::fill(depends_on.begin(), depends_on.end(), false);
   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);

   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         depends_on[def.tempId()] = true;
   }
}
bool MoveState::upwards_check_deps()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return false;
   }
   return true;
}
void MoveState::upwards_set_insert_idx(int before)
{
   insert_idx = before;
   total_demand = register_demand[before - 1];
}
MoveResult MoveState::upwards_move()
{
   assert(insert_idx >= 0);

   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return move_fail_ssa;
   }

   /* check if candidate uses/kills an operand which is used by a dependency */
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
         return move_fail_rar;
   }

   /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(total_demand + candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate above the insert_idx */
   move_element(block->instructions.begin(), source_idx, insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, insert_idx);
   for (int i = insert_idx + 1; i <= source_idx; i++)
      register_demand[i] += candidate_diff;
   register_demand[insert_idx] = new_demand;
   total_demand += candidate_diff;

   insert_idx++;

   total_demand.update(register_demand[source_idx]);
   source_idx++;

   return move_success;
}
void MoveState::upwards_skip()
{
   if (insert_idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[source_idx];
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            depends_on[def.tempId()] = true;
      }
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            RAR_dependencies[op.tempId()] = true;
      }
      total_demand.update(register_demand[source_idx]);
   }

   source_idx++;
}
bool is_gs_or_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}
bool is_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}
memory_sync_info get_sync_info_with_hack(const Instruction* instr)
{
   memory_sync_info sync = get_sync_info(instr);
   if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
      // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
      sync.storage = (storage_class)(sync.storage | storage_buffer);
      sync.semantics = (memory_semantics)(sync.semantics | semantic_private);
   }
   return sync;
}
struct memory_event_set {
   bool has_control_barrier;

   unsigned bar_acquire;
   unsigned bar_release;
   unsigned bar_classes;

   unsigned access_acquire;
   unsigned access_release;
   unsigned access_relaxed;
   unsigned access_atomic;
};
struct hazard_query {
   bool contains_spill;
   bool contains_sendmsg;
   memory_event_set mem_events;
   unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
   unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};
void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->contains_sendmsg = false;
   memset(&query->mem_events, 0, sizeof(query->mem_events));
   query->aliasing_storage = 0;
   query->aliasing_storage_smem = 0;
}
void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
{
   set->has_control_barrier |= is_done_sendmsg(instr);
   if (instr->opcode == aco_opcode::p_barrier) {
      Pseudo_barrier_instruction *bar = static_cast<Pseudo_barrier_instruction*>(instr);
      if (bar->sync.semantics & semantic_acquire)
         set->bar_acquire |= bar->sync.storage;
      if (bar->sync.semantics & semantic_release)
         set->bar_release |= bar->sync.storage;
      set->bar_classes |= bar->sync.storage;

      set->has_control_barrier |= bar->exec_scope > scope_invocation;
   }

   if (!sync->storage)
      return;

   if (sync->semantics & semantic_acquire)
      set->access_acquire |= sync->storage;
   if (sync->semantics & semantic_release)
      set->access_release |= sync->storage;

   if (!(sync->semantics & semantic_private)) {
      if (sync->semantics & semantic_atomic)
         set->access_atomic |= sync->storage;
      else
         set->access_relaxed |= sync->storage;
   }
}
void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;
   query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;

   memory_sync_info sync = get_sync_info_with_hack(instr);

   add_memory_event(&query->mem_events, instr, &sync);

   if (!(sync.semantics & semantic_can_reorder)) {
      unsigned storage = sync.storage;
      /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      if (instr->format == Format::SMEM)
         query->aliasing_storage_smem |= storage;
      else
         query->aliasing_storage |= storage;
   }
}
enum HazardResult {
   hazard_success,
   hazard_fail_reorder_vmem_smem,
   hazard_fail_reorder_ds,
   hazard_fail_reorder_sendmsg,
   hazard_fail_spill,
   hazard_fail_export,
   hazard_fail_barrier,
   /* Must stop at these failures. The hazard query code doesn't consider them
    * when added. */
   hazard_fail_exec,
   hazard_fail_unreorderable,
};
HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
{
   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {
      if (def.isFixed() && def.physReg() == exec)
         return hazard_fail_exec;
   }

   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return hazard_fail_export;

   /* don't move non-reorderable instructions */
   if (instr->opcode == aco_opcode::s_memtime ||
       instr->opcode == aco_opcode::s_memrealtime ||
       instr->opcode == aco_opcode::s_setprio ||
       instr->opcode == aco_opcode::s_getreg_b32)
      return hazard_fail_unreorderable;

   memory_event_set instr_set;
   memset(&instr_set, 0, sizeof(instr_set));
   memory_sync_info sync = get_sync_info_with_hack(instr);
   add_memory_event(&instr_set, instr, &sync);

   memory_event_set *first = &instr_set;
   memory_event_set *second = &query->mem_events;
   if (upwards)
      std::swap(first, second);

   /* everything after barrier(acquire) happens after the atomics/control_barriers before *
    * everything after load(acquire) happens after the load
    */
   if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
      return hazard_fail_barrier;
   if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
       ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
      return hazard_fail_barrier;

   /* everything before barrier(release) happens before the atomics/control_barriers after *
    * everything before store(release) happens before the store
    */
   if (first->bar_release && (second->has_control_barrier || second->access_atomic))
      return hazard_fail_barrier;
   if ((first->bar_classes && (second->bar_release || second->access_release)) ||
       ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
      return hazard_fail_barrier;

   /* don't move memory barriers around other memory barriers */
   if (first->bar_classes && second->bar_classes)
      return hazard_fail_barrier;

   /* Don't move memory accesses to before control barriers. I don't think
    * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
   unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared;
   if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes))
      return hazard_fail_barrier;

   /* don't move memory loads/stores past potentially aliasing loads/stores */
   unsigned aliasing_storage = instr->format == Format::SMEM ?
                               query->aliasing_storage_smem :
                               query->aliasing_storage;
   if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
      unsigned intersect = sync.storage & aliasing_storage;
      if (intersect & storage_shared)
         return hazard_fail_reorder_ds;
      return hazard_fail_reorder_vmem_smem;
   }

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
      return hazard_fail_reorder_sendmsg;

   return hazard_success;
}
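/* Example of the barrier checks above: when a candidate is moved downwards
 * (upwards == false), "first" describes the candidate and "second" summarizes
 * the instructions it would be moved past. If the candidate performs an atomic
 * access and the query contains a barrier with acquire semantics, the
 * (first->access_atomic && second->bar_acquire) case rejects the move with
 * hazard_fail_barrier, and the callers merely skip the candidate. */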
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* first, check if we have instructions before current to move down */
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   ctx.mv.downwards_init(idx, false, false);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      assert(candidate_idx == ctx.mv.source_idx);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM())
         break;

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      /* don't use LDS/GDS instructions to hide latency since it can
       * significantly worsen LDS scheduling */
      if (candidate->format == Format::DS || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another MEM */
   ctx.mv.upwards_init(idx + 1, false);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps();
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&hq);
            found_dependency = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
         if (res == move_fail_ssa && candidate->isVMEM())
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? ctx.mv.insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
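/* A note on the stall heuristic above: 10 - num_waves - k starts negative when
 * there are enough waves and/or enough independent instructions (k) were
 * placed between the load and its first user to hide latency. Later
 * schedule_SMEM/schedule_VMEM passes increment last_SMEM_stall each time they
 * steal such an instruction, and stop stealing via can_stall_prev_smem once
 * the estimate reaches zero. E.g. with 8 waves and k = 4, the estimate is
 * 10 - 8 - 4 = -2, leaving room for two more steals. */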
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;

   /* first, check if we have instructions before current to move down */
   hazard_query indep_hq;
   hazard_query clause_hq;
   init_hazard_query(&indep_hq);
   init_hazard_query(&clause_hq);
   add_to_hazard_query(&indep_hq, current);

   ctx.mv.downwards_init(idx, true, true);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[0].tempId() == current->operands[0].tempId();
         int grab_dist = ctx.mv.insert_idx_clause - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      if (!can_move_down) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      Instruction *candidate_ptr = candidate.get();
      MoveResult res = ctx.mv.downwards_move(part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (part_of_clause)
         add_to_hazard_query(&indep_hq, candidate_ptr);
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* find the first instruction depending on current or find another VMEM */
   ctx.mv.upwards_init(idx + 1, true);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps();
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.mv.depends_on[def.tempId()] = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}
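/* A worked example of the clause heuristic above: with 8 waves,
 * VMEM_CLAUSE_MAX_GRAB_DIST is (8 - 1) * 8 = 56, so a same-resource load can
 * be pulled into the clause from up to 55 instructions above the clause's
 * insertion point. Fewer waves shrink that distance, since the def-to-use
 * distance lost by moving a load down is then harder to hide. */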
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   ctx.mv.downwards_init(idx, true, false);

   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

      if (haz != hazard_success) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal()) {
         ctx.mv.current = current;
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }

      if (current->format == Format::SMEM) {
         ctx.mv.current = current;
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }
   }

   if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) {
               ctx.mv.current = current;
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
            }
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}
void schedule_program(Program *program, live& live_vars)
{
   /* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
   RegisterDemand demand;
   for (Block& block : program->blocks)
      demand.update(block.register_demand);

   sched_ctx ctx;
   ctx.mv.depends_on.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (demand.vgpr >= 29)
      ctx.num_waves = 5;
   else if (demand.vgpr >= 25)
      ctx.num_waves = 6;
   else
      ctx.num_waves = 7;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->max_waves);

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

} /* end namespace aco */