2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26 #include "aco_builder.h"
27 #include <unordered_set>
30 #include "vulkan/radv_shader.h" // for radv_nir_compiler_options
31 #include "amdgfxregs.h"
33 #define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
34 #define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
35 #define POS_EXP_WINDOW_SIZE 512
36 #define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
37 #define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
38 /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
39 #define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
40 #define POS_EXP_MAX_MOVES 512
52 RegisterDemand max_registers
;
56 RegisterDemand
*register_demand
;
59 std::vector
<bool> depends_on
;
60 /* Two are needed because, for downwards VMEM scheduling, one needs to
61 * exclude the instructions in the clause, since new instructions in the
62 * clause are not moved past any other instructions in the clause. */
63 std::vector
<bool> RAR_dependencies
;
64 std::vector
<bool> RAR_dependencies_clause
;
67 int insert_idx
, insert_idx_clause
;
68 RegisterDemand total_demand
, total_demand_clause
;
70 /* for moving instructions before the current instruction to after it */
71 void downwards_init(int current_idx
, bool improved_rar
, bool may_form_clauses
);
72 MoveResult
downwards_move(bool clause
);
73 void downwards_skip();
75 /* for moving instructions after the first use of the current instruction upwards */
76 void upwards_init(int source_idx
, bool improved_rar
);
77 bool upwards_check_deps();
78 void upwards_set_insert_idx(int before
);
79 MoveResult
upwards_move();
83 void downwards_advance_helper();
88 int16_t last_SMEM_stall
;
89 int last_SMEM_dep_idx
;
93 /* This scheduler is a simple bottom-up pass based on ideas from
94 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
95 * from Xiaohua Shi and Peng Guo.
96 * The basic approach is to iterate over all instructions. When a memory instruction
97 * is encountered it tries to move independent instructions from above and below
98 * between the memory instruction and its first user.
99 * The novelty is that this scheduler cares for the current register pressure:
100 * Instructions will only be moved if the register pressure won't exceed a certain bound.
103 template <typename T
>
104 void move_element(T begin_it
, size_t idx
, size_t before
) {
106 auto begin
= std::next(begin_it
, idx
);
107 auto end
= std::next(begin_it
, before
);
108 std::rotate(begin
, begin
+ 1, end
);
109 } else if (idx
> before
) {
110 auto begin
= std::next(begin_it
, before
);
111 auto end
= std::next(begin_it
, idx
+ 1);
112 std::rotate(begin
, end
- 1, end
);
116 static RegisterDemand
getLiveChanges(aco_ptr
<Instruction
>& instr
)
118 RegisterDemand changes
;
119 for (const Definition
& def
: instr
->definitions
) {
120 if (!def
.isTemp() || def
.isKill())
122 changes
+= def
.getTemp();
125 for (const Operand
& op
: instr
->operands
) {
126 if (!op
.isTemp() || !op
.isFirstKill())
128 changes
-= op
.getTemp();
134 static RegisterDemand
getTempRegisters(aco_ptr
<Instruction
>& instr
)
136 RegisterDemand temp_registers
;
137 for (const Definition
& def
: instr
->definitions
) {
138 if (!def
.isTemp() || !def
.isKill())
140 temp_registers
+= def
.getTemp();
142 return temp_registers
;
145 void MoveState::downwards_advance_helper()
148 total_demand
.update(register_demand
[source_idx
]);
151 void MoveState::downwards_init(int current_idx
, bool improved_rar_
, bool may_form_clauses
)
153 improved_rar
= improved_rar_
;
154 source_idx
= current_idx
;
156 insert_idx
= current_idx
+ 1;
157 insert_idx_clause
= current_idx
;
159 total_demand
= total_demand_clause
= register_demand
[current_idx
];
161 std::fill(depends_on
.begin(), depends_on
.end(), false);
163 std::fill(RAR_dependencies
.begin(), RAR_dependencies
.end(), false);
164 if (may_form_clauses
)
165 std::fill(RAR_dependencies_clause
.begin(), RAR_dependencies_clause
.end(), false);
168 for (const Operand
& op
: current
->operands
) {
170 depends_on
[op
.tempId()] = true;
171 if (improved_rar
&& op
.isFirstKill())
172 RAR_dependencies
[op
.tempId()] = true;
176 /* update total_demand/source_idx */
177 downwards_advance_helper();
180 MoveResult
MoveState::downwards_move(bool clause
)
182 aco_ptr
<Instruction
>& instr
= block
->instructions
[source_idx
];
184 for (const Definition
& def
: instr
->definitions
)
185 if (def
.isTemp() && depends_on
[def
.tempId()])
186 return move_fail_ssa
;
188 /* check if one of candidate's operands is killed by depending instruction */
189 std::vector
<bool>& RAR_deps
= improved_rar
? (clause
? RAR_dependencies_clause
: RAR_dependencies
) : depends_on
;
190 for (const Operand
& op
: instr
->operands
) {
191 if (op
.isTemp() && RAR_deps
[op
.tempId()]) {
192 // FIXME: account for difference in register pressure
193 return move_fail_rar
;
198 for (const Operand
& op
: instr
->operands
) {
200 depends_on
[op
.tempId()] = true;
201 if (op
.isFirstKill())
202 RAR_dependencies
[op
.tempId()] = true;
207 int dest_insert_idx
= clause
? insert_idx_clause
: insert_idx
;
208 RegisterDemand register_pressure
= clause
? total_demand_clause
: total_demand
;
210 const RegisterDemand candidate_diff
= getLiveChanges(instr
);
211 const RegisterDemand temp
= getTempRegisters(instr
);
212 if (RegisterDemand(register_pressure
- candidate_diff
).exceeds(max_registers
))
213 return move_fail_pressure
;
214 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[dest_insert_idx
- 1]);
215 const RegisterDemand new_demand
= register_demand
[dest_insert_idx
- 1] - temp2
+ temp
;
216 if (new_demand
.exceeds(max_registers
))
217 return move_fail_pressure
;
219 /* move the candidate below the memory load */
220 move_element(block
->instructions
.begin(), source_idx
, dest_insert_idx
);
222 /* update register pressure */
223 move_element(register_demand
, source_idx
, dest_insert_idx
);
224 for (int i
= source_idx
; i
< dest_insert_idx
- 1; i
++)
225 register_demand
[i
] -= candidate_diff
;
226 register_demand
[dest_insert_idx
- 1] = new_demand
;
227 total_demand_clause
-= candidate_diff
;
230 total_demand
-= candidate_diff
;
234 downwards_advance_helper();
238 void MoveState::downwards_skip()
240 aco_ptr
<Instruction
>& instr
= block
->instructions
[source_idx
];
242 for (const Operand
& op
: instr
->operands
) {
244 depends_on
[op
.tempId()] = true;
245 if (improved_rar
&& op
.isFirstKill()) {
246 RAR_dependencies
[op
.tempId()] = true;
247 RAR_dependencies_clause
[op
.tempId()] = true;
251 total_demand_clause
.update(register_demand
[source_idx
]);
253 downwards_advance_helper();
256 void MoveState::upwards_init(int source_idx_
, bool improved_rar_
)
258 source_idx
= source_idx_
;
259 improved_rar
= improved_rar_
;
263 std::fill(depends_on
.begin(), depends_on
.end(), false);
264 std::fill(RAR_dependencies
.begin(), RAR_dependencies
.end(), false);
266 for (const Definition
& def
: current
->definitions
) {
268 depends_on
[def
.tempId()] = true;
272 bool MoveState::upwards_check_deps()
274 aco_ptr
<Instruction
>& instr
= block
->instructions
[source_idx
];
275 for (const Operand
& op
: instr
->operands
) {
276 if (op
.isTemp() && depends_on
[op
.tempId()])
282 void MoveState::upwards_set_insert_idx(int before
)
285 total_demand
= register_demand
[before
- 1];
288 MoveResult
MoveState::upwards_move()
290 assert(insert_idx
>= 0);
292 aco_ptr
<Instruction
>& instr
= block
->instructions
[source_idx
];
293 for (const Operand
& op
: instr
->operands
) {
294 if (op
.isTemp() && depends_on
[op
.tempId()])
295 return move_fail_ssa
;
298 /* check if candidate uses/kills an operand which is used by a dependency */
299 for (const Operand
& op
: instr
->operands
) {
300 if (op
.isTemp() && (!improved_rar
|| op
.isFirstKill()) && RAR_dependencies
[op
.tempId()])
301 return move_fail_rar
;
304 /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
305 const RegisterDemand candidate_diff
= getLiveChanges(instr
);
306 const RegisterDemand temp
= getTempRegisters(instr
);
307 if (RegisterDemand(total_demand
+ candidate_diff
).exceeds(max_registers
))
308 return move_fail_pressure
;
309 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
310 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - temp2
+ candidate_diff
+ temp
;
311 if (new_demand
.exceeds(max_registers
))
312 return move_fail_pressure
;
314 /* move the candidate above the insert_idx */
315 move_element(block
->instructions
.begin(), source_idx
, insert_idx
);
317 /* update register pressure */
318 move_element(register_demand
, source_idx
, insert_idx
);
319 for (int i
= insert_idx
+ 1; i
<= source_idx
; i
++)
320 register_demand
[i
] += candidate_diff
;
321 register_demand
[insert_idx
] = new_demand
;
322 total_demand
+= candidate_diff
;
326 total_demand
.update(register_demand
[source_idx
]);
332 void MoveState::upwards_skip()
334 if (insert_idx
>= 0) {
335 aco_ptr
<Instruction
>& instr
= block
->instructions
[source_idx
];
336 for (const Definition
& def
: instr
->definitions
) {
338 depends_on
[def
.tempId()] = true;
340 for (const Operand
& op
: instr
->operands
) {
342 RAR_dependencies
[op
.tempId()] = true;
344 total_demand
.update(register_demand
[source_idx
]);
350 bool can_reorder(Instruction
* candidate
)
352 switch (candidate
->format
) {
354 return static_cast<SMEM_instruction
*>(candidate
)->can_reorder
;
356 return static_cast<MUBUF_instruction
*>(candidate
)->can_reorder
;
358 return static_cast<MIMG_instruction
*>(candidate
)->can_reorder
;
360 return static_cast<MTBUF_instruction
*>(candidate
)->can_reorder
;
363 case Format::SCRATCH
:
364 return static_cast<FLAT_instruction
*>(candidate
)->can_reorder
;
370 bool is_gs_or_done_sendmsg(Instruction
*instr
)
372 if (instr
->opcode
== aco_opcode::s_sendmsg
) {
373 uint16_t imm
= static_cast<SOPP_instruction
*>(instr
)->imm
;
374 return (imm
& sendmsg_id_mask
) == _sendmsg_gs
||
375 (imm
& sendmsg_id_mask
) == _sendmsg_gs_done
;
380 bool is_done_sendmsg(Instruction
*instr
)
382 if (instr
->opcode
== aco_opcode::s_sendmsg
) {
383 uint16_t imm
= static_cast<SOPP_instruction
*>(instr
)->imm
;
384 return (imm
& sendmsg_id_mask
) == _sendmsg_gs_done
;
389 barrier_interaction
get_barrier_interaction(Instruction
* instr
)
391 switch (instr
->format
) {
393 return static_cast<SMEM_instruction
*>(instr
)->barrier
;
395 return static_cast<MUBUF_instruction
*>(instr
)->barrier
;
397 return static_cast<MIMG_instruction
*>(instr
)->barrier
;
399 return static_cast<MTBUF_instruction
*>(instr
)->barrier
;
402 case Format::SCRATCH
:
403 return static_cast<FLAT_instruction
*>(instr
)->barrier
;
405 return barrier_shared
;
407 if (is_done_sendmsg(instr
))
408 return (barrier_interaction
)(barrier_gs_data
| barrier_gs_sendmsg
);
409 else if (is_gs_or_done_sendmsg(instr
))
410 return barrier_gs_sendmsg
;
413 case Format::PSEUDO_BARRIER
:
414 return barrier_barrier
;
420 barrier_interaction
parse_barrier(Instruction
*instr
)
422 if (instr
->format
== Format::PSEUDO_BARRIER
) {
423 switch (instr
->opcode
) {
424 case aco_opcode::p_memory_barrier_atomic
:
425 return barrier_atomic
;
426 /* For now, buffer and image barriers are treated the same. this is because of
427 * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
428 * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
429 * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
430 * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
431 * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
432 * Either way, this solution should work. */
433 case aco_opcode::p_memory_barrier_buffer
:
434 case aco_opcode::p_memory_barrier_image
:
435 return (barrier_interaction
)(barrier_image
| barrier_buffer
);
436 case aco_opcode::p_memory_barrier_shared
:
437 return barrier_shared
;
438 case aco_opcode::p_memory_barrier_common
:
439 return (barrier_interaction
)(barrier_image
| barrier_buffer
| barrier_shared
| barrier_atomic
);
440 case aco_opcode::p_memory_barrier_gs_data
:
441 return barrier_gs_data
;
442 case aco_opcode::p_memory_barrier_gs_sendmsg
:
443 return barrier_gs_sendmsg
;
447 } else if (instr
->opcode
== aco_opcode::s_barrier
) {
448 return (barrier_interaction
)(barrier_barrier
| barrier_image
| barrier_buffer
| barrier_shared
| barrier_atomic
);
453 struct hazard_query
{
456 int barrier_interaction
;
457 bool can_reorder_vmem
;
458 bool can_reorder_smem
;
461 void init_hazard_query(hazard_query
*query
) {
462 query
->contains_spill
= false;
464 query
->barrier_interaction
= 0;
465 query
->can_reorder_vmem
= true;
466 query
->can_reorder_smem
= true;
469 void add_to_hazard_query(hazard_query
*query
, Instruction
*instr
)
471 query
->barriers
|= parse_barrier(instr
);
472 query
->barrier_interaction
|= get_barrier_interaction(instr
);
473 if (instr
->opcode
== aco_opcode::p_spill
|| instr
->opcode
== aco_opcode::p_reload
)
474 query
->contains_spill
= true;
476 bool can_reorder_instr
= can_reorder(instr
);
477 query
->can_reorder_smem
&= instr
->format
!= Format::SMEM
|| can_reorder_instr
;
478 query
->can_reorder_vmem
&= !(instr
->isVMEM() || instr
->isFlatOrGlobal()) || can_reorder_instr
;
483 hazard_fail_reorder_vmem_smem
,
484 hazard_fail_reorder_ds
,
485 hazard_fail_reorder_sendmsg
,
489 /* Must stop at these failures. The hazard query code doesn't consider them
495 HazardResult
perform_hazard_query(hazard_query
*query
, Instruction
*instr
)
497 bool can_reorder_candidate
= can_reorder(instr
);
499 if (instr
->opcode
== aco_opcode::p_exit_early_if
)
500 return hazard_fail_exec
;
501 for (const Definition
& def
: instr
->definitions
) {
502 if (def
.isFixed() && def
.physReg() == exec
)
503 return hazard_fail_exec
;
506 /* don't move exports so that they stay closer together */
507 if (instr
->format
== Format::EXP
)
508 return hazard_fail_export
;
510 /* don't move s_memtime/s_memrealtime */
511 if (instr
->opcode
== aco_opcode::s_memtime
|| instr
->opcode
== aco_opcode::s_memrealtime
)
512 return hazard_fail_memtime
;
514 if (query
->barrier_interaction
&& (query
->barrier_interaction
& parse_barrier(instr
)))
515 return hazard_fail_barrier
;
516 if (query
->barriers
&& (query
->barriers
& get_barrier_interaction(instr
)))
517 return hazard_fail_barrier
;
519 if (!query
->can_reorder_smem
&& instr
->format
== Format::SMEM
&& !can_reorder_candidate
)
520 return hazard_fail_reorder_vmem_smem
;
521 if (!query
->can_reorder_vmem
&& (instr
->isVMEM() || instr
->isFlatOrGlobal()) && !can_reorder_candidate
)
522 return hazard_fail_reorder_vmem_smem
;
523 if ((query
->barrier_interaction
& barrier_shared
) && instr
->format
== Format::DS
)
524 return hazard_fail_reorder_ds
;
525 if (is_gs_or_done_sendmsg(instr
) && (query
->barrier_interaction
& get_barrier_interaction(instr
)))
526 return hazard_fail_reorder_sendmsg
;
528 if ((instr
->opcode
== aco_opcode::p_spill
|| instr
->opcode
== aco_opcode::p_reload
) &&
529 query
->contains_spill
)
530 return hazard_fail_spill
;
532 return hazard_success
;
535 void schedule_SMEM(sched_ctx
& ctx
, Block
* block
,
536 std::vector
<RegisterDemand
>& register_demand
,
537 Instruction
* current
, int idx
)
540 int window_size
= SMEM_WINDOW_SIZE
;
541 int max_moves
= SMEM_MAX_MOVES
;
544 /* don't move s_memtime/s_memrealtime */
545 if (current
->opcode
== aco_opcode::s_memtime
|| current
->opcode
== aco_opcode::s_memrealtime
)
548 /* first, check if we have instructions before current to move down */
550 init_hazard_query(&hq
);
551 add_to_hazard_query(&hq
, current
);
553 ctx
.mv
.downwards_init(idx
, false, false);
555 for (int candidate_idx
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
556 assert(candidate_idx
>= 0);
557 assert(candidate_idx
== ctx
.mv
.source_idx
);
558 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
560 /* break if we'd make the previous SMEM instruction stall */
561 bool can_stall_prev_smem
= idx
<= ctx
.last_SMEM_dep_idx
&& candidate_idx
< ctx
.last_SMEM_dep_idx
;
562 if (can_stall_prev_smem
&& ctx
.last_SMEM_stall
>= 0)
565 /* break when encountering another MEM instruction, logical_start or barriers */
566 if (candidate
->opcode
== aco_opcode::p_logical_start
)
568 if (candidate
->isVMEM())
571 bool can_move_down
= true;
573 HazardResult haz
= perform_hazard_query(&hq
, candidate
.get());
574 if (haz
== hazard_fail_reorder_ds
|| haz
== hazard_fail_spill
|| haz
== hazard_fail_reorder_sendmsg
|| haz
== hazard_fail_barrier
)
575 can_move_down
= false;
576 else if (haz
!= hazard_success
)
579 /* don't use LDS/GDS instructions to hide latency since it can
580 * significanly worsen LDS scheduling */
581 if (candidate
->format
== Format::DS
|| !can_move_down
) {
582 add_to_hazard_query(&hq
, candidate
.get());
583 ctx
.mv
.downwards_skip();
587 MoveResult res
= ctx
.mv
.downwards_move(false);
588 if (res
== move_fail_ssa
|| res
== move_fail_rar
) {
589 add_to_hazard_query(&hq
, candidate
.get());
590 ctx
.mv
.downwards_skip();
592 } else if (res
== move_fail_pressure
) {
596 if (candidate_idx
< ctx
.last_SMEM_dep_idx
)
597 ctx
.last_SMEM_stall
++;
601 /* find the first instruction depending on current or find another MEM */
602 ctx
.mv
.upwards_init(idx
+ 1, false);
604 bool found_dependency
= false;
605 /* second, check if we have instructions after current to move up */
606 for (int candidate_idx
= idx
+ 1; k
< max_moves
&& candidate_idx
< (int) idx
+ window_size
; candidate_idx
++) {
607 assert(candidate_idx
== ctx
.mv
.source_idx
);
608 assert(candidate_idx
< (int) block
->instructions
.size());
609 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
611 if (candidate
->opcode
== aco_opcode::p_logical_end
)
614 /* check if candidate depends on current */
615 bool is_dependency
= !found_dependency
&& !ctx
.mv
.upwards_check_deps();
616 /* no need to steal from following VMEM instructions */
617 if (is_dependency
&& candidate
->isVMEM())
620 if (found_dependency
) {
621 HazardResult haz
= perform_hazard_query(&hq
, candidate
.get());
622 if (haz
== hazard_fail_reorder_ds
|| haz
== hazard_fail_spill
||
623 haz
== hazard_fail_reorder_sendmsg
|| haz
== hazard_fail_barrier
)
624 is_dependency
= true;
625 else if (haz
!= hazard_success
)
630 if (!found_dependency
) {
631 ctx
.mv
.upwards_set_insert_idx(candidate_idx
);
632 init_hazard_query(&hq
);
633 found_dependency
= true;
637 if (is_dependency
|| !found_dependency
) {
638 if (found_dependency
)
639 add_to_hazard_query(&hq
, candidate
.get());
642 ctx
.mv
.upwards_skip();
646 MoveResult res
= ctx
.mv
.upwards_move();
647 if (res
== move_fail_ssa
|| res
== move_fail_rar
) {
648 /* no need to steal from following VMEM instructions */
649 if (res
== move_fail_ssa
&& candidate
->isVMEM())
651 add_to_hazard_query(&hq
, candidate
.get());
652 ctx
.mv
.upwards_skip();
654 } else if (res
== move_fail_pressure
) {
660 ctx
.last_SMEM_dep_idx
= found_dependency
? ctx
.mv
.insert_idx
: 0;
661 ctx
.last_SMEM_stall
= 10 - ctx
.num_waves
- k
;
664 void schedule_VMEM(sched_ctx
& ctx
, Block
* block
,
665 std::vector
<RegisterDemand
>& register_demand
,
666 Instruction
* current
, int idx
)
669 int window_size
= VMEM_WINDOW_SIZE
;
670 int max_moves
= VMEM_MAX_MOVES
;
671 int clause_max_grab_dist
= VMEM_CLAUSE_MAX_GRAB_DIST
;
674 /* first, check if we have instructions before current to move down */
675 hazard_query indep_hq
;
676 hazard_query clause_hq
;
677 init_hazard_query(&indep_hq
);
678 init_hazard_query(&clause_hq
);
679 add_to_hazard_query(&indep_hq
, current
);
681 ctx
.mv
.downwards_init(idx
, true, true);
683 for (int candidate_idx
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
684 assert(candidate_idx
== ctx
.mv
.source_idx
);
685 assert(candidate_idx
>= 0);
686 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
687 bool is_vmem
= candidate
->isVMEM() || candidate
->isFlatOrGlobal();
689 /* break when encountering another VMEM instruction, logical_start or barriers */
690 if (candidate
->opcode
== aco_opcode::p_logical_start
)
693 /* break if we'd make the previous SMEM instruction stall */
694 bool can_stall_prev_smem
= idx
<= ctx
.last_SMEM_dep_idx
&& candidate_idx
< ctx
.last_SMEM_dep_idx
;
695 if (can_stall_prev_smem
&& ctx
.last_SMEM_stall
>= 0)
698 bool part_of_clause
= false;
699 if (current
->isVMEM() == candidate
->isVMEM()) {
700 bool same_resource
= true;
701 if (current
->isVMEM())
702 same_resource
= candidate
->operands
[0].tempId() == current
->operands
[0].tempId();
703 int grab_dist
= ctx
.mv
.insert_idx_clause
- candidate_idx
;
704 /* We can't easily tell how much this will decrease the def-to-use
705 * distances, so just use how far it will be moved as a heuristic. */
706 part_of_clause
= same_resource
&& grab_dist
< clause_max_grab_dist
;
709 /* if current depends on candidate, add additional dependencies and continue */
710 bool can_move_down
= !is_vmem
|| part_of_clause
;
712 HazardResult haz
= perform_hazard_query(part_of_clause
? &clause_hq
: &indep_hq
, candidate
.get());
713 if (haz
== hazard_fail_reorder_ds
|| haz
== hazard_fail_spill
||
714 haz
== hazard_fail_reorder_sendmsg
|| haz
== hazard_fail_barrier
)
715 can_move_down
= false;
716 else if (haz
!= hazard_success
)
719 if (!can_move_down
) {
720 add_to_hazard_query(&indep_hq
, candidate
.get());
721 add_to_hazard_query(&clause_hq
, candidate
.get());
722 ctx
.mv
.downwards_skip();
726 MoveResult res
= ctx
.mv
.downwards_move(part_of_clause
);
727 if (res
== move_fail_ssa
|| res
== move_fail_rar
) {
728 add_to_hazard_query(&indep_hq
, candidate
.get());
729 add_to_hazard_query(&clause_hq
, candidate
.get());
730 ctx
.mv
.downwards_skip();
732 } else if (res
== move_fail_pressure
) {
736 if (candidate_idx
< ctx
.last_SMEM_dep_idx
)
737 ctx
.last_SMEM_stall
++;
740 /* find the first instruction depending on current or find another VMEM */
741 ctx
.mv
.upwards_init(idx
+ 1, true);
743 bool found_dependency
= false;
744 /* second, check if we have instructions after current to move up */
745 for (int candidate_idx
= idx
+ 1; k
< max_moves
&& candidate_idx
< (int) idx
+ window_size
; candidate_idx
++) {
746 assert(candidate_idx
== ctx
.mv
.source_idx
);
747 assert(candidate_idx
< (int) block
->instructions
.size());
748 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
749 bool is_vmem
= candidate
->isVMEM() || candidate
->isFlatOrGlobal();
751 if (candidate
->opcode
== aco_opcode::p_logical_end
)
754 /* check if candidate depends on current */
755 bool is_dependency
= false;
756 if (found_dependency
) {
757 HazardResult haz
= perform_hazard_query(&indep_hq
, candidate
.get());
758 if (haz
== hazard_fail_reorder_ds
|| haz
== hazard_fail_spill
||
759 haz
== hazard_fail_reorder_vmem_smem
|| haz
== hazard_fail_reorder_sendmsg
||
760 haz
== hazard_fail_barrier
)
761 is_dependency
= true;
762 else if (haz
!= hazard_success
)
766 is_dependency
|= !found_dependency
&& !ctx
.mv
.upwards_check_deps();
768 if (!found_dependency
) {
769 ctx
.mv
.upwards_set_insert_idx(candidate_idx
);
770 init_hazard_query(&indep_hq
);
771 found_dependency
= true;
773 } else if (is_vmem
) {
774 /* don't move up dependencies of other VMEM instructions */
775 for (const Definition
& def
: candidate
->definitions
) {
777 ctx
.mv
.depends_on
[def
.tempId()] = true;
781 if (is_dependency
|| !found_dependency
) {
782 if (found_dependency
)
783 add_to_hazard_query(&indep_hq
, candidate
.get());
784 ctx
.mv
.upwards_skip();
788 MoveResult res
= ctx
.mv
.upwards_move();
789 if (res
== move_fail_ssa
|| res
== move_fail_rar
) {
790 add_to_hazard_query(&indep_hq
, candidate
.get());
791 ctx
.mv
.upwards_skip();
793 } else if (res
== move_fail_pressure
) {
800 void schedule_position_export(sched_ctx
& ctx
, Block
* block
,
801 std::vector
<RegisterDemand
>& register_demand
,
802 Instruction
* current
, int idx
)
805 int window_size
= POS_EXP_WINDOW_SIZE
;
806 int max_moves
= POS_EXP_MAX_MOVES
;
809 ctx
.mv
.downwards_init(idx
, true, false);
812 init_hazard_query(&hq
);
813 add_to_hazard_query(&hq
, current
);
815 for (int candidate_idx
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
816 assert(candidate_idx
>= 0);
817 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
819 if (candidate
->opcode
== aco_opcode::p_logical_start
)
821 if (candidate
->isVMEM() || candidate
->format
== Format::SMEM
|| candidate
->isFlatOrGlobal())
824 HazardResult haz
= perform_hazard_query(&hq
, candidate
.get());
825 if (haz
== hazard_fail_exec
|| haz
== hazard_fail_export
|| haz
== hazard_fail_memtime
)
828 if (haz
!= hazard_success
) {
829 add_to_hazard_query(&hq
, candidate
.get());
830 ctx
.mv
.downwards_skip();
834 MoveResult res
= ctx
.mv
.downwards_move(false);
835 if (res
== move_fail_ssa
|| res
== move_fail_rar
) {
836 add_to_hazard_query(&hq
, candidate
.get());
837 ctx
.mv
.downwards_skip();
839 } else if (res
== move_fail_pressure
) {
846 void schedule_block(sched_ctx
& ctx
, Program
*program
, Block
* block
, live
& live_vars
)
848 ctx
.last_SMEM_dep_idx
= 0;
849 ctx
.last_SMEM_stall
= INT16_MIN
;
850 ctx
.mv
.block
= block
;
851 ctx
.mv
.register_demand
= live_vars
.register_demand
[block
->index
].data();
853 /* go through all instructions and find memory loads */
854 for (unsigned idx
= 0; idx
< block
->instructions
.size(); idx
++) {
855 Instruction
* current
= block
->instructions
[idx
].get();
857 if (current
->definitions
.empty())
860 if (current
->isVMEM() || current
->isFlatOrGlobal()) {
861 ctx
.mv
.current
= current
;
862 schedule_VMEM(ctx
, block
, live_vars
.register_demand
[block
->index
], current
, idx
);
865 if (current
->format
== Format::SMEM
) {
866 ctx
.mv
.current
= current
;
867 schedule_SMEM(ctx
, block
, live_vars
.register_demand
[block
->index
], current
, idx
);
871 if ((program
->stage
& hw_vs
) && block
->index
== program
->blocks
.size() - 1) {
872 /* Try to move position exports as far up as possible, to reduce register
873 * usage and because ISA reference guides say so. */
874 for (unsigned idx
= 0; idx
< block
->instructions
.size(); idx
++) {
875 Instruction
* current
= block
->instructions
[idx
].get();
877 if (current
->format
== Format::EXP
) {
878 unsigned target
= static_cast<Export_instruction
*>(current
)->dest
;
879 if (target
>= V_008DFC_SQ_EXP_POS
&& target
< V_008DFC_SQ_EXP_PARAM
) {
880 ctx
.mv
.current
= current
;
881 schedule_position_export(ctx
, block
, live_vars
.register_demand
[block
->index
], current
, idx
);
887 /* resummarize the block's register demand */
888 block
->register_demand
= RegisterDemand();
889 for (unsigned idx
= 0; idx
< block
->instructions
.size(); idx
++) {
890 block
->register_demand
.update(live_vars
.register_demand
[block
->index
][idx
]);
895 void schedule_program(Program
*program
, live
& live_vars
)
898 ctx
.mv
.depends_on
.resize(program
->peekAllocationId());
899 ctx
.mv
.RAR_dependencies
.resize(program
->peekAllocationId());
900 ctx
.mv
.RAR_dependencies_clause
.resize(program
->peekAllocationId());
901 /* Allowing the scheduler to reduce the number of waves to as low as 5
902 * improves performance of Thrones of Britannia significantly and doesn't
903 * seem to hurt anything else. */
904 if (program
->num_waves
<= 5)
905 ctx
.num_waves
= program
->num_waves
;
906 else if (program
->max_reg_demand
.vgpr
>= 32)
908 else if (program
->max_reg_demand
.vgpr
>= 28)
910 else if (program
->max_reg_demand
.vgpr
>= 24)
914 ctx
.num_waves
= std::max
<uint16_t>(ctx
.num_waves
, program
->min_waves
);
916 assert(ctx
.num_waves
> 0 && ctx
.num_waves
<= program
->num_waves
);
917 ctx
.mv
.max_registers
= { int16_t(get_addr_vgpr_from_waves(program
, ctx
.num_waves
) - 2),
918 int16_t(get_addr_sgpr_from_waves(program
, ctx
.num_waves
))};
920 for (Block
& block
: program
->blocks
)
921 schedule_block(ctx
, program
, &block
, live_vars
);
923 /* update max_reg_demand and num_waves */
924 RegisterDemand new_demand
;
925 for (Block
& block
: program
->blocks
) {
926 new_demand
.update(block
.register_demand
);
928 update_vgpr_sgpr_demand(program
, new_demand
);
930 /* if enabled, this code asserts that register_demand is updated correctly */
932 int prev_num_waves
= program
->num_waves
;
933 const RegisterDemand prev_max_demand
= program
->max_reg_demand
;
935 std::vector
<RegisterDemand
> demands(program
->blocks
.size());
936 for (unsigned j
= 0; j
< program
->blocks
.size(); j
++) {
937 demands
[j
] = program
->blocks
[j
].register_demand
;
940 struct radv_nir_compiler_options options
;
941 options
.chip_class
= program
->chip_class
;
942 live live_vars2
= aco::live_var_analysis(program
, &options
);
944 for (unsigned j
= 0; j
< program
->blocks
.size(); j
++) {
945 Block
&b
= program
->blocks
[j
];
946 for (unsigned i
= 0; i
< b
.instructions
.size(); i
++)
947 assert(live_vars
.register_demand
[b
.index
][i
] == live_vars2
.register_demand
[b
.index
][i
]);
948 assert(b
.register_demand
== demands
[j
]);
951 assert(program
->max_reg_demand
== prev_max_demand
);
952 assert(program
->num_waves
== prev_num_waves
);