/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
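
/* Illustration (not part of the heuristics themselves): at ctx.num_waves == 8 the SMEM
 * window is 350 - 8 * 35 = 70 instructions with a budget of 64 - 8 * 4 = 32 moves, while
 * at ctx.num_waves == 5 the window grows to 175 and the budget to 44. The clause grab
 * distance moves the other way: (5 - 1) * 8 = 32 vs. (8 - 1) * 8 = 56, i.e. fewer waves
 * mean more aggressive latency hiding but less aggressive clause formation. */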
struct sched_ctx {
   std::vector<bool> depends_on;
   std::vector<bool> RAR_dependencies;
   /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the
    * instructions in the clause, since new instructions in the clause are not
    * moved past any other instructions in the clause. */
   std::vector<bool> new_RAR_dependencies;

   RegisterDemand max_registers;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};
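
/* Note: the three dependency bitsets are indexed by SSA temp id; schedule_program()
 * resizes them to program->peekAllocationId() so that every temporary has a slot. */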
/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * from Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered it tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler cares for the current register pressure:
 * Instructions will only be moved if the register pressure won't exceed a certain bound.
 */
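
/* A sketch of the effect with a made-up sequence (instruction names are illustrative only):
 *
 *    v_mul_f32      v2, v0, v1      ; independent of the load
 *    s_load_dwordx4 s[4:7], ...     ; "current" memory instruction
 *    s_add_u32      s8, s4, s5      ; first user of the load
 *    v_add_f32      v3, v2, v2      ; independent of the load
 *
 * The downwards pass moves v_mul_f32 to just after the load (so the load effectively
 * issues earlier), and the upwards pass pulls v_add_f32 up above s_add_u32; both
 * independent instructions end up between the load and its first user, hiding latency. */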
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
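
/* Usage note (example values chosen here for illustration): with a list {a, b, c, d, e},
 * move_element(list, 4, 1) yields {a, e, b, c, d} -- the element at idx is rotated up to
 * position before -- while move_element(list, 1, 4) yields {a, c, d, b, e} -- the element
 * is rotated down to position before - 1, i.e. just in front of the insertion point. */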
static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}
static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         continue;
      temp_registers += def.getTemp();
   }
   return temp_registers;
}
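
/* The two helpers above split an instruction's effect on register demand: getLiveChanges()
 * is the net change across the instruction (definitions that stay live minus operands it
 * kills), while getTempRegisters() is the demand of definitions that are immediately dead,
 * i.e. registers only needed at the instruction itself. The schedulers below combine both
 * to recompute the demand at the insertion point after a move. */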
static bool is_spill_reload(aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
}
bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}
bool is_gs_or_done_sendmsg(Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}
bool is_done_sendmsg(Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}
barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   case Format::SOPP:
      if (is_done_sendmsg(instr))
         return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
      else if (is_gs_or_done_sendmsg(instr))
         return barrier_gs_sendmsg;
      else
         return barrier_none;
   default:
      return barrier_none;
   }
}
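
/* get_barrier_interaction() classifies which memory "domains" (buffer, image, shared,
 * atomic, GS data/sendmsg) an instruction touches; can_move_instr() below tests these
 * bits against barrier instructions to decide whether a candidate may be moved across
 * the barrier. */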
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         return can_reorder(current) && moving_interaction == barrier_none;
      } else if (is_gs_or_done_sendmsg(instr.get())) {
         int interaction = get_barrier_interaction(current);
         interaction |= moving_interaction;
         return !(interaction & get_barrier_interaction(instr.get()));
      } else {
         return true;
      }
   }

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. This is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
    * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_common:
      return !(interaction & (barrier_image | barrier_buffer | barrier_shared | barrier_atomic));
   case aco_opcode::p_memory_barrier_gs_data:
      return !(interaction & barrier_gs_data);
   case aco_opcode::p_memory_barrier_gs_sendmsg:
      return !(interaction & barrier_gs_sendmsg);
   default:
      return false;
   }
}
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder_candidate && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      if (candidate->isVMEM())
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;
   can_reorder_cur = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec;});
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()];});
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder_candidate && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency) {
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         if (candidate->isVMEM())
            break;
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
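
/* A rough reading of the bookkeeping above (interpretation, not from the original comments):
 * k counts how many instructions were moved in between, so last_SMEM_stall = 10 - num_waves - k
 * goes negative once enough waves and in-between work exist to hide the SMEM latency; the
 * can_stall_prev_smem checks refuse later moves that would steal that work back from the
 * previous SMEM load. */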
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;
   /* initially true as we don't pull other VMEM instructions
    * through the current instruction */
   bool can_reorder_vmem = true;
   bool can_reorder_smem = true;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   std::fill(ctx.new_RAR_dependencies.begin(), ctx.new_RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure_indep = register_demand[idx];
   RegisterDemand register_pressure_clause = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int indep_insert_idx = idx + 1;
   int clause_insert_idx = idx;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure_indep.update(register_demand[candidate_idx]);

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[0].tempId() == current->operands[0].tempId();
         bool can_reorder = can_reorder_vmem || can_reorder_candidate;
         int grab_dist = clause_insert_idx - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      if (part_of_clause) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
      }

      bool register_pressure_unknown = false;
      std::vector<bool>& RAR_deps = part_of_clause ? ctx.new_RAR_dependencies : ctx.RAR_dependencies;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && RAR_deps[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      int insert_idx = part_of_clause ? clause_insert_idx : indep_insert_idx;
      RegisterDemand register_pressure = part_of_clause ? register_pressure_clause : register_pressure_indep;

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure_clause -= candidate_diff;
      clause_insert_idx--;
      if (!part_of_clause) {
         register_pressure_indep -= candidate_diff;
         indep_insert_idx--;
      }
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   RegisterDemand register_pressure;
   int insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;
   // TODO: differentiate between loads and stores (load-load can always reorder)
   can_reorder_vmem = true;
   can_reorder_smem = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (candidate->format == Format::SMEM)
         is_dependency = !can_reorder_smem && !can_reorder_candidate;
      if (is_vmem)
         is_dependency = !can_reorder_vmem && !can_reorder_candidate;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         /* update flag whether we can reorder other memory instructions */
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;

         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}
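
/* Clause formation note: in the downwards pass above, a candidate VMEM access to the same
 * resource (same descriptor temporary in operand 0) may be grabbed into a clause directly
 * in front of the current instruction (clause_insert_idx) instead of being moved past it
 * (indep_insert_idx); new_RAR_dependencies exists so that clause members are not reordered
 * against each other. */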
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}
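
/* schedule_block() keeps live_vars.register_demand in sync while instructions move, so the
 * per-block register_demand recomputed in the loop above stays valid for the final
 * update_vgpr_sgpr_demand() call in schedule_program(). */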
void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   ctx.new_RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                         int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}