2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26 #include <unordered_set>
29 #include "vulkan/radv_shader.h" // for radv_nir_compiler_options
30 #include "amdgfxregs.h"
/* Scheduling heuristics. Each *_WINDOW_SIZE bounds how far the scheduler scans
 * around a memory instruction, and each *_MAX_MOVES bounds how many
 * instructions it may move per scheduled instruction. The macros expand `ctx`,
 * so they are only usable where a scheduler context named `ctx` is in scope.
 * Fewer waves mean more registers per wave, so the budgets grow as
 * ctx.num_waves shrinks. (The leading numbers on each line are original-file
 * line numbers left over from extraction.) */
32 #define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
33 #define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
34 #define POS_EXP_WINDOW_SIZE 512
35 #define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
36 #define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
37 /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
38 #define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
39 #define POS_EXP_MAX_MOVES 512
/* Members of the scheduler context (the enclosing `struct sched_ctx {` line is
 * not visible in this extraction). The three vectors are bitsets indexed by
 * temp id — see the `[op.tempId()]` accesses in the schedule_* functions. */
/* Temp ids the instruction being scheduled (transitively) depends on. */
44 std::vector
<bool> depends_on
;
/* Temp ids whose read-after-read relationship must be preserved so that
 * first-kill bookkeeping stays valid while moving instructions. */
45 std::vector
<bool> RAR_dependencies
;
46 /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the
47 * instructions in the clause, since new instructions in the clause are not
48 * moved past any other instructions in the clause. */
49 std::vector
<bool> new_RAR_dependencies
;
/* Upper bound on RegisterDemand the scheduler may create (set from the target
 * wave count — see the .exceeds(ctx.max_registers) checks). */
51 RegisterDemand max_registers
;
/* Stall budget for the most recent SMEM load; incremented as instructions are
 * moved past it, compared against 0 to stop moves that would stall it. */
53 int16_t last_SMEM_stall
;
/* Instruction index of the first user of the most recent SMEM load. */
54 int last_SMEM_dep_idx
;
57 /* This scheduler is a simple bottom-up pass based on ideas from
58 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
59 * from Xiaohua Shi and Peng Guo.
60 * The basic approach is to iterate over all instructions. When a memory instruction
61 * is encountered it tries to move independent instructions from above and below
62 * between the memory instruction and its first user.
63 * The novelty is that this scheduler cares for the current register pressure:
64 * Instructions will only be moved if the register pressure won't exceed a certain bound.
/* Moves list[idx] so it ends up at position `before`, shifting the elements in
 * between by one, via std::rotate on the affected sub-range only (downward:
 * rotate [idx, before) left by one; upward: rotate [before, idx] right by one).
 * NOTE(review): the extraction dropped interior lines here — the first
 * branch's `if (idx < before) {` header and the closing braces are missing;
 * the `} else if (idx > before)` line below implies that structure. */
68 void move_element(T
& list
, size_t idx
, size_t before
) {
70 auto begin
= std::next(list
.begin(), idx
);
71 auto end
= std::next(list
.begin(), before
);
72 std::rotate(begin
, begin
+ 1, end
);
73 } else if (idx
> before
) {
74 auto begin
= std::next(list
.begin(), before
);
75 auto end
= std::next(list
.begin(), idx
+ 1);
76 std::rotate(begin
, end
- 1, end
);
/* Computes the net RegisterDemand change caused by `instr`: demand of temp
 * definitions that are not killed is added, and demand of temp operands whose
 * first kill this is (their register presumably becomes free here — confirm
 * against RegisterDemand semantics) is subtracted.
 * NOTE(review): the extraction dropped the `continue;` lines after each filter
 * condition, the braces, and the final `return changes;`. */
80 static RegisterDemand
getLiveChanges(aco_ptr
<Instruction
>& instr
)
82 RegisterDemand changes
;
83 for (const Definition
& def
: instr
->definitions
) {
/* skip non-temp or killed definitions (dropped `continue;` follows) */
84 if (!def
.isTemp() || def
.isKill())
86 changes
+= def
.getTemp();
89 for (const Operand
& op
: instr
->operands
) {
/* only first-kill temp operands reduce demand (dropped `continue;` follows) */
90 if (!op
.isTemp() || !op
.isFirstKill())
92 changes
-= op
.getTemp();
/* Sums the demand of temp definitions of `instr` that are killed — i.e.
 * registers presumably live only at this instruction itself (TODO confirm).
 * Used by the schedule_* functions to recompute demand at an insertion point.
 * NOTE(review): the extraction dropped the `continue;` after the filter
 * condition and the surrounding braces. */
98 static RegisterDemand
getTempRegisters(aco_ptr
<Instruction
>& instr
)
100 RegisterDemand temp_registers
;
101 for (const Definition
& def
: instr
->definitions
) {
/* skip non-temp or non-killed definitions (dropped `continue;` follows) */
102 if (!def
.isTemp() || !def
.isKill())
104 temp_registers
+= def
.getTemp();
106 return temp_registers
;
/* True iff `instr` is a spill or reload pseudo-instruction; the schedulers use
 * this to avoid reordering spill code against other spill code. */
109 static bool is_spill_reload(aco_ptr
<Instruction
>& instr
)
111 return instr
->opcode
== aco_opcode::p_spill
|| instr
->opcode
== aco_opcode::p_reload
;
/* Decides whether `instr` may legally be moved across `current` while
 * `moving_interaction` accumulates the barrier classes of everything already
 * being moved. Exports and s_memtime/s_memrealtime are never moved; s_barrier
 * may be crossed only if `current` is a reorderable memory op and nothing
 * barrier-interacting is in flight; memory-barrier pseudo-ops may be crossed
 * only if neither `current` nor the moved set interacts with that barrier's
 * class. NOTE(review): the extraction dropped interior lines — the `return`
 * statements after the early-out conditions, several `case Format::...:`
 * labels in the switch (SMEM/MUBUF/MIMG/FLAT/GLOBAL are implied by the casts),
 * `break;` lines, and braces. */
114 bool can_move_instr(aco_ptr
<Instruction
>& instr
, Instruction
* current
, int moving_interaction
)
116 /* don't move exports so that they stay closer together */
117 if (instr
->format
== Format::EXP
)
120 /* don't move s_memtime/s_memrealtime */
121 if (instr
->opcode
== aco_opcode::s_memtime
|| instr
->opcode
== aco_opcode::s_memrealtime
)
124 /* handle barriers */
126 /* TODO: instead of stopping, maybe try to move the barriers and any
127 * instructions interacting with them instead? */
128 if (instr
->format
!= Format::PSEUDO_BARRIER
) {
129 if (instr
->opcode
== aco_opcode::s_barrier
) {
/* whether `current` (a memory op) is marked reorderable by its format */
130 bool can_reorder
= false;
131 switch (current
->format
) {
133 can_reorder
= static_cast<SMEM_instruction
*>(current
)->can_reorder
;
136 can_reorder
= static_cast<MUBUF_instruction
*>(current
)->can_reorder
;
139 can_reorder
= static_cast<MIMG_instruction
*>(current
)->can_reorder
;
143 case Format::SCRATCH
:
144 can_reorder
= static_cast<FLAT_instruction
*>(current
)->can_reorder
;
149 return can_reorder
&& moving_interaction
== barrier_none
;
/* combine the barrier classes of `current` and the already-moved set */
155 int interaction
= get_barrier_interaction(current
);
156 interaction
|= moving_interaction
;
158 switch (instr
->opcode
) {
159 case aco_opcode::p_memory_barrier_atomic
:
160 return !(interaction
& barrier_atomic
);
161 /* For now, buffer and image barriers are treated the same. this is because of
162 * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
163 * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
164 * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
165 * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
166 * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
167 * Either way, this solution should work. */
168 case aco_opcode::p_memory_barrier_buffer
:
169 case aco_opcode::p_memory_barrier_image
:
170 return !(interaction
& (barrier_image
| barrier_buffer
));
171 case aco_opcode::p_memory_barrier_shared
:
172 return !(interaction
& barrier_shared
);
173 case aco_opcode::p_memory_barrier_all
:
174 return interaction
== barrier_none
;
/* Returns the per-instruction `can_reorder` flag for memory-format
 * instructions (the casts imply SMEM/MUBUF/MIMG/MTBUF and the FLAT family for
 * FLAT/SCRATCH/GLOBAL). NOTE(review): the extraction dropped the `case` labels
 * preceding each cast and the default return for non-memory formats. */
180 bool can_reorder(Instruction
* candidate
)
182 switch (candidate
->format
) {
184 return static_cast<SMEM_instruction
*>(candidate
)->can_reorder
;
186 return static_cast<MUBUF_instruction
*>(candidate
)->can_reorder
;
188 return static_cast<MIMG_instruction
*>(candidate
)->can_reorder
;
190 return static_cast<MTBUF_instruction
*>(candidate
)->can_reorder
;
193 case Format::SCRATCH
:
194 return static_cast<FLAT_instruction
*>(candidate
)->can_reorder
;
/* Schedules around one SMEM instruction `current` at index `idx` of `block`:
 * phase 1 walks upward and moves independent instructions down below
 * `current`; phase 2 walks downward and moves independent instructions up
 * above `current`'s first dependency (insert_idx). Both phases respect the
 * SMEM_WINDOW_SIZE / SMEM_MAX_MOVES budgets and refuse any move that would
 * push RegisterDemand past ctx.max_registers. `register_demand` holds one
 * entry per instruction of the block and is kept in sync with every move.
 * Finally updates ctx.last_SMEM_dep_idx / last_SMEM_stall for the next SMEM.
 * NOTE(review): the extraction dropped interior lines throughout — braces,
 * `break;`/`continue;` lines after the guard conditions, the declaration and
 * increment of the move counter `k` used in the loop conditions, and some
 * `if` bodies (e.g. the handling when a definition writes exec). */
200 void schedule_SMEM(sched_ctx
& ctx
, Block
* block
,
201 std::vector
<RegisterDemand
>& register_demand
,
202 Instruction
* current
, int idx
)
205 int window_size
= SMEM_WINDOW_SIZE
;
206 int max_moves
= SMEM_MAX_MOVES
;
208 bool can_reorder_cur
= can_reorder(current
);
210 /* don't move s_memtime/s_memrealtime */
211 if (current
->opcode
== aco_opcode::s_memtime
|| current
->opcode
== aco_opcode::s_memrealtime
)
214 /* create the initial set of values which current depends on */
215 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
216 for (const Operand
& op
: current
->operands
) {
218 ctx
.depends_on
[op
.tempId()] = true;
221 /* maintain how many registers remain free when moving instructions */
222 RegisterDemand register_pressure
= register_demand
[idx
];
224 /* first, check if we have instructions before current to move down */
225 int insert_idx
= idx
+ 1;
226 int moving_interaction
= barrier_none
;
227 bool moving_spill
= false;
/* phase 1: scan upward; `k` (declaration dropped by extraction) counts moves */
229 for (int candidate_idx
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
230 assert(candidate_idx
>= 0);
231 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
232 bool can_reorder_candidate
= can_reorder(candidate
.get());
234 /* break if we'd make the previous SMEM instruction stall */
235 bool can_stall_prev_smem
= idx
<= ctx
.last_SMEM_dep_idx
&& candidate_idx
< ctx
.last_SMEM_dep_idx
;
236 if (can_stall_prev_smem
&& ctx
.last_SMEM_stall
>= 0)
239 /* break when encountering another MEM instruction, logical_start or barriers */
240 if (!can_reorder_candidate
&& !can_reorder_cur
)
242 if (candidate
->opcode
== aco_opcode::p_logical_start
)
244 if (candidate
->opcode
== aco_opcode::p_exit_early_if
)
246 if (!can_move_instr(candidate
, current
, moving_interaction
))
248 if (candidate
->isVMEM())
250 register_pressure
.update(register_demand
[candidate_idx
]);
252 /* if current depends on candidate, add additional dependencies and continue */
253 bool can_move_down
= true;
254 bool writes_exec
= false;
255 for (const Definition
& def
: candidate
->definitions
) {
256 if (def
.isTemp() && ctx
.depends_on
[def
.tempId()])
257 can_move_down
= false;
258 if (def
.isFixed() && def
.physReg() == exec
)
264 if (moving_spill
&& is_spill_reload(candidate
))
265 can_move_down
= false;
266 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
267 can_move_down
= false;
268 moving_interaction
|= get_barrier_interaction(candidate
.get());
269 moving_spill
|= is_spill_reload(candidate
);
/* blocked: fold candidate's operands into the dependency set and continue */
270 if (!can_move_down
) {
271 for (const Operand
& op
: candidate
->operands
) {
273 ctx
.depends_on
[op
.tempId()] = true;
275 can_reorder_cur
&= can_reorder_candidate
;
279 bool register_pressure_unknown
= false;
280 /* check if one of candidate's operands is killed by depending instruction */
281 for (const Operand
& op
: candidate
->operands
) {
282 if (op
.isTemp() && ctx
.depends_on
[op
.tempId()]) {
283 // FIXME: account for difference in register pressure
284 register_pressure_unknown
= true;
287 if (register_pressure_unknown
) {
288 for (const Operand
& op
: candidate
->operands
) {
290 ctx
.depends_on
[op
.tempId()] = true;
292 can_reorder_cur
&= can_reorder_candidate
;
296 /* check if register pressure is low enough: the diff is negative if register pressure is increased */
297 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
298 const RegisterDemand tempDemand
= getTempRegisters(candidate
);
299 if (RegisterDemand(register_pressure
- candidate_diff
).exceeds(ctx
.max_registers
))
301 const RegisterDemand tempDemand2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
302 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - tempDemand2
+ tempDemand
;
303 if (new_demand
.exceeds(ctx
.max_registers
))
305 // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure
307 /* move the candidate below the memory load */
308 move_element(block
->instructions
, candidate_idx
, insert_idx
);
310 /* update register pressure */
311 move_element(register_demand
, candidate_idx
, insert_idx
);
312 for (int i
= candidate_idx
; i
< insert_idx
- 1; i
++) {
313 register_demand
[i
] -= candidate_diff
;
315 register_demand
[insert_idx
- 1] = new_demand
;
316 register_pressure
-= candidate_diff
;
/* a move past the previous SMEM's first user costs one stall credit */
318 if (candidate_idx
< ctx
.last_SMEM_dep_idx
)
319 ctx
.last_SMEM_stall
++;
324 /* create the initial set of values which depend on current */
325 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
326 std::fill(ctx
.RAR_dependencies
.begin(), ctx
.RAR_dependencies
.end(), false);
327 for (const Definition
& def
: current
->definitions
) {
329 ctx
.depends_on
[def
.tempId()] = true;
332 /* find the first instruction depending on current or find another MEM */
333 insert_idx
= idx
+ 1;
334 moving_interaction
= barrier_none
;
335 moving_spill
= false;
336 can_reorder_cur
= true;
338 bool found_dependency
= false;
339 /* second, check if we have instructions after current to move up */
340 for (int candidate_idx
= idx
+ 1; k
< max_moves
&& candidate_idx
< (int) idx
+ window_size
; candidate_idx
++) {
341 assert(candidate_idx
< (int) block
->instructions
.size());
342 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
343 bool can_reorder_candidate
= can_reorder(candidate
.get());
345 if (candidate
->opcode
== aco_opcode::p_logical_end
)
347 if (!can_move_instr(candidate
, current
, moving_interaction
))
350 const bool writes_exec
= std::any_of(candidate
->definitions
.begin(), candidate
->definitions
.end(),
351 [](const Definition
& def
) { return def
.isFixed() && def
.physReg() == exec
;});
355 /* check if candidate depends on current */
356 bool is_dependency
= std::any_of(candidate
->operands
.begin(), candidate
->operands
.end(),
357 [&ctx
](const Operand
& op
) { return op
.isTemp() && ctx
.depends_on
[op
.tempId()];});
358 /* no need to steal from following VMEM instructions */
359 if (is_dependency
&& candidate
->isVMEM())
361 if (moving_spill
&& is_spill_reload(candidate
))
362 is_dependency
= true;
363 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
364 is_dependency
= true;
365 moving_interaction
|= get_barrier_interaction(candidate
.get());
366 moving_spill
|= is_spill_reload(candidate
);
/* dependency: extend the dependency/RAR sets; the first one fixes insert_idx */
368 for (const Definition
& def
: candidate
->definitions
) {
370 ctx
.depends_on
[def
.tempId()] = true;
372 for (const Operand
& op
: candidate
->operands
) {
374 ctx
.RAR_dependencies
[op
.tempId()] = true;
376 if (!found_dependency
) {
377 insert_idx
= candidate_idx
;
378 found_dependency
= true;
379 /* init register pressure */
380 register_pressure
= register_demand
[insert_idx
- 1];
384 if (!can_reorder_candidate
&& !can_reorder_cur
)
387 if (!found_dependency
) {
392 /* update register pressure */
393 register_pressure
.update(register_demand
[candidate_idx
- 1]);
396 can_reorder_cur
&= can_reorder_candidate
;
399 assert(insert_idx
!= idx
);
401 // TODO: correctly calculate register pressure for this case
402 bool register_pressure_unknown
= false;
403 /* check if candidate uses/kills an operand which is used by a dependency */
404 for (const Operand
& op
: candidate
->operands
) {
405 if (op
.isTemp() && ctx
.RAR_dependencies
[op
.tempId()])
406 register_pressure_unknown
= true;
408 if (register_pressure_unknown
) {
409 if (candidate
->isVMEM())
411 for (const Definition
& def
: candidate
->definitions
) {
413 ctx
.RAR_dependencies
[def
.tempId()] = true;
415 for (const Operand
& op
: candidate
->operands
) {
417 ctx
.RAR_dependencies
[op
.tempId()] = true;
419 can_reorder_cur
&= can_reorder_candidate
;
423 /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
424 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
425 const RegisterDemand temp
= getTempRegisters(candidate
);
426 if (RegisterDemand(register_pressure
+ candidate_diff
).exceeds(ctx
.max_registers
))
428 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
429 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - temp2
+ candidate_diff
+ temp
;
430 if (new_demand
.exceeds(ctx
.max_registers
))
433 /* move the candidate above the insert_idx */
434 move_element(block
->instructions
, candidate_idx
, insert_idx
);
436 /* update register pressure */
437 move_element(register_demand
, candidate_idx
, insert_idx
);
438 for (int i
= insert_idx
+ 1; i
<= candidate_idx
; i
++) {
439 register_demand
[i
] += candidate_diff
;
441 register_demand
[insert_idx
] = new_demand
;
442 register_pressure
+= candidate_diff
;
/* record where this SMEM's first user ended up, and how many more moves the
 * next scheduling step may make before this load would stall */
447 ctx
.last_SMEM_dep_idx
= found_dependency
? insert_idx
: 0;
448 ctx
.last_SMEM_stall
= 10 - ctx
.num_waves
- k
;
/* Schedules around one VMEM/FLAT/GLOBAL instruction `current` at index `idx`:
 * like schedule_SMEM (move independents down below `current`, then up above
 * its first dependency) but additionally tries to build clauses — other VMEM
 * instructions on the same resource (operand 1) within
 * VMEM_CLAUSE_MAX_GRAB_DIST are pulled down to clause_insert_idx so they
 * execute back-to-back. Clause members use the separate
 * ctx.new_RAR_dependencies set and clause-specific pressure tracking.
 * `register_demand` (one entry per instruction) is kept in sync with moves.
 * NOTE(review): the extraction dropped interior lines throughout — braces,
 * `break;`/`continue;` lines after guard conditions, the declaration and
 * increment of the move counter `k`, and some `if` bodies (e.g. exec-write
 * handling). The embedded numbers are original-file line numbers. */
451 void schedule_VMEM(sched_ctx
& ctx
, Block
* block
,
452 std::vector
<RegisterDemand
>& register_demand
,
453 Instruction
* current
, int idx
)
456 int window_size
= VMEM_WINDOW_SIZE
;
457 int max_moves
= VMEM_MAX_MOVES
;
458 int clause_max_grab_dist
= VMEM_CLAUSE_MAX_GRAB_DIST
;
460 /* initially true as we don't pull other VMEM instructions
461 * through the current instruction */
462 bool can_reorder_vmem
= true;
463 bool can_reorder_smem
= true;
465 /* create the initial set of values which current depends on */
466 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
467 std::fill(ctx
.RAR_dependencies
.begin(), ctx
.RAR_dependencies
.end(), false);
468 std::fill(ctx
.new_RAR_dependencies
.begin(), ctx
.new_RAR_dependencies
.end(), false);
469 for (const Operand
& op
: current
->operands
) {
471 ctx
.depends_on
[op
.tempId()] = true;
472 if (op
.isFirstKill())
473 ctx
.RAR_dependencies
[op
.tempId()] = true;
477 /* maintain how many registers remain free when moving instructions */
478 RegisterDemand register_pressure_indep
= register_demand
[idx
];
479 RegisterDemand register_pressure_clause
= register_demand
[idx
];
481 /* first, check if we have instructions before current to move down */
482 int indep_insert_idx
= idx
+ 1;
483 int clause_insert_idx
= idx
;
484 int moving_interaction
= barrier_none
;
485 bool moving_spill
= false;
/* phase 1: scan upward; `k` (declaration dropped by extraction) counts moves */
487 for (int candidate_idx
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
488 assert(candidate_idx
>= 0);
489 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
490 bool can_reorder_candidate
= can_reorder(candidate
.get());
491 bool is_vmem
= candidate
->isVMEM() || candidate
->isFlatOrGlobal();
493 /* break when encountering another VMEM instruction, logical_start or barriers */
494 if (!can_reorder_smem
&& candidate
->format
== Format::SMEM
&& !can_reorder_candidate
)
496 if (candidate
->opcode
== aco_opcode::p_logical_start
)
498 if (candidate
->opcode
== aco_opcode::p_exit_early_if
)
500 if (!can_move_instr(candidate
, current
, moving_interaction
))
503 /* break if we'd make the previous SMEM instruction stall */
504 bool can_stall_prev_smem
= idx
<= ctx
.last_SMEM_dep_idx
&& candidate_idx
< ctx
.last_SMEM_dep_idx
;
505 if (can_stall_prev_smem
&& ctx
.last_SMEM_stall
>= 0)
507 register_pressure_indep
.update(register_demand
[candidate_idx
]);
/* clause test: same kind of access on the same resource, close enough to grab */
509 bool part_of_clause
= false;
510 if (current
->isVMEM() == candidate
->isVMEM()) {
511 bool same_resource
= true;
512 if (current
->isVMEM())
513 same_resource
= candidate
->operands
[1].tempId() == current
->operands
[1].tempId();
514 bool can_reorder
= can_reorder_vmem
|| can_reorder_candidate
;
515 int grab_dist
= clause_insert_idx
- candidate_idx
;
516 /* We can't easily tell how much this will decrease the def-to-use
517 * distances, so just use how far it will be moved as a heuristic. */
518 part_of_clause
= can_reorder
&& same_resource
&& grab_dist
< clause_max_grab_dist
;
521 /* if current depends on candidate, add additional dependencies and continue */
522 bool can_move_down
= !is_vmem
|| part_of_clause
;
523 bool writes_exec
= false;
524 for (const Definition
& def
: candidate
->definitions
) {
525 if (def
.isTemp() && ctx
.depends_on
[def
.tempId()])
526 can_move_down
= false;
527 if (def
.isFixed() && def
.physReg() == exec
)
533 if (moving_spill
&& is_spill_reload(candidate
))
534 can_move_down
= false;
535 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
536 can_move_down
= false;
537 moving_interaction
|= get_barrier_interaction(candidate
.get());
538 moving_spill
|= is_spill_reload(candidate
);
/* blocked: fold candidate's operands into all three dependency sets */
539 if (!can_move_down
) {
540 for (const Operand
& op
: candidate
->operands
) {
542 ctx
.depends_on
[op
.tempId()] = true;
543 if (op
.isFirstKill()) {
544 ctx
.RAR_dependencies
[op
.tempId()] = true;
545 ctx
.new_RAR_dependencies
[op
.tempId()] = true;
549 register_pressure_clause
.update(register_demand
[candidate_idx
]);
550 can_reorder_smem
&= candidate
->format
!= Format::SMEM
|| can_reorder_candidate
;
551 can_reorder_vmem
&= !is_vmem
|| can_reorder_candidate
;
/* a clause member becomes a dependency itself, but stays movable as a unit */
555 if (part_of_clause
) {
556 for (const Operand
& op
: candidate
->operands
) {
558 ctx
.depends_on
[op
.tempId()] = true;
559 if (op
.isFirstKill())
560 ctx
.RAR_dependencies
[op
.tempId()] = true;
565 bool register_pressure_unknown
= false;
/* clause members check against new_RAR_dependencies (excludes the clause) */
566 std::vector
<bool>& RAR_deps
= part_of_clause
? ctx
.new_RAR_dependencies
: ctx
.RAR_dependencies
;
567 /* check if one of candidate's operands is killed by depending instruction */
568 for (const Operand
& op
: candidate
->operands
) {
569 if (op
.isTemp() && RAR_deps
[op
.tempId()]) {
570 // FIXME: account for difference in register pressure
571 register_pressure_unknown
= true;
574 if (register_pressure_unknown
) {
575 for (const Operand
& op
: candidate
->operands
) {
577 ctx
.depends_on
[op
.tempId()] = true;
578 if (op
.isFirstKill()) {
579 ctx
.RAR_dependencies
[op
.tempId()] = true;
580 ctx
.new_RAR_dependencies
[op
.tempId()] = true;
584 register_pressure_clause
.update(register_demand
[candidate_idx
]);
585 can_reorder_smem
&= candidate
->format
!= Format::SMEM
|| can_reorder_candidate
;
586 can_reorder_vmem
&= !is_vmem
|| can_reorder_candidate
;
/* clause members land at clause_insert_idx, others below current */
590 int insert_idx
= part_of_clause
? clause_insert_idx
: indep_insert_idx
;
591 RegisterDemand register_pressure
= part_of_clause
? register_pressure_clause
: register_pressure_indep
;
593 /* check if register pressure is low enough: the diff is negative if register pressure is increased */
594 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
595 const RegisterDemand temp
= getTempRegisters(candidate
);;
596 if (RegisterDemand(register_pressure
- candidate_diff
).exceeds(ctx
.max_registers
))
598 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
599 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - temp2
+ temp
;
600 if (new_demand
.exceeds(ctx
.max_registers
))
602 // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure
604 /* move the candidate below the memory load */
605 move_element(block
->instructions
, candidate_idx
, insert_idx
);
607 /* update register pressure */
608 move_element(register_demand
, candidate_idx
, insert_idx
);
609 for (int i
= candidate_idx
; i
< insert_idx
- 1; i
++) {
610 register_demand
[i
] -= candidate_diff
;
612 register_demand
[insert_idx
- 1] = new_demand
;
613 register_pressure_clause
-= candidate_diff
;
615 if (!part_of_clause
) {
616 register_pressure_indep
-= candidate_diff
;
/* a move past the previous SMEM's first user costs one stall credit */
620 if (candidate_idx
< ctx
.last_SMEM_dep_idx
)
621 ctx
.last_SMEM_stall
++;
624 /* create the initial set of values which depend on current */
625 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
626 std::fill(ctx
.RAR_dependencies
.begin(), ctx
.RAR_dependencies
.end(), false);
627 for (const Definition
& def
: current
->definitions
) {
629 ctx
.depends_on
[def
.tempId()] = true;
632 /* find the first instruction depending on current or find another VMEM */
633 RegisterDemand register_pressure
;
634 int insert_idx
= idx
;
635 moving_interaction
= barrier_none
;
636 moving_spill
= false;
637 // TODO: differentiate between loads and stores (load-load can always reorder)
638 can_reorder_vmem
= true;
639 can_reorder_smem
= true;
641 bool found_dependency
= false;
642 /* second, check if we have instructions after current to move up */
643 for (int candidate_idx
= idx
+ 1; k
< max_moves
&& candidate_idx
< (int) idx
+ window_size
; candidate_idx
++) {
644 assert(candidate_idx
< (int) block
->instructions
.size());
645 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
646 bool can_reorder_candidate
= can_reorder(candidate
.get());
647 bool is_vmem
= candidate
->isVMEM() || candidate
->isFlatOrGlobal();
649 if (candidate
->opcode
== aco_opcode::p_logical_end
)
651 if (!can_move_instr(candidate
, current
, moving_interaction
))
654 const bool writes_exec
= std::any_of(candidate
->definitions
.begin(), candidate
->definitions
.end(),
655 [](const Definition
& def
) {return def
.isFixed() && def
.physReg() == exec
; });
659 /* check if candidate depends on current */
660 bool is_dependency
= false;
661 if (candidate
->format
== Format::SMEM
)
662 is_dependency
= !can_reorder_smem
&& !can_reorder_candidate
;
664 is_dependency
= !can_reorder_vmem
&& !can_reorder_candidate
;
665 for (const Operand
& op
: candidate
->operands
) {
666 if (op
.isTemp() && ctx
.depends_on
[op
.tempId()]) {
667 is_dependency
= true;
671 if (moving_spill
&& is_spill_reload(candidate
))
672 is_dependency
= true;
673 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
674 is_dependency
= true;
675 moving_interaction
|= get_barrier_interaction(candidate
.get());
676 moving_spill
|= is_spill_reload(candidate
);
/* dependency: extend the dependency/RAR sets; the first one fixes insert_idx */
678 for (const Definition
& def
: candidate
->definitions
) {
680 ctx
.depends_on
[def
.tempId()] = true;
682 for (const Operand
& op
: candidate
->operands
) {
684 ctx
.RAR_dependencies
[op
.tempId()] = true;
686 /* update flag whether we can reorder other memory instructions */
687 can_reorder_smem
&= candidate
->format
!= Format::SMEM
|| can_reorder_candidate
;
688 can_reorder_vmem
&= !is_vmem
|| can_reorder_candidate
;
690 if (!found_dependency
) {
691 insert_idx
= candidate_idx
;
692 found_dependency
= true;
693 /* init register pressure */
694 register_pressure
= register_demand
[insert_idx
- 1];
698 } else if (is_vmem
) {
699 /* don't move up dependencies of other VMEM instructions */
700 for (const Definition
& def
: candidate
->definitions
) {
702 ctx
.depends_on
[def
.tempId()] = true;
706 /* update register pressure */
707 register_pressure
.update(register_demand
[candidate_idx
- 1]);
709 if (is_dependency
|| !found_dependency
)
711 assert(insert_idx
!= idx
);
713 bool register_pressure_unknown
= false;
714 /* check if candidate uses/kills an operand which is used by a dependency */
715 for (const Operand
& op
: candidate
->operands
) {
716 if (op
.isTemp() && op
.isFirstKill() && ctx
.RAR_dependencies
[op
.tempId()])
717 register_pressure_unknown
= true;
719 if (register_pressure_unknown
) {
720 for (const Definition
& def
: candidate
->definitions
) {
722 ctx
.depends_on
[def
.tempId()] = true;
724 for (const Operand
& op
: candidate
->operands
) {
726 ctx
.RAR_dependencies
[op
.tempId()] = true;
728 can_reorder_smem
&= candidate
->format
!= Format::SMEM
|| can_reorder_candidate
;
729 can_reorder_vmem
&= !is_vmem
|| can_reorder_candidate
;
733 /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
734 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
735 const RegisterDemand temp
= getTempRegisters(candidate
);
736 if (RegisterDemand(register_pressure
+ candidate_diff
).exceeds(ctx
.max_registers
))
738 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
739 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - temp2
+ candidate_diff
+ temp
;
740 if (new_demand
.exceeds(ctx
.max_registers
))
743 /* move the candidate above the insert_idx */
744 move_element(block
->instructions
, candidate_idx
, insert_idx
);
746 /* update register pressure */
747 move_element(register_demand
, candidate_idx
, insert_idx
);
748 for (int i
= insert_idx
+ 1; i
<= candidate_idx
; i
++) {
749 register_demand
[i
] += candidate_diff
;
751 register_demand
[insert_idx
] = new_demand
;
752 register_pressure
+= candidate_diff
;
/* Moves instructions that are independent of a position export `current` (at
 * index `idx`) down below it, so the export effectively moves up — shrinking
 * live ranges above it. Only the downward phase exists here (no upward phase),
 * memory instructions act as a hard stop, and the POS_EXP_WINDOW_SIZE /
 * POS_EXP_MAX_MOVES budgets plus the ctx.max_registers bound apply.
 * `register_demand` is kept in sync with every move.
 * NOTE(review): the extraction dropped interior lines — braces,
 * `break;`/`continue;` lines after guard conditions, the declaration and
 * increment of the move counter `k`, and the exec-write `if` body. */
758 void schedule_position_export(sched_ctx
& ctx
, Block
* block
,
759 std::vector
<RegisterDemand
>& register_demand
,
760 Instruction
* current
, int idx
)
763 int window_size
= POS_EXP_WINDOW_SIZE
;
764 int max_moves
= POS_EXP_MAX_MOVES
;
767 /* create the initial set of values which current depends on */
768 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
769 std::fill(ctx
.RAR_dependencies
.begin(), ctx
.RAR_dependencies
.end(), false);
770 for (const Operand
& op
: current
->operands
) {
772 ctx
.depends_on
[op
.tempId()] = true;
773 if (op
.isFirstKill())
774 ctx
.RAR_dependencies
[op
.tempId()] = true;
778 /* maintain how many registers remain free when moving instructions */
779 RegisterDemand register_pressure
= register_demand
[idx
];
781 /* first, check if we have instructions before current to move down */
782 int insert_idx
= idx
+ 1;
783 int moving_interaction
= barrier_none
;
784 bool moving_spill
= false;
/* scan upward; `k` (declaration dropped by extraction) counts moves */
786 for (int candidate_idx
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
787 assert(candidate_idx
>= 0);
788 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
790 /* break when encountering logical_start or barriers */
791 if (candidate
->opcode
== aco_opcode::p_logical_start
)
793 if (candidate
->opcode
== aco_opcode::p_exit_early_if
)
795 if (candidate
->isVMEM() || candidate
->format
== Format::SMEM
|| candidate
->isFlatOrGlobal())
797 if (!can_move_instr(candidate
, current
, moving_interaction
))
800 register_pressure
.update(register_demand
[candidate_idx
]);
802 /* if current depends on candidate, add additional dependencies and continue */
803 bool can_move_down
= true;
804 bool writes_exec
= false;
805 for (unsigned i
= 0; i
< candidate
->definitions
.size(); i
++) {
806 if (candidate
->definitions
[i
].isTemp() && ctx
.depends_on
[candidate
->definitions
[i
].tempId()])
807 can_move_down
= false;
808 if (candidate
->definitions
[i
].isFixed() && candidate
->definitions
[i
].physReg() == exec
)
814 if (moving_spill
&& is_spill_reload(candidate
))
815 can_move_down
= false;
816 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
817 can_move_down
= false;
818 moving_interaction
|= get_barrier_interaction(candidate
.get());
819 moving_spill
|= is_spill_reload(candidate
);
/* blocked: fold candidate's operands into the dependency sets and continue */
820 if (!can_move_down
) {
821 for (const Operand
& op
: candidate
->operands
) {
823 ctx
.depends_on
[op
.tempId()] = true;
824 if (op
.isFirstKill())
825 ctx
.RAR_dependencies
[op
.tempId()] = true;
831 bool register_pressure_unknown
= false;
832 /* check if one of candidate's operands is killed by depending instruction */
833 for (const Operand
& op
: candidate
->operands
) {
834 if (op
.isTemp() && ctx
.RAR_dependencies
[op
.tempId()]) {
835 // FIXME: account for difference in register pressure
836 register_pressure_unknown
= true;
839 if (register_pressure_unknown
) {
840 for (const Operand
& op
: candidate
->operands
) {
842 ctx
.depends_on
[op
.tempId()] = true;
843 if (op
.isFirstKill())
844 ctx
.RAR_dependencies
[op
.tempId()] = true;
850 /* check if register pressure is low enough: the diff is negative if register pressure is increased */
851 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
852 const RegisterDemand temp
= getTempRegisters(candidate
);;
853 if (RegisterDemand(register_pressure
- candidate_diff
).exceeds(ctx
.max_registers
))
855 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
856 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - temp2
+ temp
;
857 if (new_demand
.exceeds(ctx
.max_registers
))
859 // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure
861 /* move the candidate below the export */
862 move_element(block
->instructions
, candidate_idx
, insert_idx
);
864 /* update register pressure */
865 move_element(register_demand
, candidate_idx
, insert_idx
);
866 for (int i
= candidate_idx
; i
< insert_idx
- 1; i
++) {
867 register_demand
[i
] -= candidate_diff
;
869 register_demand
[insert_idx
- 1] = new_demand
;
870 register_pressure
-= candidate_diff
;
/* Schedules one block: resets the SMEM-stall tracking, then walks every
 * instruction with at least one definition and runs schedule_VMEM /
 * schedule_SMEM on it. For the last block of a hw_vs-stage program it
 * additionally pulls position exports (dest in [SQ_EXP_POS, SQ_EXP_PARAM))
 * upward, and finally recomputes the block's summary register_demand from
 * live_vars. NOTE(review): the extraction dropped interior lines — the
 * `continue;` after the empty-definitions check, `break;`/braces, and this
 * function's closing brace. */
876 void schedule_block(sched_ctx
& ctx
, Program
*program
, Block
* block
, live
& live_vars
)
878 ctx
.last_SMEM_dep_idx
= 0;
/* INT16_MIN: no previous SMEM yet, so nothing can be stalled */
879 ctx
.last_SMEM_stall
= INT16_MIN
;
881 /* go through all instructions and find memory loads */
882 for (unsigned idx
= 0; idx
< block
->instructions
.size(); idx
++) {
883 Instruction
* current
= block
->instructions
[idx
].get();
885 if (current
->definitions
.empty())
888 if (current
->isVMEM() || current
->isFlatOrGlobal())
889 schedule_VMEM(ctx
, block
, live_vars
.register_demand
[block
->index
], current
, idx
);
890 if (current
->format
== Format::SMEM
)
891 schedule_SMEM(ctx
, block
, live_vars
.register_demand
[block
->index
], current
, idx
);
894 if ((program
->stage
& hw_vs
) && block
->index
== program
->blocks
.size() - 1) {
895 /* Try to move position exports as far up as possible, to reduce register
896 * usage and because ISA reference guides say so. */
897 for (unsigned idx
= 0; idx
< block
->instructions
.size(); idx
++) {
898 Instruction
* current
= block
->instructions
[idx
].get();
900 if (current
->format
== Format::EXP
) {
901 unsigned target
= static_cast<Export_instruction
*>(current
)->dest
;
902 if (target
>= V_008DFC_SQ_EXP_POS
&& target
< V_008DFC_SQ_EXP_PARAM
)
903 schedule_position_export(ctx
, block
, live_vars
.register_demand
[block
->index
], current
, idx
);
908 /* resummarize the block's register demand */
909 block
->register_demand
= RegisterDemand();
910 for (unsigned idx
= 0; idx
< block
->instructions
.size(); idx
++) {
911 block
->register_demand
.update(live_vars
.register_demand
[block
->index
][idx
]);
916 void schedule_program(Program
*program
, live
& live_vars
)
919 ctx
.depends_on
.resize(program
->peekAllocationId());
920 ctx
.RAR_dependencies
.resize(program
->peekAllocationId());
921 ctx
.new_RAR_dependencies
.resize(program
->peekAllocationId());
922 /* Allowing the scheduler to reduce the number of waves to as low as 5
923 * improves performance of Thrones of Britannia significantly and doesn't
924 * seem to hurt anything else. */
925 if (program
->num_waves
<= 5)
926 ctx
.num_waves
= program
->num_waves
;
927 else if (program
->max_reg_demand
.vgpr
>= 32)
929 else if (program
->max_reg_demand
.vgpr
>= 28)
931 else if (program
->max_reg_demand
.vgpr
>= 24)
936 assert(ctx
.num_waves
> 0 && ctx
.num_waves
<= program
->num_waves
);
937 ctx
.max_registers
= { int16_t(((256 / ctx
.num_waves
) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program
, ctx
.num_waves
))};
939 for (Block
& block
: program
->blocks
)
940 schedule_block(ctx
, program
, &block
, live_vars
);
942 /* update max_reg_demand and num_waves */
943 RegisterDemand new_demand
;
944 for (Block
& block
: program
->blocks
) {
945 new_demand
.update(block
.register_demand
);
947 update_vgpr_sgpr_demand(program
, new_demand
);
949 /* if enabled, this code asserts that register_demand is updated correctly */
951 int prev_num_waves
= program
->num_waves
;
952 const RegisterDemand prev_max_demand
= program
->max_reg_demand
;
954 std::vector
<RegisterDemand
> demands(program
->blocks
.size());
955 for (unsigned j
= 0; j
< program
->blocks
.size(); j
++) {
956 demands
[j
] = program
->blocks
[j
].register_demand
;
959 struct radv_nir_compiler_options options
;
960 options
.chip_class
= program
->chip_class
;
961 live live_vars2
= aco::live_var_analysis(program
, &options
);
963 for (unsigned j
= 0; j
< program
->blocks
.size(); j
++) {
964 Block
&b
= program
->blocks
[j
];
965 for (unsigned i
= 0; i
< b
.instructions
.size(); i
++)
966 assert(live_vars
.register_demand
[b
.index
][i
] == live_vars2
.register_demand
[b
.index
][i
]);
967 assert(b
.register_demand
== demands
[j
]);
970 assert(program
->max_reg_demand
== prev_max_demand
);
971 assert(program
->num_waves
== prev_num_waves
);