/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
26 #include <unordered_set>
29 #include "vulkan/radv_shader.h" // for radv_nir_compiler_options
30 #include "amdgfxregs.h"
/* Scheduling tunables. All of these are expanded in a scope where a sched_ctx
 * named `ctx` is visible; the fewer waves we target, the larger the window and
 * move budget (more latency to hide per wave). */
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
44 std::vector
<bool> depends_on
;
45 std::vector
<bool> RAR_dependencies
;
46 /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the
47 * instructions in the clause, since new instructions in the clause are not
48 * moved past any other instructions in the clause. */
49 std::vector
<bool> new_RAR_dependencies
;
51 RegisterDemand max_registers
;
53 int16_t last_SMEM_stall
;
54 int last_SMEM_dep_idx
;
/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * from Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered it tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler cares for the current register pressure:
 * Instructions will only be moved if the register pressure won't exceed a certain bound.
 */
/* Moves the element at index `idx` so that it ends up directly in front of the
 * element that was at index `before`, shifting everything in between by one.
 * A no-op when idx == before. Works on any random-access container.
 * NOTE(review): the `template` line and the `if (idx < before) {` guard were
 * dropped by the extraction; both are unambiguously implied by the dangling
 * `} else if (idx > before)` branch and the use of `T`. */
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
    if (idx < before) {
        /* rotate [idx, before) left by one: element idx lands at before-1 */
        auto begin = std::next(list.begin(), idx);
        auto end = std::next(list.begin(), before);
        std::rotate(begin, begin + 1, end);
    } else if (idx > before) {
        /* rotate [before, idx+1) right by one: element idx lands at before */
        auto begin = std::next(list.begin(), before);
        auto end = std::next(list.begin(), idx + 1);
        std::rotate(begin, end - 1, end);
    }
}
80 static RegisterDemand
getLiveChanges(aco_ptr
<Instruction
>& instr
)
82 RegisterDemand changes
;
83 for (const Definition
& def
: instr
->definitions
) {
84 if (!def
.isTemp() || def
.isKill())
86 changes
+= def
.getTemp();
89 for (const Operand
& op
: instr
->operands
) {
90 if (!op
.isTemp() || !op
.isFirstKill())
92 changes
-= op
.getTemp();
98 static RegisterDemand
getTempRegisters(aco_ptr
<Instruction
>& instr
)
100 RegisterDemand temp_registers
;
101 for (const Definition
& def
: instr
->definitions
) {
102 if (!def
.isTemp() || !def
.isKill())
104 temp_registers
+= def
.getTemp();
106 return temp_registers
;
109 static bool is_spill_reload(aco_ptr
<Instruction
>& instr
)
111 return instr
->opcode
== aco_opcode::p_spill
|| instr
->opcode
== aco_opcode::p_reload
;
114 bool can_move_instr(aco_ptr
<Instruction
>& instr
, Instruction
* current
, int moving_interaction
)
116 /* don't move exports so that they stay closer together */
117 if (instr
->format
== Format::EXP
)
120 /* don't move s_memtime/s_memrealtime */
121 if (instr
->opcode
== aco_opcode::s_memtime
|| instr
->opcode
== aco_opcode::s_memrealtime
)
124 /* handle barriers */
126 /* TODO: instead of stopping, maybe try to move the barriers and any
127 * instructions interacting with them instead? */
128 if (instr
->format
!= Format::PSEUDO_BARRIER
) {
129 if (instr
->opcode
== aco_opcode::s_barrier
) {
130 bool can_reorder
= false;
131 switch (current
->format
) {
133 can_reorder
= static_cast<SMEM_instruction
*>(current
)->can_reorder
;
136 can_reorder
= static_cast<MUBUF_instruction
*>(current
)->can_reorder
;
139 can_reorder
= static_cast<MIMG_instruction
*>(current
)->can_reorder
;
144 return can_reorder
&& moving_interaction
== barrier_none
;
150 int interaction
= get_barrier_interaction(current
);
151 interaction
|= moving_interaction
;
153 switch (instr
->opcode
) {
154 case aco_opcode::p_memory_barrier_atomic
:
155 return !(interaction
& barrier_atomic
);
156 /* For now, buffer and image barriers are treated the same. this is because of
157 * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
158 * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
159 * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
160 * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
161 * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
162 * Either way, this solution should work. */
163 case aco_opcode::p_memory_barrier_buffer
:
164 case aco_opcode::p_memory_barrier_image
:
165 return !(interaction
& (barrier_image
| barrier_buffer
));
166 case aco_opcode::p_memory_barrier_shared
:
167 return !(interaction
& barrier_shared
);
168 case aco_opcode::p_memory_barrier_all
:
169 return interaction
== barrier_none
;
175 bool can_reorder(Instruction
* candidate
)
177 switch (candidate
->format
) {
179 return static_cast<SMEM_instruction
*>(candidate
)->can_reorder
;
181 return static_cast<MUBUF_instruction
*>(candidate
)->can_reorder
;
183 return static_cast<MIMG_instruction
*>(candidate
)->can_reorder
;
185 return static_cast<MTBUF_instruction
*>(candidate
)->can_reorder
;
188 case Format::SCRATCH
:
195 void schedule_SMEM(sched_ctx
& ctx
, Block
* block
,
196 std::vector
<RegisterDemand
>& register_demand
,
197 Instruction
* current
, int idx
)
200 int window_size
= SMEM_WINDOW_SIZE
;
201 int max_moves
= SMEM_MAX_MOVES
;
203 bool can_reorder_cur
= can_reorder(current
);
205 /* don't move s_memtime/s_memrealtime */
206 if (current
->opcode
== aco_opcode::s_memtime
|| current
->opcode
== aco_opcode::s_memrealtime
)
209 /* create the initial set of values which current depends on */
210 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
211 for (const Operand
& op
: current
->operands
) {
213 ctx
.depends_on
[op
.tempId()] = true;
216 /* maintain how many registers remain free when moving instructions */
217 RegisterDemand register_pressure
= register_demand
[idx
];
219 /* first, check if we have instructions before current to move down */
220 int insert_idx
= idx
+ 1;
221 int moving_interaction
= barrier_none
;
222 bool moving_spill
= false;
224 for (int candidate_idx
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
225 assert(candidate_idx
>= 0);
226 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
227 bool can_reorder_candidate
= can_reorder(candidate
.get());
229 /* break if we'd make the previous SMEM instruction stall */
230 bool can_stall_prev_smem
= idx
<= ctx
.last_SMEM_dep_idx
&& candidate_idx
< ctx
.last_SMEM_dep_idx
;
231 if (can_stall_prev_smem
&& ctx
.last_SMEM_stall
>= 0)
234 /* break when encountering another MEM instruction, logical_start or barriers */
235 if (!can_reorder_candidate
&& !can_reorder_cur
)
237 if (candidate
->opcode
== aco_opcode::p_logical_start
)
239 if (candidate
->opcode
== aco_opcode::p_exit_early_if
)
241 if (!can_move_instr(candidate
, current
, moving_interaction
))
243 if (candidate
->isVMEM())
245 register_pressure
.update(register_demand
[candidate_idx
]);
247 /* if current depends on candidate, add additional dependencies and continue */
248 bool can_move_down
= true;
249 bool writes_exec
= false;
250 for (const Definition
& def
: candidate
->definitions
) {
251 if (def
.isTemp() && ctx
.depends_on
[def
.tempId()])
252 can_move_down
= false;
253 if (def
.isFixed() && def
.physReg() == exec
)
259 if (moving_spill
&& is_spill_reload(candidate
))
260 can_move_down
= false;
261 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
262 can_move_down
= false;
263 moving_interaction
|= get_barrier_interaction(candidate
.get());
264 moving_spill
|= is_spill_reload(candidate
);
265 if (!can_move_down
) {
266 for (const Operand
& op
: candidate
->operands
) {
268 ctx
.depends_on
[op
.tempId()] = true;
270 can_reorder_cur
&= can_reorder_candidate
;
274 bool register_pressure_unknown
= false;
275 /* check if one of candidate's operands is killed by depending instruction */
276 for (const Operand
& op
: candidate
->operands
) {
277 if (op
.isTemp() && ctx
.depends_on
[op
.tempId()]) {
278 // FIXME: account for difference in register pressure
279 register_pressure_unknown
= true;
282 if (register_pressure_unknown
) {
283 for (const Operand
& op
: candidate
->operands
) {
285 ctx
.depends_on
[op
.tempId()] = true;
287 can_reorder_cur
&= can_reorder_candidate
;
291 /* check if register pressure is low enough: the diff is negative if register pressure is increased */
292 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
293 const RegisterDemand tempDemand
= getTempRegisters(candidate
);
294 if (RegisterDemand(register_pressure
- candidate_diff
).exceeds(ctx
.max_registers
))
296 const RegisterDemand tempDemand2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
297 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - tempDemand2
+ tempDemand
;
298 if (new_demand
.exceeds(ctx
.max_registers
))
300 // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure
302 /* move the candidate below the memory load */
303 move_element(block
->instructions
, candidate_idx
, insert_idx
);
305 /* update register pressure */
306 move_element(register_demand
, candidate_idx
, insert_idx
);
307 for (int i
= candidate_idx
; i
< insert_idx
- 1; i
++) {
308 register_demand
[i
] -= candidate_diff
;
310 register_demand
[insert_idx
- 1] = new_demand
;
311 register_pressure
-= candidate_diff
;
313 if (candidate_idx
< ctx
.last_SMEM_dep_idx
)
314 ctx
.last_SMEM_stall
++;
319 /* create the initial set of values which depend on current */
320 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
321 std::fill(ctx
.RAR_dependencies
.begin(), ctx
.RAR_dependencies
.end(), false);
322 for (const Definition
& def
: current
->definitions
) {
324 ctx
.depends_on
[def
.tempId()] = true;
327 /* find the first instruction depending on current or find another MEM */
328 insert_idx
= idx
+ 1;
329 moving_interaction
= barrier_none
;
330 moving_spill
= false;
331 can_reorder_cur
= true;
333 bool found_dependency
= false;
334 /* second, check if we have instructions after current to move up */
335 for (int candidate_idx
= idx
+ 1; k
< max_moves
&& candidate_idx
< (int) idx
+ window_size
; candidate_idx
++) {
336 assert(candidate_idx
< (int) block
->instructions
.size());
337 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
338 bool can_reorder_candidate
= can_reorder(candidate
.get());
340 if (candidate
->opcode
== aco_opcode::p_logical_end
)
342 if (!can_move_instr(candidate
, current
, moving_interaction
))
345 const bool writes_exec
= std::any_of(candidate
->definitions
.begin(), candidate
->definitions
.end(),
346 [](const Definition
& def
) { return def
.isFixed() && def
.physReg() == exec
;});
350 /* check if candidate depends on current */
351 bool is_dependency
= std::any_of(candidate
->operands
.begin(), candidate
->operands
.end(),
352 [&ctx
](const Operand
& op
) { return op
.isTemp() && ctx
.depends_on
[op
.tempId()];});
353 /* no need to steal from following VMEM instructions */
354 if (is_dependency
&& candidate
->isVMEM())
356 if (moving_spill
&& is_spill_reload(candidate
))
357 is_dependency
= true;
358 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
359 is_dependency
= true;
360 moving_interaction
|= get_barrier_interaction(candidate
.get());
361 moving_spill
|= is_spill_reload(candidate
);
363 for (const Definition
& def
: candidate
->definitions
) {
365 ctx
.depends_on
[def
.tempId()] = true;
367 for (const Operand
& op
: candidate
->operands
) {
369 ctx
.RAR_dependencies
[op
.tempId()] = true;
371 if (!found_dependency
) {
372 insert_idx
= candidate_idx
;
373 found_dependency
= true;
374 /* init register pressure */
375 register_pressure
= register_demand
[insert_idx
- 1];
379 if (!can_reorder_candidate
&& !can_reorder_cur
)
382 if (!found_dependency
) {
387 /* update register pressure */
388 register_pressure
.update(register_demand
[candidate_idx
- 1]);
391 can_reorder_cur
&= can_reorder_candidate
;
394 assert(insert_idx
!= idx
);
396 // TODO: correctly calculate register pressure for this case
397 bool register_pressure_unknown
= false;
398 /* check if candidate uses/kills an operand which is used by a dependency */
399 for (const Operand
& op
: candidate
->operands
) {
400 if (op
.isTemp() && ctx
.RAR_dependencies
[op
.tempId()])
401 register_pressure_unknown
= true;
403 if (register_pressure_unknown
) {
404 if (candidate
->isVMEM())
406 for (const Definition
& def
: candidate
->definitions
) {
408 ctx
.RAR_dependencies
[def
.tempId()] = true;
410 for (const Operand
& op
: candidate
->operands
) {
412 ctx
.RAR_dependencies
[op
.tempId()] = true;
414 can_reorder_cur
&= can_reorder_candidate
;
418 /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
419 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
420 const RegisterDemand temp
= getTempRegisters(candidate
);
421 if (RegisterDemand(register_pressure
+ candidate_diff
).exceeds(ctx
.max_registers
))
423 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
424 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - temp2
+ candidate_diff
+ temp
;
425 if (new_demand
.exceeds(ctx
.max_registers
))
428 /* move the candidate above the insert_idx */
429 move_element(block
->instructions
, candidate_idx
, insert_idx
);
431 /* update register pressure */
432 move_element(register_demand
, candidate_idx
, insert_idx
);
433 for (int i
= insert_idx
+ 1; i
<= candidate_idx
; i
++) {
434 register_demand
[i
] += candidate_diff
;
436 register_demand
[insert_idx
] = new_demand
;
437 register_pressure
+= candidate_diff
;
442 ctx
.last_SMEM_dep_idx
= found_dependency
? insert_idx
: 0;
443 ctx
.last_SMEM_stall
= 10 - ctx
.num_waves
- k
;
446 void schedule_VMEM(sched_ctx
& ctx
, Block
* block
,
447 std::vector
<RegisterDemand
>& register_demand
,
448 Instruction
* current
, int idx
)
451 int window_size
= VMEM_WINDOW_SIZE
;
452 int max_moves
= VMEM_MAX_MOVES
;
453 int clause_max_grab_dist
= VMEM_CLAUSE_MAX_GRAB_DIST
;
455 /* initially true as we don't pull other VMEM instructions
456 * through the current instruction */
457 bool can_reorder_vmem
= true;
458 bool can_reorder_smem
= true;
460 /* create the initial set of values which current depends on */
461 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
462 std::fill(ctx
.RAR_dependencies
.begin(), ctx
.RAR_dependencies
.end(), false);
463 std::fill(ctx
.new_RAR_dependencies
.begin(), ctx
.new_RAR_dependencies
.end(), false);
464 for (const Operand
& op
: current
->operands
) {
466 ctx
.depends_on
[op
.tempId()] = true;
467 if (op
.isFirstKill())
468 ctx
.RAR_dependencies
[op
.tempId()] = true;
472 /* maintain how many registers remain free when moving instructions */
473 RegisterDemand register_pressure_indep
= register_demand
[idx
];
474 RegisterDemand register_pressure_clause
= register_demand
[idx
];
476 /* first, check if we have instructions before current to move down */
477 int indep_insert_idx
= idx
+ 1;
478 int clause_insert_idx
= idx
;
479 int moving_interaction
= barrier_none
;
480 bool moving_spill
= false;
482 for (int candidate_idx
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
483 assert(candidate_idx
>= 0);
484 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
485 bool can_reorder_candidate
= can_reorder(candidate
.get());
487 /* break when encountering another VMEM instruction, logical_start or barriers */
488 if (!can_reorder_smem
&& candidate
->format
== Format::SMEM
&& !can_reorder_candidate
)
490 if (candidate
->opcode
== aco_opcode::p_logical_start
)
492 if (candidate
->opcode
== aco_opcode::p_exit_early_if
)
494 if (!can_move_instr(candidate
, current
, moving_interaction
))
497 /* break if we'd make the previous SMEM instruction stall */
498 bool can_stall_prev_smem
= idx
<= ctx
.last_SMEM_dep_idx
&& candidate_idx
< ctx
.last_SMEM_dep_idx
;
499 if (can_stall_prev_smem
&& ctx
.last_SMEM_stall
>= 0)
501 register_pressure_indep
.update(register_demand
[candidate_idx
]);
503 bool part_of_clause
= false;
504 if (candidate
->isVMEM()) {
505 bool same_resource
= candidate
->operands
[1].tempId() == current
->operands
[1].tempId();
506 bool can_reorder
= can_reorder_vmem
|| can_reorder_candidate
;
507 int grab_dist
= clause_insert_idx
- candidate_idx
;
508 /* We can't easily tell how much this will decrease the def-to-use
509 * distances, so just use how far it will be moved as a heuristic. */
510 part_of_clause
= can_reorder
&& same_resource
&& grab_dist
< clause_max_grab_dist
;
513 /* if current depends on candidate, add additional dependencies and continue */
514 bool can_move_down
= !candidate
->isVMEM() || part_of_clause
;
515 bool writes_exec
= false;
516 for (const Definition
& def
: candidate
->definitions
) {
517 if (def
.isTemp() && ctx
.depends_on
[def
.tempId()])
518 can_move_down
= false;
519 if (def
.isFixed() && def
.physReg() == exec
)
525 if (moving_spill
&& is_spill_reload(candidate
))
526 can_move_down
= false;
527 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
528 can_move_down
= false;
529 moving_interaction
|= get_barrier_interaction(candidate
.get());
530 moving_spill
|= is_spill_reload(candidate
);
531 if (!can_move_down
) {
532 for (const Operand
& op
: candidate
->operands
) {
534 ctx
.depends_on
[op
.tempId()] = true;
535 if (op
.isFirstKill()) {
536 ctx
.RAR_dependencies
[op
.tempId()] = true;
537 ctx
.new_RAR_dependencies
[op
.tempId()] = true;
541 register_pressure_clause
.update(register_demand
[candidate_idx
]);
542 can_reorder_smem
&= candidate
->format
!= Format::SMEM
|| can_reorder_candidate
;
543 can_reorder_vmem
&= !candidate
->isVMEM() || can_reorder_candidate
;
547 if (part_of_clause
) {
548 for (const Operand
& op
: candidate
->operands
) {
550 ctx
.depends_on
[op
.tempId()] = true;
551 if (op
.isFirstKill())
552 ctx
.RAR_dependencies
[op
.tempId()] = true;
557 bool register_pressure_unknown
= false;
558 std::vector
<bool>& RAR_deps
= part_of_clause
? ctx
.new_RAR_dependencies
: ctx
.RAR_dependencies
;
559 /* check if one of candidate's operands is killed by depending instruction */
560 for (const Operand
& op
: candidate
->operands
) {
561 if (op
.isTemp() && RAR_deps
[op
.tempId()]) {
562 // FIXME: account for difference in register pressure
563 register_pressure_unknown
= true;
566 if (register_pressure_unknown
) {
567 for (const Operand
& op
: candidate
->operands
) {
569 ctx
.depends_on
[op
.tempId()] = true;
570 if (op
.isFirstKill()) {
571 ctx
.RAR_dependencies
[op
.tempId()] = true;
572 ctx
.new_RAR_dependencies
[op
.tempId()] = true;
576 register_pressure_clause
.update(register_demand
[candidate_idx
]);
577 can_reorder_smem
&= candidate
->format
!= Format::SMEM
|| can_reorder_candidate
;
578 can_reorder_vmem
&= !candidate
->isVMEM() || can_reorder_candidate
;
582 int insert_idx
= part_of_clause
? clause_insert_idx
: indep_insert_idx
;
583 RegisterDemand register_pressure
= part_of_clause
? register_pressure_clause
: register_pressure_indep
;
585 /* check if register pressure is low enough: the diff is negative if register pressure is increased */
586 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
587 const RegisterDemand temp
= getTempRegisters(candidate
);;
588 if (RegisterDemand(register_pressure
- candidate_diff
).exceeds(ctx
.max_registers
))
590 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
591 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - temp2
+ temp
;
592 if (new_demand
.exceeds(ctx
.max_registers
))
594 // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure
596 /* move the candidate below the memory load */
597 move_element(block
->instructions
, candidate_idx
, insert_idx
);
599 /* update register pressure */
600 move_element(register_demand
, candidate_idx
, insert_idx
);
601 for (int i
= candidate_idx
; i
< insert_idx
- 1; i
++) {
602 register_demand
[i
] -= candidate_diff
;
604 register_demand
[insert_idx
- 1] = new_demand
;
605 register_pressure_clause
-= candidate_diff
;
607 if (!part_of_clause
) {
608 register_pressure_indep
-= candidate_diff
;
612 if (candidate_idx
< ctx
.last_SMEM_dep_idx
)
613 ctx
.last_SMEM_stall
++;
616 /* create the initial set of values which depend on current */
617 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
618 std::fill(ctx
.RAR_dependencies
.begin(), ctx
.RAR_dependencies
.end(), false);
619 for (const Definition
& def
: current
->definitions
) {
621 ctx
.depends_on
[def
.tempId()] = true;
624 /* find the first instruction depending on current or find another VMEM */
625 RegisterDemand register_pressure
;
626 int insert_idx
= idx
;
627 moving_interaction
= barrier_none
;
628 moving_spill
= false;
629 // TODO: differentiate between loads and stores (load-load can always reorder)
630 can_reorder_vmem
= true;
631 can_reorder_smem
= true;
633 bool found_dependency
= false;
634 /* second, check if we have instructions after current to move up */
635 for (int candidate_idx
= idx
+ 1; k
< max_moves
&& candidate_idx
< (int) idx
+ window_size
; candidate_idx
++) {
636 assert(candidate_idx
< (int) block
->instructions
.size());
637 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
638 bool can_reorder_candidate
= can_reorder(candidate
.get());
640 if (candidate
->opcode
== aco_opcode::p_logical_end
)
642 if (!can_move_instr(candidate
, current
, moving_interaction
))
645 const bool writes_exec
= std::any_of(candidate
->definitions
.begin(), candidate
->definitions
.end(),
646 [](const Definition
& def
) {return def
.isFixed() && def
.physReg() == exec
; });
650 /* check if candidate depends on current */
651 bool is_dependency
= false;
652 if (candidate
->format
== Format::SMEM
)
653 is_dependency
= !can_reorder_smem
&& !can_reorder_candidate
;
654 if (candidate
->isVMEM())
655 is_dependency
= !can_reorder_vmem
&& !can_reorder_candidate
;
656 for (const Operand
& op
: candidate
->operands
) {
657 if (op
.isTemp() && ctx
.depends_on
[op
.tempId()]) {
658 is_dependency
= true;
662 if (moving_spill
&& is_spill_reload(candidate
))
663 is_dependency
= true;
664 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
665 is_dependency
= true;
666 moving_interaction
|= get_barrier_interaction(candidate
.get());
667 moving_spill
|= is_spill_reload(candidate
);
669 for (const Definition
& def
: candidate
->definitions
) {
671 ctx
.depends_on
[def
.tempId()] = true;
673 for (const Operand
& op
: candidate
->operands
) {
675 ctx
.RAR_dependencies
[op
.tempId()] = true;
677 /* update flag whether we can reorder other memory instructions */
678 can_reorder_smem
&= candidate
->format
!= Format::SMEM
|| can_reorder_candidate
;
679 can_reorder_vmem
&= !candidate
->isVMEM() || can_reorder_candidate
;
681 if (!found_dependency
) {
682 insert_idx
= candidate_idx
;
683 found_dependency
= true;
684 /* init register pressure */
685 register_pressure
= register_demand
[insert_idx
- 1];
689 } else if (candidate
->isVMEM()) {
690 /* don't move up dependencies of other VMEM instructions */
691 for (const Definition
& def
: candidate
->definitions
) {
693 ctx
.depends_on
[def
.tempId()] = true;
697 /* update register pressure */
698 register_pressure
.update(register_demand
[candidate_idx
- 1]);
700 if (is_dependency
|| !found_dependency
)
702 assert(insert_idx
!= idx
);
704 bool register_pressure_unknown
= false;
705 /* check if candidate uses/kills an operand which is used by a dependency */
706 for (const Operand
& op
: candidate
->operands
) {
707 if (op
.isTemp() && op
.isFirstKill() && ctx
.RAR_dependencies
[op
.tempId()])
708 register_pressure_unknown
= true;
710 if (register_pressure_unknown
) {
711 for (const Definition
& def
: candidate
->definitions
) {
713 ctx
.depends_on
[def
.tempId()] = true;
715 for (const Operand
& op
: candidate
->operands
) {
717 ctx
.RAR_dependencies
[op
.tempId()] = true;
719 can_reorder_smem
&= candidate
->format
!= Format::SMEM
|| can_reorder_candidate
;
720 can_reorder_vmem
&= !candidate
->isVMEM() || can_reorder_candidate
;
724 /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
725 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
726 const RegisterDemand temp
= getTempRegisters(candidate
);
727 if (RegisterDemand(register_pressure
+ candidate_diff
).exceeds(ctx
.max_registers
))
729 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
730 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - temp2
+ candidate_diff
+ temp
;
731 if (new_demand
.exceeds(ctx
.max_registers
))
734 /* move the candidate above the insert_idx */
735 move_element(block
->instructions
, candidate_idx
, insert_idx
);
737 /* update register pressure */
738 move_element(register_demand
, candidate_idx
, insert_idx
);
739 for (int i
= insert_idx
+ 1; i
<= candidate_idx
; i
++) {
740 register_demand
[i
] += candidate_diff
;
742 register_demand
[insert_idx
] = new_demand
;
743 register_pressure
+= candidate_diff
;
749 void schedule_position_export(sched_ctx
& ctx
, Block
* block
,
750 std::vector
<RegisterDemand
>& register_demand
,
751 Instruction
* current
, int idx
)
754 int window_size
= POS_EXP_WINDOW_SIZE
;
755 int max_moves
= POS_EXP_MAX_MOVES
;
758 /* create the initial set of values which current depends on */
759 std::fill(ctx
.depends_on
.begin(), ctx
.depends_on
.end(), false);
760 std::fill(ctx
.RAR_dependencies
.begin(), ctx
.RAR_dependencies
.end(), false);
761 for (const Operand
& op
: current
->operands
) {
763 ctx
.depends_on
[op
.tempId()] = true;
764 if (op
.isFirstKill())
765 ctx
.RAR_dependencies
[op
.tempId()] = true;
769 /* maintain how many registers remain free when moving instructions */
770 RegisterDemand register_pressure
= register_demand
[idx
];
772 /* first, check if we have instructions before current to move down */
773 int insert_idx
= idx
+ 1;
774 int moving_interaction
= barrier_none
;
775 bool moving_spill
= false;
777 for (int candidate_idx
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
778 assert(candidate_idx
>= 0);
779 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
781 /* break when encountering logical_start or barriers */
782 if (candidate
->opcode
== aco_opcode::p_logical_start
)
784 if (candidate
->opcode
== aco_opcode::p_exit_early_if
)
786 if (candidate
->isVMEM() || candidate
->format
== Format::SMEM
)
788 if (!can_move_instr(candidate
, current
, moving_interaction
))
791 register_pressure
.update(register_demand
[candidate_idx
]);
793 /* if current depends on candidate, add additional dependencies and continue */
794 bool can_move_down
= true;
795 bool writes_exec
= false;
796 for (unsigned i
= 0; i
< candidate
->definitions
.size(); i
++) {
797 if (candidate
->definitions
[i
].isTemp() && ctx
.depends_on
[candidate
->definitions
[i
].tempId()])
798 can_move_down
= false;
799 if (candidate
->definitions
[i
].isFixed() && candidate
->definitions
[i
].physReg() == exec
)
805 if (moving_spill
&& is_spill_reload(candidate
))
806 can_move_down
= false;
807 if ((moving_interaction
& barrier_shared
) && candidate
->format
== Format::DS
)
808 can_move_down
= false;
809 moving_interaction
|= get_barrier_interaction(candidate
.get());
810 moving_spill
|= is_spill_reload(candidate
);
811 if (!can_move_down
) {
812 for (const Operand
& op
: candidate
->operands
) {
814 ctx
.depends_on
[op
.tempId()] = true;
815 if (op
.isFirstKill())
816 ctx
.RAR_dependencies
[op
.tempId()] = true;
822 bool register_pressure_unknown
= false;
823 /* check if one of candidate's operands is killed by depending instruction */
824 for (const Operand
& op
: candidate
->operands
) {
825 if (op
.isTemp() && ctx
.RAR_dependencies
[op
.tempId()]) {
826 // FIXME: account for difference in register pressure
827 register_pressure_unknown
= true;
830 if (register_pressure_unknown
) {
831 for (const Operand
& op
: candidate
->operands
) {
833 ctx
.depends_on
[op
.tempId()] = true;
834 if (op
.isFirstKill())
835 ctx
.RAR_dependencies
[op
.tempId()] = true;
841 /* check if register pressure is low enough: the diff is negative if register pressure is increased */
842 const RegisterDemand candidate_diff
= getLiveChanges(candidate
);
843 const RegisterDemand temp
= getTempRegisters(candidate
);;
844 if (RegisterDemand(register_pressure
- candidate_diff
).exceeds(ctx
.max_registers
))
846 const RegisterDemand temp2
= getTempRegisters(block
->instructions
[insert_idx
- 1]);
847 const RegisterDemand new_demand
= register_demand
[insert_idx
- 1] - temp2
+ temp
;
848 if (new_demand
.exceeds(ctx
.max_registers
))
850 // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure
852 /* move the candidate below the export */
853 move_element(block
->instructions
, candidate_idx
, insert_idx
);
855 /* update register pressure */
856 move_element(register_demand
, candidate_idx
, insert_idx
);
857 for (int i
= candidate_idx
; i
< insert_idx
- 1; i
++) {
858 register_demand
[i
] -= candidate_diff
;
860 register_demand
[insert_idx
- 1] = new_demand
;
861 register_pressure
-= candidate_diff
;
867 void schedule_block(sched_ctx
& ctx
, Program
*program
, Block
* block
, live
& live_vars
)
869 ctx
.last_SMEM_dep_idx
= 0;
870 ctx
.last_SMEM_stall
= INT16_MIN
;
872 /* go through all instructions and find memory loads */
873 for (unsigned idx
= 0; idx
< block
->instructions
.size(); idx
++) {
874 Instruction
* current
= block
->instructions
[idx
].get();
876 if (current
->definitions
.empty())
879 if (current
->isVMEM())
880 schedule_VMEM(ctx
, block
, live_vars
.register_demand
[block
->index
], current
, idx
);
881 if (current
->format
== Format::SMEM
)
882 schedule_SMEM(ctx
, block
, live_vars
.register_demand
[block
->index
], current
, idx
);
885 if ((program
->stage
& hw_vs
) && block
->index
== program
->blocks
.size() - 1) {
886 /* Try to move position exports as far up as possible, to reduce register
887 * usage and because ISA reference guides say so. */
888 for (unsigned idx
= 0; idx
< block
->instructions
.size(); idx
++) {
889 Instruction
* current
= block
->instructions
[idx
].get();
891 if (current
->format
== Format::EXP
) {
892 unsigned target
= static_cast<Export_instruction
*>(current
)->dest
;
893 if (target
>= V_008DFC_SQ_EXP_POS
&& target
< V_008DFC_SQ_EXP_PARAM
)
894 schedule_position_export(ctx
, block
, live_vars
.register_demand
[block
->index
], current
, idx
);
899 /* resummarize the block's register demand */
900 block
->register_demand
= RegisterDemand();
901 for (unsigned idx
= 0; idx
< block
->instructions
.size(); idx
++) {
902 block
->register_demand
.update(live_vars
.register_demand
[block
->index
][idx
]);
907 void schedule_program(Program
*program
, live
& live_vars
)
910 ctx
.depends_on
.resize(program
->peekAllocationId());
911 ctx
.RAR_dependencies
.resize(program
->peekAllocationId());
912 ctx
.new_RAR_dependencies
.resize(program
->peekAllocationId());
913 /* Allowing the scheduler to reduce the number of waves to as low as 5
914 * improves performance of Thrones of Britannia significantly and doesn't
915 * seem to hurt anything else. */
916 if (program
->num_waves
<= 5)
917 ctx
.num_waves
= program
->num_waves
;
918 else if (program
->max_reg_demand
.vgpr
>= 32)
920 else if (program
->max_reg_demand
.vgpr
>= 28)
922 else if (program
->max_reg_demand
.vgpr
>= 24)
927 assert(ctx
.num_waves
> 0 && ctx
.num_waves
<= program
->num_waves
);
928 ctx
.max_registers
= { int16_t(((256 / ctx
.num_waves
) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program
, ctx
.num_waves
))};
930 for (Block
& block
: program
->blocks
)
931 schedule_block(ctx
, program
, &block
, live_vars
);
933 /* update max_reg_demand and num_waves */
934 RegisterDemand new_demand
;
935 for (Block
& block
: program
->blocks
) {
936 new_demand
.update(block
.register_demand
);
938 update_vgpr_sgpr_demand(program
, new_demand
);
940 /* if enabled, this code asserts that register_demand is updated correctly */
942 int prev_num_waves
= program
->num_waves
;
943 const RegisterDemand prev_max_demand
= program
->max_reg_demand
;
945 std::vector
<RegisterDemand
> demands(program
->blocks
.size());
946 for (unsigned j
= 0; j
< program
->blocks
.size(); j
++) {
947 demands
[j
] = program
->blocks
[j
].register_demand
;
950 struct radv_nir_compiler_options options
;
951 options
.chip_class
= program
->chip_class
;
952 live live_vars2
= aco::live_var_analysis(program
, &options
);
954 for (unsigned j
= 0; j
< program
->blocks
.size(); j
++) {
955 Block
&b
= program
->blocks
[j
];
956 for (unsigned i
= 0; i
< b
.instructions
.size(); i
++)
957 assert(live_vars
.register_demand
[b
.index
][i
] == live_vars2
.register_demand
[b
.index
][i
]);
958 assert(b
.register_demand
== demands
[j
]);
961 assert(program
->max_reg_demand
== prev_max_demand
);
962 assert(program
->num_waves
== prev_num_waves
);