2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26 #include <unordered_set>
29 #include "vulkan/radv_shader.h" // for radv_nir_compiler_options
30 #include "amdgfxregs.h"
32 #define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
33 #define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
34 #define POS_EXP_WINDOW_SIZE 512
35 #define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
36 #define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
37 #define POS_EXP_MAX_MOVES 512
/* Scheduler state shared across all schedule_* passes (reused per block to
 * avoid reallocation).
 * [gap: the enclosing `struct sched_ctx {` opener (around orig. l.39-41) and
 * orig. l.45 — presumably the `num_waves` member referenced by the
 * window/move-limit macros above and by schedule_program — were lost in
 * extraction.] */
std::vector<bool> depends_on;        /* indexed by temp id (see ctx.depends_on[op.tempId()] usages below) */
std::vector<bool> RAR_dependencies;  /* indexed by temp id; per the name, tracks read-after-read dependencies — see schedule_* usages */
RegisterDemand max_registers;        /* register-pressure bound the scheduler must stay under (checked via .exceeds()) */
int16_t last_SMEM_stall;             /* stall-avoidance counter for the previous SMEM load (compared against 0, incremented as moves shrink the gap) */
int last_SMEM_dep_idx;               /* instruction index of the previous SMEM load's first dependency (0 if none) */
50 /* This scheduler is a simple bottom-up pass based on ideas from
51 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
52 * from Xiaohua Shi and Peng Guo.
53 * The basic approach is to iterate over all instructions. When a memory instruction
54 * is encountered it tries to move independent instructions from above and below
55 * between the memory instruction and its first user.
56 * The novelty is that this scheduler cares for the current register pressure:
57 * Instructions will only be moved if the register pressure won't exceed a certain bound.
/* Moves list[idx] so that it ends up immediately before position `before`,
 * shifting every element in between by one slot. A no-op when idx == before.
 * Works on any container with random-access-capable iterators. */
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      /* element moves toward the back: rotate [idx, before) left by one */
      auto first = std::next(list.begin(), idx);
      auto last = std::next(list.begin(), before);
      std::rotate(first, std::next(first), last);
   } else if (idx > before) {
      /* element moves toward the front: rotate [before, idx] right by one */
      auto first = std::next(list.begin(), before);
      auto last = std::next(list.begin(), idx + 1);
      std::rotate(first, std::prev(last), last);
   }
}
/* Net register-demand change caused by `instr`:
 * adds registers written by live (non-killed) definitions and subtracts
 * registers freed by first-kill operands. */
static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
/* [gap: orig. l.74 (`{`) lost in extraction] */
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         /* [gap: orig. l.78 — presumably `continue;` — lost in extraction] */
      changes += def.getTemp();
   /* [gap: orig. l.80 (`}`) lost in extraction] */
   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         /* [gap: orig. l.84 — presumably `continue;` — lost in extraction] */
      changes -= op.getTemp();
   /* [gap: orig. ll.86-90 — loop close, presumably `return changes;`, and
    * function close — lost in extraction] */
/* Registers written by `instr` but killed immediately (isKill definitions):
 * they are live only during the instruction itself, so callers use this to
 * correct the demand at an insertion point. */
static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
/* [gap: orig. l.92 (`{`) lost in extraction] */
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         /* [gap: orig. l.96 — presumably `continue;` — lost in extraction] */
      temp_registers += def.getTemp();
   /* [gap: orig. l.98 (`}`) lost in extraction] */
   return temp_registers;
   /* [gap: orig. l.100 (`}`) lost in extraction] */
102 static bool is_spill_reload(aco_ptr
<Instruction
>& instr
)
104 return instr
->opcode
== aco_opcode::p_spill
|| instr
->opcode
== aco_opcode::p_reload
;
/* Whether `instr` may legally be moved across `current` while scheduling,
 * given the barrier interactions (`moving_interaction`) accumulated for the
 * instructions already being moved. */
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
/* [gap: orig. l.108 (`{`) lost in extraction] */
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      /* [gap: orig. l.111 — presumably `return false;` — lost in extraction] */

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      /* [gap: orig. l.115 — presumably `return false;` — lost in extraction] */

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         bool can_reorder = false;
         switch (current->format) {
         /* [gap: orig. l.125 — a case label (SMEM) — lost in extraction] */
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
         /* [gap: orig. ll.127-128 — presumably `break;` + case label (MUBUF)] */
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
         /* [gap: orig. ll.130-131 — presumably `break;` + case label (MIMG)] */
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
         /* [gap: orig. ll.133-136 — presumably `break;`, default case and
          * switch close — lost in extraction] */
         return can_reorder && moving_interaction == barrier_none;
      /* [gap: orig. ll.138-142 — presumably close of the s_barrier branch and
       * an early `return true;` for other non-barrier instructions] */

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. this is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
    * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   /* [gap: orig. ll.163-166 — presumably default case, switch close and
    * function close — lost in extraction] */
/* Whether `candidate` may be reordered relative to other memory instructions.
 * `allow_smem` additionally accepts SMEM instructions regardless of their own
 * can_reorder flag. */
bool can_reorder(Instruction* candidate, bool allow_smem)
/* [gap: orig. l.169 (`{`) lost in extraction] */
   switch (candidate->format) {
   /* [gap: orig. l.171 — a case label (SMEM) — lost in extraction] */
      return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder;
   /* [gap: orig. l.173 — a case label (MUBUF) — lost in extraction] */
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   /* [gap: orig. l.175 — a case label (MIMG) — lost in extraction] */
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   /* [gap: orig. l.177 — a case label (MTBUF) — lost in extraction] */
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   /* [gap: orig. ll.179-180 — presumably further case labels (FLAT/GLOBAL)
    * that fall through to SCRATCH below] */
   case Format::SCRATCH:
   /* [gap: orig. ll.182-186 — presumably the return for these cases, a
    * default case, switch close and function close — lost in extraction] */
/* Latency-hiding pass for one SMEM load `current` at `block` index `idx`:
 * first moves independent earlier instructions down below the load, then
 * moves independent later instructions up above the load's first dependency,
 * so more work sits between the load and its first user. Every move is
 * checked against ctx.max_registers; `register_demand` (per-instruction) is
 * kept consistent as elements move. */
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
/* [gap: orig. ll.191-192 (`{`) lost in extraction] */
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   /* [gap: orig. l.195 — presumably the declaration of the move counter `k`
    * used in both loop conditions and at the end — lost in extraction] */
   bool can_reorder_cur = can_reorder(current, false);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      /* [gap: orig. l.200 — presumably `return;` — lost in extraction] */

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      /* [gap: orig. l.205 — presumably `if (op.isTemp())` — lost in extraction] */
      ctx.depends_on[op.tempId()] = true;
   /* [gap: orig. ll.207-208 — loop close — lost in extraction] */

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         /* [gap: orig. l.224 — presumably `break;` — lost in extraction] */

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         /* [gap: orig. l.228 — presumably `break;` — lost in extraction] */
      if (candidate->opcode == aco_opcode::p_logical_start)
         /* [gap: orig. l.230 — presumably `break;` — lost in extraction] */
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         /* [gap: orig. l.232 — presumably `break;` — lost in extraction] */
      if (!can_move_instr(candidate, current, moving_interaction))
         /* [gap: orig. l.234 — presumably `break;` — lost in extraction] */
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            /* [gap: orig. ll.244-248 — presumably sets writes_exec, closes the
             * loop and breaks out when writes_exec is set — lost in extraction] */
      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            /* [gap: orig. l.257 — presumably `if (op.isTemp())` — lost in extraction] */
            ctx.depends_on[op.tempId()] = true;
         /* [gap: orig. ll.259-262 — presumably loop/branch close and
          * `continue;` — lost in extraction] */

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         /* [gap: orig. ll.269-270 — presumably branch/loop close — lost in extraction] */
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            /* [gap: orig. l.273 — presumably `if (op.isTemp())` — lost in extraction] */
            ctx.depends_on[op.tempId()] = true;
         /* [gap: orig. ll.275-278 — presumably loop/branch close and
          * `continue;` — lost in extraction] */

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         /* [gap: orig. l.283 — presumably `break;` — lost in extraction] */
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         /* [gap: orig. l.287 — presumably `break;` — lost in extraction] */
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      /* [gap: orig. l.297 — loop close — lost in extraction] */
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      /* [gap: orig. ll.303-306 — presumably `insert_idx--; k++;` and the
       * down-loop close — lost in extraction] */

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      /* [gap: orig. l.311 — presumably `if (def.isTemp())` — lost in extraction] */
      ctx.depends_on[def.tempId()] = true;
   /* [gap: orig. ll.313-314 — loop close — lost in extraction] */

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         /* [gap: orig. l.327 — presumably `break;` — lost in extraction] */
      if (!can_move_instr(candidate, current, moving_interaction))
         /* [gap: orig. l.329 — presumably `break;` — lost in extraction] */

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec;});
      /* [gap: orig. ll.333-335 — presumably `break` when writes_exec — lost in extraction] */

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()];});
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      /* [gap: orig. l.345 — presumably `if (is_dependency) {` — lost in extraction] */
      for (const Definition& def : candidate->definitions) {
         /* [gap: orig. l.347 — presumably `if (def.isTemp())` — lost in extraction] */
         ctx.depends_on[def.tempId()] = true;
      /* [gap: orig. l.349 — loop close — lost in extraction] */
      for (const Operand& op : candidate->operands) {
         /* [gap: orig. l.351 — presumably `if (op.isTemp())` — lost in extraction] */
         ctx.RAR_dependencies[op.tempId()] = true;
      /* [gap: orig. l.353 — loop close — lost in extraction] */
      if (!found_dependency) {
         insert_idx = candidate_idx;
         found_dependency = true;
         /* init register pressure */
         register_pressure = register_demand[insert_idx - 1];
      /* [gap: orig. ll.359-361 — presumably branch close and `continue;` — lost in extraction] */
      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         /* [gap: orig. l.363 — presumably `break;` — lost in extraction] */

      if (!found_dependency) {
         /* [gap: orig. ll.366-369 — presumably `continue;` and branch close — lost in extraction] */

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);
      /* [gap: orig. ll.372-374 lost in extraction] */
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      /* [gap: orig. l.383 — loop close — lost in extraction] */
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            /* [gap: orig. l.386 — presumably `if (def.isTemp())` — lost in extraction] */
            ctx.RAR_dependencies[def.tempId()] = true;
         /* [gap: orig. l.388 — loop close — lost in extraction] */
         for (const Operand& op : candidate->operands) {
            /* [gap: orig. l.390 — presumably `if (op.isTemp())` — lost in extraction] */
            ctx.RAR_dependencies[op.tempId()] = true;
         /* [gap: orig. ll.392-395 — presumably loop/branch close and
          * `continue;` — lost in extraction] */

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         /* [gap: orig. l.400 — presumably `break;` — lost in extraction] */
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         /* [gap: orig. l.404 — presumably `break;` — lost in extraction] */

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      /* [gap: orig. l.413 — loop close — lost in extraction] */
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      /* [gap: orig. ll.416-419 — presumably `insert_idx++; k++;` and the
       * up-loop close — lost in extraction] */
   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
   /* [gap: orig. ll.422-423 — function close — lost in extraction] */
/* Latency-hiding pass for one VMEM instruction `current` at `block` index
 * `idx`. Mirrors schedule_SMEM but with VMEM-specific rules: other VMEM
 * instructions are never moved down past `current`, and up-moves treat other
 * VMEM instructions as soft dependencies (their definitions are added to
 * depends_on) rather than hard stops. All moves respect ctx.max_registers
 * and keep `register_demand` consistent. */
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
/* [gap: orig. ll.427-428 (`{`) lost in extraction] */
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   /* [gap: orig. l.431 — presumably the declaration of the move counter `k`
    * used in both loop conditions — lost in extraction] */
   bool can_reorder_cur = can_reorder(current, false);

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      /* [gap: orig. l.438 — presumably `if (op.isTemp()) {` — lost in extraction] */
      ctx.depends_on[op.tempId()] = true;
      if (op.isFirstKill())
         ctx.RAR_dependencies[op.tempId()] = true;
   /* [gap: orig. ll.442-444 — brace closes — lost in extraction] */

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), true) && !can_reorder_cur)
         /* [gap: orig. l.459 — presumably `break;` — lost in extraction] */
      if (candidate->opcode == aco_opcode::p_logical_start)
         /* [gap: orig. l.461 — presumably `break;` — lost in extraction] */
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         /* [gap: orig. l.463 — presumably `break;` — lost in extraction] */
      if (!can_move_instr(candidate, current, moving_interaction))
         /* [gap: orig. l.465 — presumably `break;` — lost in extraction] */

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         /* [gap: orig. l.470 — presumably `break;` — lost in extraction] */
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !candidate->isVMEM();
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            /* [gap: orig. ll.480-484 — presumably sets writes_exec, closes the
             * loop and breaks when writes_exec is set — lost in extraction] */
      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            /* [gap: orig. l.493 — presumably `if (op.isTemp()) {` — lost in extraction] */
            ctx.depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               ctx.RAR_dependencies[op.tempId()] = true;
         /* [gap: orig. ll.497-501 — presumably brace closes and `continue;` — lost in extraction] */

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         /* [gap: orig. ll.508-509 — presumably branch/loop close — lost in extraction] */
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            /* [gap: orig. l.512 — presumably `if (op.isTemp()) {` — lost in extraction] */
            ctx.depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               ctx.RAR_dependencies[op.tempId()] = true;
         /* [gap: orig. ll.516-520 — presumably brace closes and `continue;` — lost in extraction] */

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);; /* NOTE(review): stray extra ';' in original */
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         /* [gap: orig. l.525 — presumably `break;` — lost in extraction] */
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         /* [gap: orig. l.529 — presumably `break;` — lost in extraction] */
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      /* [gap: orig. l.539 — loop close — lost in extraction] */
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      /* [gap: orig. ll.542-543 lost in extraction] */
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      /* [gap: orig. ll.546-547 — presumably `insert_idx--; k++;` and the
       * down-loop close — lost in extraction] */

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      /* [gap: orig. l.552 — presumably `if (def.isTemp())` — lost in extraction] */
      ctx.depends_on[def.tempId()] = true;
   /* [gap: orig. ll.554-555 — loop close — lost in extraction] */

   /* find the first instruction depending on current or find another VMEM */
   /* [gap: orig. l.557 — presumably `insert_idx = idx + 1;` — lost in extraction] */
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         /* [gap: orig. l.568 — presumably `break;` — lost in extraction] */
      if (!can_move_instr(candidate, current, moving_interaction))
         /* [gap: orig. l.570 — presumably `break;` — lost in extraction] */

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) {return def.isFixed() && def.physReg() == exec; });
      /* [gap: orig. ll.574-576 — presumably `break` when writes_exec — lost in extraction] */

      /* check if candidate depends on current */
      bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            /* [gap: orig. ll.582-584 — presumably `break;` and brace closes — lost in extraction] */
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      /* [gap: orig. l.591 — presumably `if (is_dependency) {` — lost in extraction] */
      for (const Definition& def : candidate->definitions) {
         /* [gap: orig. l.593 — presumably `if (def.isTemp())` — lost in extraction] */
         ctx.depends_on[def.tempId()] = true;
      /* [gap: orig. l.595 — loop close — lost in extraction] */
      for (const Operand& op : candidate->operands) {
         /* [gap: orig. l.597 — presumably `if (op.isTemp())` — lost in extraction] */
         ctx.RAR_dependencies[op.tempId()] = true;
      /* [gap: orig. l.599 — loop close — lost in extraction] */
      if (!found_dependency) {
         insert_idx = candidate_idx;
         found_dependency = true;
         /* init register pressure */
         register_pressure = register_demand[insert_idx - 1];
      /* [gap: orig. ll.605-606 — presumably `continue;` and branch close — lost in extraction] */
      } else if (candidate->isVMEM()) {
         for (const Definition& def : candidate->definitions) {
            /* [gap: orig. l.609 — presumably `if (def.isTemp())` — lost in extraction] */
            ctx.depends_on[def.tempId()] = true;
         /* [gap: orig. ll.611-613 — brace closes — lost in extraction] */

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         /* [gap: orig. l.618 — presumably `continue;` — lost in extraction] */
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      /* [gap: orig. l.626 — loop close — lost in extraction] */
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            /* [gap: orig. l.629 — presumably `if (def.isTemp())` — lost in extraction] */
            ctx.depends_on[def.tempId()] = true;
         /* [gap: orig. l.631 — loop close — lost in extraction] */
         for (const Operand& op : candidate->operands) {
            /* [gap: orig. l.633 — presumably `if (op.isTemp())` — lost in extraction] */
            ctx.RAR_dependencies[op.tempId()] = true;
         /* [gap: orig. ll.635-638 — presumably loop/branch close and
          * `continue;` — lost in extraction] */

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         /* [gap: orig. l.643 — presumably `break;` — lost in extraction] */
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         /* [gap: orig. l.647 — presumably `break;` — lost in extraction] */

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      /* [gap: orig. l.656 — loop close — lost in extraction] */
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      /* [gap: orig. ll.659-663 — presumably `insert_idx++; k++;`, loop close
       * and function close — lost in extraction] */
/* Hoisting pass for a position export `current` at `block` index `idx`:
 * moves independent earlier instructions down below the export so the export
 * itself effectively moves up. Only the "move down" direction exists here
 * (unlike the SMEM/VMEM passes); the scan stops at memory instructions,
 * logical_start and barriers, and every move respects ctx.max_registers. */
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
/* [gap: orig. ll.667-668 (`{`) lost in extraction] */
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   /* [gap: orig. ll.671-672 — presumably the declaration of the move counter
    * `k` used in the loop condition — lost in extraction] */
   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      /* [gap: orig. l.677 — presumably `if (op.isTemp()) {` — lost in extraction] */
      ctx.depends_on[op.tempId()] = true;
      if (op.isFirstKill())
         ctx.RAR_dependencies[op.tempId()] = true;
   /* [gap: orig. ll.681-683 — brace closes — lost in extraction] */

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         /* [gap: orig. l.698 — presumably `break;` — lost in extraction] */
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         /* [gap: orig. l.700 — presumably `break;` — lost in extraction] */
      if (candidate->isVMEM() || candidate->format == Format::SMEM)
         /* [gap: orig. l.702 — presumably `break;` — lost in extraction] */
      if (!can_move_instr(candidate, current, moving_interaction))
         /* [gap: orig. l.704 — presumably `break;` — lost in extraction] */

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            /* [gap: orig. ll.715-719 — presumably sets writes_exec, closes the
             * loop and breaks when writes_exec is set — lost in extraction] */
      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            /* [gap: orig. l.728 — presumably `if (op.isTemp()) {` — lost in extraction] */
            ctx.depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               ctx.RAR_dependencies[op.tempId()] = true;
         /* [gap: orig. ll.732-736 — presumably brace closes and `continue;` — lost in extraction] */

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         /* [gap: orig. ll.743-744 — presumably branch/loop close — lost in extraction] */
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            /* [gap: orig. l.747 — presumably `if (op.isTemp()) {` — lost in extraction] */
            ctx.depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               ctx.RAR_dependencies[op.tempId()] = true;
         /* [gap: orig. ll.751-755 — presumably brace closes and `continue;` — lost in extraction] */

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);; /* NOTE(review): stray extra ';' in original */
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         /* [gap: orig. l.760 — presumably `break;` — lost in extraction] */
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         /* [gap: orig. l.764 — presumably `break;` — lost in extraction] */
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      /* [gap: orig. l.774 — loop close — lost in extraction] */
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      /* [gap: orig. ll.777-781 — presumably `insert_idx--; k++;`, loop close
       * and function close — lost in extraction] */
/* Runs the scheduling passes over one block: VMEM/SMEM latency hiding for
 * every defining memory instruction, position-export hoisting in the last
 * block of a hardware VS, then recomputes the block's register-demand
 * summary from the per-instruction demands. */
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
/* [gap: orig. l.783 (`{`) lost in extraction] */
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN; /* sentinel: no previous SMEM load to stall */

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         /* [gap: orig. l.792 — presumably `continue;` — lost in extraction] */

      if (current->isVMEM())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   /* [gap: orig. ll.798-799 — loop close — lost in extraction] */

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         /* [gap: orig. ll.810-813 — brace closes — lost in extraction] */

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   /* [gap: orig. ll.818-819 — loop close and function close — lost in extraction] */
822 void schedule_program(Program
*program
, live
& live_vars
)
825 ctx
.depends_on
.resize(program
->peekAllocationId());
826 ctx
.RAR_dependencies
.resize(program
->peekAllocationId());
827 /* Allowing the scheduler to reduce the number of waves to as low as 5
828 * improves performance of Thrones of Britannia significantly and doesn't
829 * seem to hurt anything else. */
830 if (program
->num_waves
<= 5)
831 ctx
.num_waves
= program
->num_waves
;
832 else if (program
->max_reg_demand
.vgpr
>= 32)
834 else if (program
->max_reg_demand
.vgpr
>= 28)
836 else if (program
->max_reg_demand
.vgpr
>= 24)
841 assert(ctx
.num_waves
> 0 && ctx
.num_waves
<= program
->num_waves
);
842 ctx
.max_registers
= { int16_t(((256 / ctx
.num_waves
) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program
, ctx
.num_waves
))};
844 for (Block
& block
: program
->blocks
)
845 schedule_block(ctx
, program
, &block
, live_vars
);
847 /* update max_reg_demand and num_waves */
848 RegisterDemand new_demand
;
849 for (Block
& block
: program
->blocks
) {
850 new_demand
.update(block
.register_demand
);
852 update_vgpr_sgpr_demand(program
, new_demand
);
854 /* if enabled, this code asserts that register_demand is updated correctly */
856 int prev_num_waves
= program
->num_waves
;
857 const RegisterDemand prev_max_demand
= program
->max_reg_demand
;
859 std::vector
<RegisterDemand
> demands(program
->blocks
.size());
860 for (unsigned j
= 0; j
< program
->blocks
.size(); j
++) {
861 demands
[j
] = program
->blocks
[j
].register_demand
;
864 struct radv_nir_compiler_options options
;
865 options
.chip_class
= program
->chip_class
;
866 live live_vars2
= aco::live_var_analysis(program
, &options
);
868 for (unsigned j
= 0; j
< program
->blocks
.size(); j
++) {
869 Block
&b
= program
->blocks
[j
];
870 for (unsigned i
= 0; i
< b
.instructions
.size(); i
++)
871 assert(live_vars
.register_demand
[b
.index
][i
] == live_vars2
.register_demand
[b
.index
][i
]);
872 assert(b
.register_demand
== demands
[j
]);
875 assert(program
->max_reg_demand
== prev_max_demand
);
876 assert(program
->num_waves
== prev_num_waves
);