aco: fix accidental reordering of instructions when scheduling
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make clause creation less aggressive at lower num_waves */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
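
/* Illustrative numbers (hypothetical, assuming num_waves = 8): the SMEM
 * window then spans 350 - 8 * 35 = 70 instructions with at most
 * 64 - 8 * 4 = 32 moves, and the VMEM window spans 1024 - 8 * 64 = 512
 * instructions with at most 128 - 8 * 8 = 64 moves. */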

namespace aco {

struct sched_ctx {
   std::vector<bool> depends_on;
   std::vector<bool> RAR_dependencies;
   /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the
    * instructions in the clause, since new instructions in the clause are not
    * moved past any other instructions in the clause. */
   std::vector<bool> new_RAR_dependencies;

   RegisterDemand max_registers;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};
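/* The dependency bit-vectors above are indexed by temporary id;
 * schedule_program() sizes them to program->peekAllocationId(). */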

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the scheduler tries to move independent instructions from above
 * and below between the memory instruction and its first user.
 * The novelty is that this scheduler cares about the current register pressure:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */

template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
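
/* A minimal illustration (hypothetical values): with a list {a, b, c, d},
 * move_element(list, 0, 3) rotates 'a' down to yield {b, c, a, d}, and
 * move_element(list, 3, 1) rotates 'd' up to yield {a, d, b, c}. */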

static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         continue;
      temp_registers += def.getTemp();
   }
   return temp_registers;
}
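
/* Worked example (hypothetical temps): for "v_add_f32 %t2, %t0, %t1" where
 * %t0 and %t1 are both first-killed 1-VGPR temps, getLiveChanges() reports
 * a net change of 1 - 2 = -1 VGPR, while getTempRegisters() only counts
 * definitions that are immediately dead again (e.g. an unused carry-out),
 * which occupy a register at just this one point. */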

static bool is_spill_reload(aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
}

bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         return true;
      }
   }

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. This is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid,
    * so the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so
    * that all buffer operations before it must stay before it, and that both image and
    * buffer operations after it must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both
    * buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}
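
/* Illustrative example: a p_memory_barrier_shared candidate may only move
 * across `current` if neither `current` nor any instruction already being
 * moved with it touches shared memory, i.e. barrier_shared is unset in the
 * combined interaction mask. */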

bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return false;
   default:
      return true;
   }
}

void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder_candidate && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      if (candidate->isVMEM())
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;
   can_reorder_cur = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()]; });
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder_candidate && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency) {
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         if (candidate->isVMEM())
            break;
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
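
/* Illustrative numbers for the stall heuristic above (hypothetical, assuming
 * num_waves = 8 and k = 0 moved instructions): last_SMEM_stall becomes
 * 10 - 8 - 0 = 2. Since that is >= 0, the downward loops will not move
 * candidates past this load's first user, on the assumption that fewer waves
 * leave less latency-hiding headroom for an SMEM stall. */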

void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;
   /* initially true as we don't pull other VMEM instructions
    * through the current instruction */
   bool can_reorder_vmem = true;
   bool can_reorder_smem = true;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   std::fill(ctx.new_RAR_dependencies.begin(), ctx.new_RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure_indep = register_demand[idx];
   RegisterDemand register_pressure_clause = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int indep_insert_idx = idx + 1;
   int clause_insert_idx = idx;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure_indep.update(register_demand[candidate_idx]);

      bool part_of_clause = false;
      if (candidate->isVMEM()) {
         bool same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
         bool can_reorder = can_reorder_vmem || can_reorder_candidate;
         int grab_dist = clause_insert_idx - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist;
      }
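
      /* Illustrative clause example (hypothetical, assuming num_waves = 8):
       * clause_max_grab_dist is (8 - 1) * 8 = 56, so a VMEM candidate that
       * uses the same resource descriptor as `current` can be pulled at most
       * 56 instructions down to join the clause. */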

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !candidate->isVMEM() || part_of_clause;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
         continue;
      }

      if (part_of_clause) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
      }

      bool register_pressure_unknown = false;
      std::vector<bool>& RAR_deps = part_of_clause ? ctx.new_RAR_dependencies : ctx.RAR_dependencies;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && RAR_deps[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
         continue;
      }

      int insert_idx = part_of_clause ? clause_insert_idx : indep_insert_idx;
      RegisterDemand register_pressure = part_of_clause ? register_pressure_clause : register_pressure_indep;

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure_clause -= candidate_diff;
      clause_insert_idx--;
      if (!part_of_clause) {
         register_pressure_indep -= candidate_diff;
         indep_insert_idx--;
      }
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   RegisterDemand register_pressure;
   int insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;
   // TODO: differentiate between loads and stores (load-load can always reorder)
   can_reorder_vmem = true;
   can_reorder_smem = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (candidate->format == Format::SMEM)
         is_dependency = !can_reorder_smem && !can_reorder_candidate;
      if (candidate->isVMEM())
         is_dependency = !can_reorder_vmem && !can_reorder_candidate;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         /* update flag whether we can reorder other memory instructions */
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;

         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }

      } else if (candidate->isVMEM()) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}

void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   ctx.new_RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves)) };
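   /* Worked example (illustrative): with num_waves = 8 the VGPR bound is
    * ((256 / 8) & ~3) - 2 = 30, i.e. two registers below the aligned
    * per-wave budget, leaving headroom for the scheduler's moves; the SGPR
    * bound comes from get_addr_sgpr_from_waves(). */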

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}

}