aco: only skip RAR dependencies if the variable is killed somewhere
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_ir.h"
26 #include <unordered_set>
27 #include <algorithm>
28
29 #include "vulkan/radv_shader.h" // for radv_nir_compiler_options
30 #include "amdgfxregs.h"
31
/* Scheduling window sizes and move limits scale inversely with occupancy
 * (ctx.num_waves): the more waves are resident, the more latency hiding the
 * hardware already provides, so the scheduler does less work. */
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
#define POS_EXP_MAX_MOVES 512
38
39 namespace aco {
40
struct sched_ctx {
   /* indexed by temp id: temps the currently scheduled instruction depends on */
   std::vector<bool> depends_on;
   /* indexed by temp id: temps read by instructions that depend on the
    * currently scheduled instruction (read-after-read dependencies) */
   std::vector<bool> RAR_dependencies;
   /* register demand the scheduler must not exceed while moving instructions */
   RegisterDemand max_registers;
   /* wave count the scheduler optimizes for (may be lower than the program's) */
   int16_t num_waves;
   /* heuristic stall counter for the last SMEM load; moves that would stall
    * it are rejected while this is >= 0 */
   int16_t last_SMEM_stall;
   /* index of the first user of the last scheduled SMEM load */
   int last_SMEM_dep_idx;
};
49
50 /* This scheduler is a simple bottom-up pass based on ideas from
51 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
52 * from Xiaohua Shi and Peng Guo.
53 * The basic approach is to iterate over all instructions. When a memory instruction
54 * is encountered it tries to move independent instructions from above and below
55 * between the memory instruction and its first user.
56 * The novelty is that this scheduler cares for the current register pressure:
57 * Instructions will only be moved if the register pressure won't exceed a certain bound.
58 */
59
/* Moves the element at position idx so it ends up directly before the slot
 * that was previously at position `before`; everything in between shifts by
 * one to fill the gap. A no-op when idx == before. */
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx == before)
      return;

   if (idx < before) {
      /* shift the element towards the back */
      auto first = std::next(list.begin(), idx);
      auto last = std::next(list.begin(), before);
      std::rotate(first, std::next(first), last);
   } else {
      /* shift the element towards the front */
      auto first = std::next(list.begin(), before);
      auto last = std::next(list.begin(), idx + 1);
      std::rotate(first, std::prev(last), last);
   }
}
72
73 static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
74 {
75 RegisterDemand changes;
76 for (const Definition& def : instr->definitions) {
77 if (!def.isTemp() || def.isKill())
78 continue;
79 changes += def.getTemp();
80 }
81
82 for (const Operand& op : instr->operands) {
83 if (!op.isTemp() || !op.isFirstKill())
84 continue;
85 changes -= op.getTemp();
86 }
87
88 return changes;
89 }
90
91 static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
92 {
93 RegisterDemand temp_registers;
94 for (const Definition& def : instr->definitions) {
95 if (!def.isTemp() || !def.isKill())
96 continue;
97 temp_registers += def.getTemp();
98 }
99 return temp_registers;
100 }
101
102 static bool is_spill_reload(aco_ptr<Instruction>& instr)
103 {
104 return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
105 }
106
/* Checks whether `instr` may be crossed while moving `current`.
 * `moving_interaction` accumulates the barrier interaction of everything
 * already being moved together with `current`. */
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         /* an s_barrier may only be crossed by memory instructions explicitly
          * marked reorderable, and only if nothing moved along so far
          * interacts with a barrier */
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         /* anything that is neither a pseudo-barrier nor s_barrier is fine */
         return true;
      }
   }

   /* instr is a pseudo memory barrier: check it against the combined barrier
    * interaction of `current` plus everything moved with it */
   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. this is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
    * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}
167
168 bool can_reorder(Instruction* candidate, bool allow_smem)
169 {
170 switch (candidate->format) {
171 case Format::SMEM:
172 return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder;
173 case Format::MUBUF:
174 return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
175 case Format::MIMG:
176 return static_cast<MIMG_instruction*>(candidate)->can_reorder;
177 case Format::MTBUF:
178 return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
179 case Format::FLAT:
180 case Format::GLOBAL:
181 case Format::SCRATCH:
182 return false;
183 default:
184 return true;
185 }
186 }
187
/* Schedules around one SMEM load `current` at position idx:
 * 1) moves independent instructions from above `current` down below it,
 * 2) moves independent instructions from below `current` up above its first
 *    user, so the load's latency is hidden by the instructions in between.
 * register_demand is kept up to date for every move; moves that would exceed
 * ctx.max_registers are rejected. */
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current, false);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      /* moving across an exec write would change which lanes execute current */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: its operands become additional dependencies */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec;});
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()];});
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         /* candidate can't move up: everything it defines (and reads, for
          * RAR tracking) constrains further candidates */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            /* the first dependency marks the insertion point for moves */
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency)
         continue;
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

   /* remember where the first user of this load ended up and a stall
    * heuristic so later scheduling doesn't undo the latency hiding */
   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
423
/* Schedules around one VMEM load `current` at position idx, analogous to
 * schedule_SMEM: first moves independent instructions from above `current`
 * down, then moves independent instructions from below up above the first
 * user, while keeping register_demand accurate and within ctx.max_registers. */
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current, false);

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         /* only operands that die here matter for RAR pressure tracking */
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), true) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !candidate->isVMEM();
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      /* moving across an exec write would change which lanes execute current */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: its operands become additional dependencies */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);;
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) {return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         /* candidate can't move up: record its definitions and reads as
          * constraints for later candidates */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            /* the first dependency marks the insertion point for moves */
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }
      } else if (candidate->isVMEM()) {
         /* don't move another VMEM load above current's uses; treat its
          * results as dependencies */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}
663
/* Tries to move instructions from above a position export down below it, so
 * the export executes as early as possible. Only the downward direction is
 * scheduled here; register_demand is kept accurate and within
 * ctx.max_registers for every move. */
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         /* only operands that die here matter for RAR pressure tracking */
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      /* don't move the export across memory operations */
      if (candidate->isVMEM() || candidate->format == Format::SMEM)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      /* moving across an exec write would change which lanes execute current */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: its operands become additional dependencies */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);;
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}
781
782 void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
783 {
784 ctx.last_SMEM_dep_idx = 0;
785 ctx.last_SMEM_stall = INT16_MIN;
786
787 /* go through all instructions and find memory loads */
788 for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
789 Instruction* current = block->instructions[idx].get();
790
791 if (current->definitions.empty())
792 continue;
793
794 if (current->isVMEM())
795 schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
796 if (current->format == Format::SMEM)
797 schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
798 }
799
800 if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
801 /* Try to move position exports as far up as possible, to reduce register
802 * usage and because ISA reference guides say so. */
803 for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
804 Instruction* current = block->instructions[idx].get();
805
806 if (current->format == Format::EXP) {
807 unsigned target = static_cast<Export_instruction*>(current)->dest;
808 if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
809 schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
810 }
811 }
812 }
813
814 /* resummarize the block's register demand */
815 block->register_demand = RegisterDemand();
816 for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
817 block->register_demand.update(live_vars.register_demand[block->index][idx]);
818 }
819 }
820
821
/* Entry point: schedules every block of the program and updates the
 * program's register demand / wave count afterwards. */
void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   /* one flag per SSA temp id */
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   /* register budget for the chosen wave count; the VGPR budget is rounded
    * down to the allocation granularity with a small safety margin */
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}
879
880 }