aco: increase accuracy of SGPR limits
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_ir.h"
26 #include <unordered_set>
27 #include <algorithm>
28
29 #include "vulkan/radv_shader.h" // for radv_nir_compiler_options
30 #include "amdgfxregs.h"
31
/* Scheduling windows and move budgets shrink as the wave count grows:
 * with many waves in flight the hardware already hides latency, so the
 * scheduler does less work (and risks less register pressure). */
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (80 - ctx.num_waves * 8)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 4)
#define POS_EXP_MAX_MOVES 512
38
39 namespace aco {
40
struct sched_ctx {
   /* bitset indexed by temp id: values the instruction being scheduled
    * (transitively) depends on; moving across a writer of these is illegal */
   std::vector<bool> depends_on;
   /* bitset indexed by temp id: operands read by instructions that depend on
    * the one being scheduled — moving a killing read of these would make the
    * register-pressure bookkeeping incorrect */
   std::vector<bool> RAR_dependencies;
   /* register demand the scheduler must never exceed */
   RegisterDemand max_registers;
   /* wave count the schedule is optimized for (may be lower than the
    * program's current wave count) */
   int16_t num_waves;
   /* heuristic stall estimate for the last SMEM load; negative means the
    * latency is considered hidden */
   int16_t last_SMEM_stall;
   /* index of the first user of the last SMEM load (0 if none found) */
   int last_SMEM_dep_idx;
};
49
/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * from Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered it tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler cares for the current register pressure:
 * Instructions will only be moved if the register pressure won't exceed a certain bound.
 */
59
/* Moves the element at position idx so that it ends up directly at position
 * `before` (moving up) or at `before - 1` (moving down), shifting the
 * elements in between by one slot. A no-op when idx == before. */
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx == before)
      return;

   if (idx < before) {
      /* shift [idx+1, before) down by one, element lands at before-1 */
      auto first = std::next(list.begin(), idx);
      auto last = std::next(list.begin(), before);
      std::rotate(first, std::next(first), last);
   } else {
      /* shift [before, idx) up by one, element lands at before */
      auto first = std::next(list.begin(), before);
      auto last = std::next(list.begin(), idx + 1);
      std::rotate(first, std::prev(last), last);
   }
}
72
73 static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
74 {
75 RegisterDemand changes;
76 for (const Definition& def : instr->definitions) {
77 if (!def.isTemp() || def.isKill())
78 continue;
79 changes += def.getTemp();
80 }
81
82 for (const Operand& op : instr->operands) {
83 if (!op.isTemp() || !op.isFirstKill())
84 continue;
85 changes -= op.getTemp();
86 }
87
88 return changes;
89 }
90
91 static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
92 {
93 RegisterDemand temp_registers;
94 for (const Definition& def : instr->definitions) {
95 if (!def.isTemp() || !def.isKill())
96 continue;
97 temp_registers += def.getTemp();
98 }
99 return temp_registers;
100 }
101
102 static bool is_spill_reload(aco_ptr<Instruction>& instr)
103 {
104 return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
105 }
106
/* Checks whether `instr` may be moved past `current` (the memory instruction
 * being scheduled), given the accumulated barrier interactions
 * (moving_interaction) of everything moved so far. */
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         /* s_barrier may only be crossed when `current` is a reorderable
          * memory access and nothing moved so far interacts with barriers */
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         /* non-barrier instructions pose no barrier constraint here */
         return true;
      }
   }

   /* memory-barrier pseudo instructions: movable only if neither `current`
    * nor the already-moved instructions interact with the barred class */
   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. this is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
    * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}
167
168 bool can_reorder(Instruction* candidate, bool allow_smem)
169 {
170 switch (candidate->format) {
171 case Format::SMEM:
172 return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder;
173 case Format::MUBUF:
174 return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
175 case Format::MIMG:
176 return static_cast<MIMG_instruction*>(candidate)->can_reorder;
177 case Format::MTBUF:
178 return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
179 case Format::FLAT:
180 case Format::GLOBAL:
181 case Format::SCRATCH:
182 return false;
183 default:
184 return true;
185 }
186 }
187
/* Tries to hide the latency of the SMEM instruction `current` (at index idx
 * in block): first moves independent instructions from above `current` to
 * below it, then moves independent instructions from below up past it, until
 * the first user of its result. Every move is rejected if it would push the
 * register demand above ctx.max_registers. register_demand is kept in sync
 * with the reordered instruction list. */
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0; /* number of moves performed so far */
   bool can_reorder_cur = can_reorder(current, false);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      /* never move anything across an exec write */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: everything it reads becomes a dependency too */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec;});
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()];});
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         /* results of dependencies are dependencies themselves; their reads
          * become RAR dependencies for the pressure bookkeeping */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            /* the first dependency is the insertion point for moved-up code */
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency)
         continue;
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

   /* remember where the first user is and a heuristic stall estimate so the
    * next scheduled load does not undo the latency hiding achieved here */
   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
423
/* Tries to hide the latency of the VMEM instruction `current` (at index idx
 * in block), analogously to schedule_SMEM: move independent instructions
 * from above to below it, then move trailing independent instructions up
 * past it, keeping register_demand consistent and below ctx.max_registers. */
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int16_t k = 0; /* number of moves performed so far */
   bool can_reorder_cur = can_reorder(current, false);

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering another VMEM instruction, logical_start or barriers
       * (allow_smem is true: SMEM may be reordered relative to VMEM) */
      if (!can_reorder(candidate.get(), true) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      /* never move anything across an exec write */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: everything it reads becomes a dependency too */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);;
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) {return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         /* results of dependencies are dependencies themselves; their reads
          * become RAR dependencies for the pressure bookkeeping */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            /* the first dependency is the insertion point for moved-up code */
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}
648
/* Moves independent instructions from above the position export `current`
 * (at index idx in block) to below it, so the export executes as early as
 * possible. Only a downward pass is needed; register demand is kept in sync
 * and bounded by ctx.max_registers. */
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0; /* number of moves performed so far */

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (unsigned i = 0; i < current->operands.size(); i++) {
      if (current->operands[i].isTemp())
         ctx.depends_on[current->operands[i].tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      /* don't move memory accesses past the export */
      if (candidate->isVMEM() || candidate->format == Format::SMEM)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      /* never move anything across an exec write */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: everything it reads becomes a dependency too */
         for (unsigned i = 0; i < candidate->operands.size(); i++) {
            if (candidate->operands[i].isTemp())
               ctx.depends_on[candidate->operands[i].tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (unsigned i = 0; i < candidate->operands.size(); i++) {
         if (candidate->operands[i].isTemp() && ctx.depends_on[candidate->operands[i].tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (unsigned i = 0; i < candidate->operands.size(); i++) {
            if (candidate->operands[i].isTemp())
               ctx.depends_on[candidate->operands[i].tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);;
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}
756
/* Schedules a single block: latency-hides every VMEM/SMEM load, moves
 * position exports up in the final block of a HW vertex shader, then
 * recomputes the block's register demand from the per-instruction data. */
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   /* reset the per-block SMEM stall tracking */
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      /* only instructions that produce a result are worth scheduling */
      if (current->definitions.empty())
         continue;

      if (current->isVMEM())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}
795
796
/* Entry point: schedules every block of the program and afterwards updates
 * the program's register demand and wave count. */
void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   /* the dependency bitsets are indexed by temp id */
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   //TODO: maybe use some sort of heuristic instead
   //TODO: this also increases window-size/max-moves? did I realize that at the time?
   ctx.num_waves = std::min<uint16_t>(program->num_waves, 5);
   assert(ctx.num_waves);
   uint16_t total_sgpr_regs = program->physical_sgprs;
   uint16_t max_addressible_sgpr = program->sgpr_limit;
   /* Per-wave register budget: 256 VGPRs split between the waves, rounded
    * down to a multiple of 4, minus 2 spare; SGPRs likewise split and
    * rounded down via sgpr_alloc_granule, clamped to the addressible limit.
    * NOTE(review): the & ~sgpr_alloc_granule rounding assumes the field is a
    * granule-minus-one mask — verify against the Program definition. */
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~program->sgpr_alloc_granule) - 2, max_addressible_sgpr)};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}
848
849 }