aco: Initial commit of independent AMD compiler
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

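/* Heuristic window sizes and move budgets. They shrink as the number of waves
 * grows: with more waves resident, memory latency is increasingly hidden by
 * multithreading, so aggressive static scheduling pays off less. */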
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (80 - ctx.num_waves * 8)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 4)
#define POS_EXP_MAX_MOVES 512

namespace aco {

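/* Scratch state for one scheduling pass. depends_on and RAR_dependencies are
 * bitsets indexed by temp id; max_registers is the register-demand bound that
 * moved instructions must not push the block past; last_SMEM_dep_idx and
 * last_SMEM_stall track the most recent SMEM load so that later moves don't
 * reintroduce a stall on it. */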
struct sched_ctx {
   std::vector<bool> depends_on;
   std::vector<bool> RAR_dependencies;
   RegisterDemand max_registers;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the scheduler tries to move independent instructions from above
 * and below into the space between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */
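
/* Hypothetical sketch of the transformation (not taken from real output):
 *
 *    s_load_dwordx4 s[4:7], ...            ; current (SMEM load)
 *    v_add_f32 v2, v0, v1                  ; independent of the load
 *    s_buffer_load_dword s8, s[4:7], 0x0   ; first user of s[4:7]
 *
 * Instructions above the load which don't feed it can be moved below it, and
 * independent instructions from below can be moved up in front of the first
 * user, so that the load's latency overlaps with useful ALU work. */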

template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
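
/* e.g. move_element(v, 5, 2) moves v[5] directly in front of v[2], while
 * move_element(v, 2, 6) moves v[2] to just before the old v[6] (it ends up at
 * index 5). The relative order of all other elements is preserved. */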
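/* Computes by how much instr changes the register demand: definitions that
 * stay live afterwards increase it, operands that die here (first kill)
 * decrease it. */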
static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

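/* Returns the registers which are live only for the duration of instr itself:
 * definitions that are already dead again after this instruction. */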
static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         continue;
      temp_registers += def.getTemp();
   }
   return temp_registers;
}

static bool is_spill_reload(aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
}

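/* Returns whether instr can be moved past current without violating export
 * ordering or the memory-barrier semantics accumulated in moving_interaction. */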
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         return true;
      }
   }

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. This is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid,
    * so the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that
    * all buffer operations before it must stay before it, and both image and buffer operations
    * after it must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both
    * buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}

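/* Returns whether candidate may be reordered relative to other memory
 * instructions. FLAT/GLOBAL/SCRATCH accesses may alias anything, so they are
 * never reordered; buffer, image and (optionally) SMEM accesses carry an
 * explicit can_reorder flag. */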
bool can_reorder(Instruction* candidate, bool allow_smem)
{
   switch (candidate->format) {
   case Format::SMEM:
      return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return false;
   default:
      return true;
   }
}

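/* Tries to hide the latency of the SMEM load current (at index idx): first
 * independent instructions from above current are moved below it, then
 * independent instructions from below are moved up to current's first
 * dependency. */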
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current, false);

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()]; });
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency)
         continue;
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

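   /* Remember where the first user of this SMEM load ended up, and estimate
    * the remaining stall: fewer waves and fewer instructions moved in between
    * (k) make it more likely that a later move would stall on this load. */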
   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

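/* Same approach as schedule_SMEM, but for VMEM: the window and move budget
 * are larger since VMEM latency is higher, and a VMEM candidate that can't be
 * reordered with current is treated as a dependency instead of ending the
 * upward search. */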
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current, false);

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), true) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}

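/* Sinks independent instructions from above a position export to below it,
 * which effectively hoists the export. There is no upward pass: the goal is
 * only to issue position exports as early as possible. */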
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (unsigned i = 0; i < current->operands.size(); i++) {
      if (current->operands[i].isTemp())
         ctx.depends_on[current->operands[i].tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (unsigned i = 0; i < candidate->operands.size(); i++) {
            if (candidate->operands[i].isTemp())
               ctx.depends_on[candidate->operands[i].tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (unsigned i = 0; i < candidate->operands.size(); i++) {
         if (candidate->operands[i].isTemp() && ctx.depends_on[candidate->operands[i].tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (unsigned i = 0; i < candidate->operands.size(); i++) {
            if (candidate->operands[i].isTemp())
               ctx.depends_on[candidate->operands[i].tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}

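/* Schedules a single block: runs the SMEM/VMEM passes on every memory load
 * and, for the last block of a hardware vertex shader, hoists position
 * exports. Afterwards the block's register-demand summary is recomputed. */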
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   //TODO: maybe use some sort of heuristic instead
   //TODO: this also increases window-size/max-moves? did I realize that at the time?
   ctx.num_waves = std::min<uint16_t>(program->num_waves, 5);
   assert(ctx.num_waves);
   uint16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512;
   uint16_t max_addressible_sgpr = program->sgpr_limit;
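   /* Split the register file evenly between the target number of waves and
    * keep some slack: VGPRs are allocated in granules of 4 and SGPRs in
    * granules of 8, with two registers reserved as scheduling headroom. */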
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~7) - 2, max_addressible_sgpr)};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}

}