aco: improve FLAT/GLOBAL scheduling
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
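/* For illustration (values follow from the formulas above, not separate tuning):
 * with num_waves = 8 the SMEM window is 350 - 8*35 = 70 instructions with at
 * most 64 - 8*4 = 32 moves, and the VMEM window is 1024 - 8*64 = 512
 * instructions with at most 128 - 8*8 = 64 moves. At num_waves = 5 the windows
 * widen to 175 and 704 with 44 and 88 moves, while the clause grab distance
 * shrinks from 56 to 32: clauses shorten def-use distances, so they are less
 * desirable when few waves are available to hide latency. */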

namespace aco {

struct sched_ctx {
   std::vector<bool> depends_on;
   std::vector<bool> RAR_dependencies;
   /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the
    * instructions in the clause, since new instructions in the clause are not
    * moved past any other instructions in the clause. */
   std::vector<bool> new_RAR_dependencies;

   RegisterDemand max_registers;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the pass tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */
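
/* A sketch of the intended effect, using hypothetical instructions:
 *
 *    before:                               after:
 *      s_load_dwordx4  s[0:3], ...           s_load_dwordx4  s[0:3], ...
 *      s_buffer_load_dword s4, s[0:3], 0     v_mul_f32 v0, v1, v2
 *      v_mul_f32 v0, v1, v2                  s_buffer_load_dword s4, s[0:3], 0
 *
 * The independent v_mul_f32 is moved between the load and its first user, where
 * it can hide some of the load's latency, but only as long as the resulting
 * register demand stays within sched_ctx::max_registers. */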

/* Moves the element at index idx so that it ends up immediately before the
 * element that was originally at index before. */
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
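
/* Example: for a vector {a, b, c, d, e},
 *    move_element(v, 4, 1) yields {a, e, b, c, d}   (e moved up before b)
 *    move_element(v, 1, 4) yields {a, c, d, b, e}   (b moved down before e)
 * Each direction is a single std::rotate, so a move costs O(distance). */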

/* Returns the net change in register demand caused by the instruction:
 * definitions which stay live minus operands which die here. */
static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

/* Returns the registers occupied only at the instruction itself:
 * definitions which are immediately dead. */
static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         continue;
      temp_registers += def.getTemp();
   }
   return temp_registers;
}
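
/* Example for the two helpers above: for a hypothetical "v0 = v_add_f32 v1, v2"
 * where v1 dies at this instruction and v2 stays live, getLiveChanges() returns
 * +1 VGPR (surviving def) - 1 VGPR (first-killed operand) = 0, i.e. the demand
 * is unchanged across the instruction. getTempRegisters() instead collects
 * definitions that are immediately dead: they occupy a register only at the
 * instruction itself, which is why e.g. the downward passes recompute the
 * demand at a moved instruction's new position as
 * register_demand[insert_idx - 1] minus the old temporaries plus its own. */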

static bool is_spill_reload(aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
}

bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         case Format::FLAT:
         case Format::GLOBAL:
         case Format::SCRATCH:
            can_reorder = static_cast<FLAT_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         return true;
      }
   }

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. This is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine whether the result of a buffer load is valid, so the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it must stay before it,
    * and both image and buffer operations after it must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}
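
/* Example of the interaction check above: once a DS (LDS) instruction is part
 * of the moving set, moving_interaction contains barrier_shared, so a
 * p_memory_barrier_shared candidate returns false here and the caller stops
 * scheduling rather than reorder shared-memory accesses across their barrier. */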

bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}

void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder_candidate && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      if (candidate->isVMEM())
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;
   can_reorder_cur = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()]; });
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder_candidate && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency) {
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         if (candidate->isVMEM())
            break;
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

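   /* Bookkeeping for the SMEM stall heuristic: the counter starts at
    * 10 - num_waves - k, so more waves in flight and more instructions moved
    * next to this load mean a more negative value, i.e. more latency-hiding
    * slack. Both schedulers increment it whenever they later pull an
    * instruction out of the region between this load and its first user, and
    * they stop once it reaches zero, as further moves would likely make the
    * load stall. */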
   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;
   /* initially true as we don't pull other VMEM instructions
    * through the current instruction */
   bool can_reorder_vmem = true;
   bool can_reorder_smem = true;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   std::fill(ctx.new_RAR_dependencies.begin(), ctx.new_RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure_indep = register_demand[idx];
   RegisterDemand register_pressure_clause = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int indep_insert_idx = idx + 1;
   int clause_insert_idx = idx;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering a non-reorderable SMEM instruction, logical_start or barriers */
      if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure_indep.update(register_demand[candidate_idx]);

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
         bool can_reorder = can_reorder_vmem || can_reorder_candidate;
         int grab_dist = clause_insert_idx - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist;
      }
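      /* Example: with num_waves = 8 the grab distance limit is (8-1)*8 = 56, so
       * a buffer load using the same resource descriptor (operand 1) as current
       * may be pulled down across up to 55 instructions to join the clause; at
       * num_waves = 2 the limit is only 8, since the shortened def-use
       * distances inside a clause hurt more at low wave counts. */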

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      if (part_of_clause) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
      }

      bool register_pressure_unknown = false;
      std::vector<bool>& RAR_deps = part_of_clause ? ctx.new_RAR_dependencies : ctx.RAR_dependencies;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && RAR_deps[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      int insert_idx = part_of_clause ? clause_insert_idx : indep_insert_idx;
      RegisterDemand register_pressure = part_of_clause ? register_pressure_clause : register_pressure_indep;

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure_clause -= candidate_diff;
      clause_insert_idx--;
      if (!part_of_clause) {
         register_pressure_indep -= candidate_diff;
         indep_insert_idx--;
      }
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   RegisterDemand register_pressure;
   int insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;
   // TODO: differentiate between loads and stores (load-load can always reorder)
   can_reorder_vmem = true;
   can_reorder_smem = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (candidate->format == Format::SMEM)
         is_dependency = !can_reorder_smem && !can_reorder_candidate;
      if (is_vmem)
         is_dependency = !can_reorder_vmem && !can_reorder_candidate;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         /* update flag whether we can reorder other memory instructions */
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;

         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }

      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}

void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   ctx.new_RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;
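   /* Example: a shader running 8 waves with max_reg_demand.vgpr of 30 falls
    * into the ">= 28" bucket, so the scheduler may raise pressure until only
    * 6 waves fit, while a shader already limited to 4 waves keeps its 4; the
    * scheduler never lowers the target below 5 waves on its own. */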

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};
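   /* Example of the VGPR bound above: ctx.num_waves = 8 gives
    * (256/8 & ~3) - 2 = 30 VGPRs and ctx.num_waves = 5 gives
    * (256/5 & ~3) - 2 = 46 VGPRs (51 rounded down to a multiple of 4, minus 2);
    * the SGPR bound comes from get_addr_sgpr_from_waves(). */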

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

}