/* mesa.git: src/amd/compiler/aco_scheduler.cpp */
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so be less aggressive when num_waves is low */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
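
/* For example, at num_waves == 8 these evaluate to windows of 70 (SMEM) and
 * 512 (VMEM) instructions, at most 32/64 moves and a clause grab distance of
 * 56; fewer waves widen the windows and raise the move limits, but shrink
 * the clause grab distance. */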

namespace aco {

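/* Scratch state for the pass. The boolean vectors are indexed by temp id
 * and reset at the start of each schedule_* invocation. */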
struct sched_ctx {
   std::vector<bool> depends_on;
   std::vector<bool> RAR_dependencies;
   /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the
    * instructions in the clause, since new instructions in the clause are not
    * moved past any other instructions in the clause. */
   std::vector<bool> new_RAR_dependencies;

   RegisterDemand max_registers;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, it tries to move independent instructions from above and below
 * into the gap between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */

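/* Illustration on a hypothetical instruction stream (not taken from real
 * shader output): given
 *
 *    v0 = v_mul_f32 v1, v2
 *    s2 = s_load_dwordx2 ...        <-- current memory instruction
 *    s3 = s_add_u32 s2, 0           <-- first user of s2
 *
 * the independent v_mul_f32 can be moved below the load, so useful work
 * happens while the load is in flight instead of leaving the gap empty. */
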
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
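
/* A minimal sanity sketch of move_element() (an illustrative, hypothetical
 * helper, kept behind #if 0 like the verification code at the end of this
 * file): moving an element up or down rotates the elements in between. */
#if 0
static void test_move_element()
{
   std::vector<int> v = {0, 1, 2, 3, 4};
   move_element(v, 3, 1); /* the element at index 3 now sits at index 1 */
   assert((v == std::vector<int>{0, 3, 1, 2, 4}));
   move_element(v, 1, 4); /* moved back: it ends up just before index 4 */
   assert((v == std::vector<int>{0, 1, 2, 3, 4}));
}
#endif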
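/* Net change in register demand caused by this instruction: values it
 * defines which stay live afterwards, minus operands it kills. */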
static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

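/* Registers which are live only at this instruction itself: definitions
 * that are immediately dead (kill flag set). */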
static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         continue;
      temp_registers += def.getTemp();
   }
   return temp_registers;
}

static bool is_spill_reload(aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
}

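/* Whether a memory instruction may be reordered with respect to other memory
 * instructions; formats without a can_reorder flag default to reorderable. */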
bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}

bool is_gs_or_done_sendmsg(Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool is_done_sendmsg(Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

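/* Classifies which barrier kinds an instruction interacts with, as a bitmask
 * that can_move_instr() tests against the p_memory_barrier_* pseudo ops. */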
barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   case Format::SOPP:
      if (is_done_sendmsg(instr))
         return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
      else if (is_gs_or_done_sendmsg(instr))
         return barrier_gs_sendmsg;
      else
         return barrier_none;
   default:
      return barrier_none;
   }
}

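/* Returns true if the candidate may be moved past `current` and past the
 * already-moved instructions summarized in moving_interaction. */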
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         return can_reorder(current) && moving_interaction == barrier_none;
      } else if (is_gs_or_done_sendmsg(instr.get())) {
         int interaction = get_barrier_interaction(current);
         interaction |= moving_interaction;
         return !(interaction & get_barrier_interaction(instr.get()));
      } else {
         return true;
      }
   }

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. This is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it must stay before it,
    * and both image and buffer operations after it must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_common:
      return !(interaction & (barrier_image | barrier_buffer | barrier_shared | barrier_atomic));
   case aco_opcode::p_memory_barrier_gs_data:
      return !(interaction & barrier_gs_data);
   case aco_opcode::p_memory_barrier_gs_sendmsg:
      return !(interaction & barrier_gs_sendmsg);
   default:
      return false;
   }
}

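/* Tries to hide the latency of the SMEM load `current` in two passes: first
 * move independent instructions from above it to below it, then move
 * instructions from below it up to just before its first user, staying
 * within SMEM_WINDOW_SIZE and SMEM_MAX_MOVES moves in total. */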
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder_candidate && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      if (candidate->isVMEM())
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;
   can_reorder_cur = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()]; });
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder_candidate && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency) {
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         if (candidate->isVMEM())
            break;
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

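/* Same two-pass scheme as schedule_SMEM, but additionally tries to group
 * VMEM instructions accessing the same resource into clauses while moving
 * candidates down (see VMEM_CLAUSE_MAX_GRAB_DIST above). */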
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;
   /* initially true as we don't pull other VMEM instructions
    * through the current instruction */
   bool can_reorder_vmem = true;
   bool can_reorder_smem = true;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   std::fill(ctx.new_RAR_dependencies.begin(), ctx.new_RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure_indep = register_demand[idx];
   RegisterDemand register_pressure_clause = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int indep_insert_idx = idx + 1;
   int clause_insert_idx = idx;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure_indep.update(register_demand[candidate_idx]);

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
         bool can_reorder = can_reorder_vmem || can_reorder_candidate;
         int grab_dist = clause_insert_idx - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      if (part_of_clause) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
      }

      bool register_pressure_unknown = false;
      std::vector<bool>& RAR_deps = part_of_clause ? ctx.new_RAR_dependencies : ctx.RAR_dependencies;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && RAR_deps[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      int insert_idx = part_of_clause ? clause_insert_idx : indep_insert_idx;
      RegisterDemand register_pressure = part_of_clause ? register_pressure_clause : register_pressure_indep;

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure_clause -= candidate_diff;
      clause_insert_idx--;
      if (!part_of_clause) {
         register_pressure_indep -= candidate_diff;
         indep_insert_idx--;
      }
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   RegisterDemand register_pressure;
   int insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;
   // TODO: differentiate between loads and stores (load-load can always reorder)
   can_reorder_vmem = true;
   can_reorder_smem = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (candidate->format == Format::SMEM)
         is_dependency = !can_reorder_smem && !can_reorder_candidate;
      if (is_vmem)
         is_dependency = !can_reorder_vmem && !can_reorder_candidate;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         /* update flag whether we can reorder other memory instructions */
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;

         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }

      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}

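/* Single downwards pass: moves independent instructions from above the
 * position export to below it, which effectively moves the export up. */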
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   ctx.new_RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                         int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves)) };

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

}