/* mesa.git: src/amd/compiler/aco_scheduler.cpp (at commit cb22491bf64a1b5a2c58b553d591e26ec89c7cda) */
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
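
/* For a sense of scale (illustrative arithmetic only): with ctx.num_waves == 8 the
 * macros above give a 70-instruction SMEM window with at most 32 moves, a
 * 512-instruction VMEM window with at most 64 moves and a clause grab distance of
 * (8 - 1) * 8 = 56; with ctx.num_waves == 5 they give 175/44, 704/88 and 32.
 * The fewer waves we schedule for, the more aggressively instructions may be
 * reordered, but the less eagerly clauses are formed. */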

namespace aco {

enum MoveResult {
   move_success,
   move_fail_ssa,
   move_fail_rar,
   move_fail_pressure,
};

struct MoveState {
   RegisterDemand max_registers;

   Block *block;
   Instruction *current;
   RegisterDemand *register_demand;
   bool improved_rar;

   std::vector<bool> depends_on;
   /* Two are needed because, for downwards VMEM scheduling, one needs to
    * exclude the instructions in the clause, since new instructions in the
    * clause are not moved past any other instructions in the clause. */
   std::vector<bool> RAR_dependencies;
   std::vector<bool> RAR_dependencies_clause;

   int source_idx;
   int insert_idx, insert_idx_clause;
   RegisterDemand total_demand, total_demand_clause;

   /* for moving instructions before the current instruction to after it */
   void downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
   MoveResult downwards_move(bool clause);
   void downwards_skip();

   /* for moving instructions after the first use of the current instruction upwards */
   void upwards_init(int source_idx, bool improved_rar);
   bool upwards_check_deps();
   void upwards_set_insert_idx(int before);
   MoveResult upwards_move();
   void upwards_skip();

private:
   void downwards_advance_helper();
};

struct sched_ctx {
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
   MoveState mv;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the pass tries to move independent instructions from above and below
 * into the gap between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions are only moved if the register pressure stays within a certain bound.
 */

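/* Illustrative example of the transformation described above (pseudo-IR, not
 * actual ACO syntax): given
 *
 *    v1 = v_add ...            ; independent of the load
 *    v0 = buffer_load ...      ; memory instruction being scheduled
 *    v2 = v_fma v0, ...        ; first user of v0
 *    v3 = v_mul ...            ; independent of v0 and v2
 *
 * the pass tries to produce
 *
 *    v0 = buffer_load ...
 *    v1 = v_add ...
 *    v3 = v_mul ...
 *    v2 = v_fma v0, ...
 *
 * growing the load-to-use distance from one to three instructions, provided the
 * register demand at every intermediate point stays within MoveState::max_registers. */
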
template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(begin_it, idx);
      auto end = std::next(begin_it, before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(begin_it, before);
      auto end = std::next(begin_it, idx + 1);
      std::rotate(begin, end - 1, end);
   }
}

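/* A minimal usage sketch of move_element() (compiled out; it would additionally
 * need <vector> and <cassert>). Moving an element to a later position shifts the
 * elements in between one slot earlier, moving it to an earlier position shifts
 * them one slot later: */
#if 0
static void move_element_example()
{
   std::vector<char> v = {'A', 'B', 'C', 'D', 'E'};
   move_element(v.begin(), 1, 4); /* v becomes {A, C, D, B, E} */
   assert(v[3] == 'B');

   std::vector<char> w = {'A', 'B', 'C', 'D', 'E'};
   move_element(w.begin(), 3, 1); /* w becomes {A, D, B, C, E} */
   assert(w[1] == 'D');
}
#endif
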
void MoveState::downwards_advance_helper()
{
   source_idx--;
   total_demand.update(register_demand[source_idx]);
}

void MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
   improved_rar = improved_rar_;
   source_idx = current_idx;

   insert_idx = current_idx + 1;
   insert_idx_clause = current_idx;

   total_demand = total_demand_clause = register_demand[current_idx];

   std::fill(depends_on.begin(), depends_on.end(), false);
   if (improved_rar) {
      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
      if (may_form_clauses)
         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
   }

   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill())
            RAR_dependencies[op.tempId()] = true;
      }
   }

   /* update total_demand/source_idx */
   downwards_advance_helper();
}

MoveResult MoveState::downwards_move(bool clause)
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Definition& def : instr->definitions)
      if (def.isTemp() && depends_on[def.tempId()])
         return move_fail_ssa;

   /* check if one of the candidate's operands is killed by a depending instruction */
   std::vector<bool>& RAR_deps = improved_rar ? (clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && RAR_deps[op.tempId()]) {
         // FIXME: account for difference in register pressure
         return move_fail_rar;
      }
   }

   if (clause) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp()) {
            depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               RAR_dependencies[op.tempId()] = true;
         }
      }
   }

   int dest_insert_idx = clause ? insert_idx_clause : insert_idx;
   RegisterDemand register_pressure = clause ? total_demand_clause : total_demand;

   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate below the memory load */
   move_element(block->instructions.begin(), source_idx, dest_insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, dest_insert_idx);
   for (int i = source_idx; i < dest_insert_idx - 1; i++)
      register_demand[i] -= candidate_diff;
   register_demand[dest_insert_idx - 1] = new_demand;
   total_demand_clause -= candidate_diff;
   insert_idx_clause--;
   if (!clause) {
      total_demand -= candidate_diff;
      insert_idx--;
   }

   downwards_advance_helper();
   return move_success;
}

void MoveState::downwards_skip()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Operand& op : instr->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill()) {
            RAR_dependencies[op.tempId()] = true;
            RAR_dependencies_clause[op.tempId()] = true;
         }
      }
   }
   total_demand_clause.update(register_demand[source_idx]);

   downwards_advance_helper();
}

void MoveState::upwards_init(int source_idx_, bool improved_rar_)
{
   source_idx = source_idx_;
   improved_rar = improved_rar_;

   insert_idx = -1;

   std::fill(depends_on.begin(), depends_on.end(), false);
   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);

   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         depends_on[def.tempId()] = true;
   }
}

bool MoveState::upwards_check_deps()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return false;
   }
   return true;
}

void MoveState::upwards_set_insert_idx(int before)
{
   insert_idx = before;
   total_demand = register_demand[before - 1];
}

MoveResult MoveState::upwards_move()
{
   assert(insert_idx >= 0);

   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return move_fail_ssa;
   }

   /* check if candidate uses/kills an operand which is used by a dependency */
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
         return move_fail_rar;
   }

   /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(total_demand + candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate above the insert_idx */
   move_element(block->instructions.begin(), source_idx, insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, insert_idx);
   for (int i = insert_idx + 1; i <= source_idx; i++)
      register_demand[i] += candidate_diff;
   register_demand[insert_idx] = new_demand;
   total_demand += candidate_diff;

   insert_idx++;

   total_demand.update(register_demand[source_idx]);
   source_idx++;

   return move_success;
}

void MoveState::upwards_skip()
{
   if (insert_idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[source_idx];
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            depends_on[def.tempId()] = true;
      }
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            RAR_dependencies[op.tempId()] = true;
      }
      total_demand.update(register_demand[source_idx]);
   }

   source_idx++;
}

bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}

bool is_gs_or_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool is_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

barrier_interaction get_barrier_interaction(const Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<const SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<const MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<const MIMG_instruction*>(instr)->barrier;
   case Format::MTBUF:
      return static_cast<const MTBUF_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<const FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   case Format::SOPP:
      if (is_done_sendmsg(instr))
         return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
      else if (is_gs_or_done_sendmsg(instr))
         return barrier_gs_sendmsg;
      else
         return barrier_none;
   case Format::PSEUDO_BARRIER:
      return barrier_barrier;
   default:
      return barrier_none;
   }
}

barrier_interaction parse_barrier(Instruction *instr)
{
   if (instr->format == Format::PSEUDO_BARRIER) {
      switch (instr->opcode) {
      case aco_opcode::p_memory_barrier_atomic:
         return barrier_atomic;
      /* For now, buffer and image barriers are treated the same. This is because of
       * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
       * which seems to use an image load to determine whether the result of a buffer load is valid, so the ordering of the two loads is important.
       * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it must stay before it,
       * and that both image and buffer operations after it must stay after it. We should also do the same for image barriers.
       * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
       * Either way, this solution should work. */
      case aco_opcode::p_memory_barrier_buffer:
      case aco_opcode::p_memory_barrier_image:
         return (barrier_interaction)(barrier_image | barrier_buffer);
      case aco_opcode::p_memory_barrier_shared:
         return barrier_shared;
      case aco_opcode::p_memory_barrier_common:
         return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
      case aco_opcode::p_memory_barrier_gs_data:
         return barrier_gs_data;
      case aco_opcode::p_memory_barrier_gs_sendmsg:
         return barrier_gs_sendmsg;
      default:
         break;
      }
   } else if (instr->opcode == aco_opcode::s_barrier) {
      return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
   }
   return barrier_none;
}

struct hazard_query {
   bool contains_spill;
   int barriers;
   int barrier_interaction;
   bool can_reorder_vmem;
   bool can_reorder_smem;
};

void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->barriers = 0;
   query->barrier_interaction = 0;
   query->can_reorder_vmem = true;
   query->can_reorder_smem = true;
}

void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   query->barriers |= parse_barrier(instr);
   query->barrier_interaction |= get_barrier_interaction(instr);
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;

   bool can_reorder_instr = can_reorder(instr);
   query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr;
   query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr;
}

enum HazardResult {
   hazard_success,
   hazard_fail_reorder_vmem_smem,
   hazard_fail_reorder_ds,
   hazard_fail_reorder_sendmsg,
   hazard_fail_spill,
   hazard_fail_export,
   hazard_fail_barrier,
   /* Must stop at these failures. The hazard query code doesn't consider them
    * when added. */
   hazard_fail_exec,
   hazard_fail_unreorderable,
};

HazardResult perform_hazard_query(hazard_query *query, Instruction *instr)
{
   bool can_reorder_candidate = can_reorder(instr);

   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {
      if (def.isFixed() && def.physReg() == exec)
         return hazard_fail_exec;
   }

   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return hazard_fail_export;

   /* don't move non-reorderable instructions */
   if (instr->opcode == aco_opcode::s_memtime ||
       instr->opcode == aco_opcode::s_memrealtime ||
       instr->opcode == aco_opcode::s_setprio)
      return hazard_fail_unreorderable;

   barrier_interaction bar = parse_barrier(instr);
   if (query->barrier_interaction && (query->barrier_interaction & bar))
      return hazard_fail_barrier;
   if (bar && query->barriers && (query->barriers & ~bar))
      return hazard_fail_barrier;
   if (query->barriers && (query->barriers & get_barrier_interaction(instr)))
      return hazard_fail_barrier;

   if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS)
      return hazard_fail_reorder_ds;
   if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr)))
      return hazard_fail_reorder_sendmsg;

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   return hazard_success;
}
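
/* Sketch of the intended query protocol (compiled out; schedule_SMEM() and
 * schedule_VMEM() below are the real users): everything a candidate would have
 * to be moved across is accumulated into a query, which is then asked whether
 * the candidate may cross it. */
#if 0
static bool may_move_across(Instruction *moved, Instruction *crossed)
{
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, crossed);
   /* hazard_success means "moved" can be reordered across "crossed". The
    * schedulers treat most failures as "keep the candidate where it is and add
    * it to the query", and stop scanning on the rest. */
   return perform_hazard_query(&hq, moved) == hazard_success;
}
#endif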

void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* first, check if we have instructions before current to move down */
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   ctx.mv.downwards_init(idx, false, false);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      assert(candidate_idx == ctx.mv.source_idx);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM())
         break;

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      /* don't use LDS/GDS instructions to hide latency since it can
       * significantly worsen LDS scheduling */
      if (candidate->format == Format::DS || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another MEM */
   ctx.mv.upwards_init(idx + 1, false);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps();
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get());
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&hq);
            found_dependency = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
         if (res == move_fail_ssa && candidate->isVMEM())
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? ctx.mv.insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;

   /* first, check if we have instructions before current to move down */
   hazard_query indep_hq;
   hazard_query clause_hq;
   init_hazard_query(&indep_hq);
   init_hazard_query(&clause_hq);
   add_to_hazard_query(&indep_hq, current);

   ctx.mv.downwards_init(idx, true, true);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[0].tempId() == current->operands[0].tempId();
         int grab_dist = ctx.mv.insert_idx_clause - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
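         /* (For scale: at 8 waves, clause_max_grab_dist evaluates to (8 - 1) * 8 = 56.) */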
         part_of_clause = same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get());
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      if (!can_move_down) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* find the first instruction depending on current or find another VMEM */
   ctx.mv.upwards_init(idx + 1, true);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get());
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps();
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.mv.depends_on[def.tempId()] = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   ctx.mv.downwards_init(idx, true, false);

   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

      if (haz != hazard_success) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal()) {
         ctx.mv.current = current;
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }

      if (current->format == Format::SMEM) {
         ctx.mv.current = current;
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }
   }

   if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) {
               ctx.mv.current = current;
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
            }
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.mv.depends_on.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
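   /* For example: a shader currently at 8 waves with a peak demand of 30 VGPRs is
    * scheduled for 6 waves here (assuming program->min_waves allows it), so
    * max_registers below gives the scheduler headroom up to the 6-wave VGPR limit
    * minus two spare VGPRs. */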

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

}