aco: only emit waitcnt on loop continues if there was some load or export
src/amd/compiler/aco_insert_waitcnt.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <algorithm>
#include <map>

#include "aco_ir.h"
#include "vulkan/radv_shader.h"

namespace aco {

namespace {

/**
 * The general idea of this pass is:
 * The CFG is traversed in reverse postorder (forward).
 * Per BB, one wait_ctx is maintained.
 * The in-context is the joined out-contexts of the predecessors.
 * The context contains a map: gpr -> wait_entry
 * consisting of the information about the cnt values to be waited for.
 * Note: after merge nodes, multiple cnt values may have to be waited for
 * for the same register.
 *
 * The values are updated according to the encountered instructions:
 * - additional events of the same type increment the counter values of existing entries
 * - waits erase entries whose counter values are at least as high as the value being waited for
 */
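/* For example (a sketch of the bookkeeping, not of generated code):
 * a VMEM load writing v0 creates the entry {v0: vmcnt=0}. A second,
 * independent VMEM load then bumps it to {v0: vmcnt=1}. When v0 is later
 * read, the entry requires "s_waitcnt vmcnt(1)", i.e. at most one VMEM
 * operation (the second load) may still be outstanding. */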

// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load

/* Instructions of the same event will finish in-order except for smem
 * and maybe flat. Instructions of different events may not finish in-order. */
enum wait_event : uint16_t {
   event_smem = 1 << 0,
   event_lds = 1 << 1,
   event_gds = 1 << 2,
   event_vmem = 1 << 3,
   event_vmem_store = 1 << 4, /* GFX10+ */
   event_flat = 1 << 5,
   event_exp_pos = 1 << 6,
   event_exp_param = 1 << 7,
   event_exp_mrt_null = 1 << 8,
   event_gds_gpr_lock = 1 << 9,
   event_vmem_gpr_lock = 1 << 10,
};

enum counter_type : uint8_t {
   counter_exp = 1 << 0,
   counter_lgkm = 1 << 1,
   counter_vm = 1 << 2,
   counter_vs = 1 << 3,
};

static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat;
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;

uint8_t get_counters_for_event(wait_event ev)
{
   switch (ev) {
   case event_smem:
   case event_lds:
   case event_gds:
      return counter_lgkm;
   case event_vmem:
      return counter_vm;
   case event_vmem_store:
      return counter_vs;
   case event_flat:
      return counter_vm | counter_lgkm;
   case event_exp_pos:
   case event_exp_param:
   case event_exp_mrt_null:
   case event_gds_gpr_lock:
   case event_vmem_gpr_lock:
      return counter_exp;
   default:
      return 0;
   }
}

struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm() :
      vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {}
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) :
      vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}

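   /* Pack the counters into the raw s_waitcnt immediate for the given chip.
    * Bit layout (from the code below): vmcnt[3:0] in bits 3:0 (GFX9+ also
    * vmcnt[5:4] in bits 15:14), expcnt in bits 6:4, lgkmcnt in bits 11:8
    * (GFX10 also bits 13:12). Unset counters keep all their bits set, so
    * they are effectively "don't wait".
    * Worked example: wait_imm(0, unset_counter, unset_counter, unset_counter)
    * packs on GFX8 to 0x3f70, i.e. vmcnt(0) with exp/lgkm left at their maxima. */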
   uint16_t pack(enum chip_class chip) const
   {
      uint16_t imm = 0;
      assert(exp == unset_counter || exp <= 0x7);
      switch (chip) {
      case GFX10:
         assert(lgkm == unset_counter || lgkm <= 0x3f);
         assert(vm == unset_counter || vm <= 0x3f);
         imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
         break;
      case GFX9:
         assert(lgkm == unset_counter || lgkm <= 0xf);
         assert(vm == unset_counter || vm <= 0x3f);
         imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
         break;
      default:
         assert(lgkm == unset_counter || lgkm <= 0xf);
         assert(vm == unset_counter || vm <= 0xf);
         imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
         break;
      }
      if (chip < GFX9 && vm == wait_imm::unset_counter)
         imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */
      if (chip < GFX10 && lgkm == wait_imm::unset_counter)
         imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */
      return imm;
   }

   void combine(const wait_imm& other)
   {
      vm = std::min(vm, other.vm);
      exp = std::min(exp, other.exp);
      lgkm = std::min(lgkm, other.lgkm);
      vs = std::min(vs, other.vs);
   }

   bool empty() const
   {
      return vm == unset_counter && exp == unset_counter &&
             lgkm == unset_counter && vs == unset_counter;
   }
};

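/* Per-register bookkeeping: which events are still outstanding for this
 * register and which counter values have to be reached before it may be
 * read (if wait_on_read) or overwritten. */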
struct wait_entry {
   wait_imm imm;
   uint16_t events; /* use wait_event notion */
   uint8_t counters; /* use counter_type notion */
   bool wait_on_read:1;
   bool logical:1;

   wait_entry(wait_event event, wait_imm imm, bool logical, bool wait_on_read)
           : imm(imm), events(event), counters(get_counters_for_event(event)),
             wait_on_read(wait_on_read), logical(logical) {}

   void join(const wait_entry& other)
   {
      events |= other.events;
      counters |= other.counters;
      imm.combine(other.imm);
      wait_on_read = wait_on_read || other.wait_on_read;
      assert(logical == other.logical);
   }

   void remove_counter(counter_type counter)
   {
      counters &= ~counter;

      if (counter == counter_lgkm) {
         imm.lgkm = wait_imm::unset_counter;
         events &= ~(event_smem | event_lds | event_gds);
      }

      if (counter == counter_vm) {
         imm.vm = wait_imm::unset_counter;
         events &= ~event_vmem;
      }

      if (counter == counter_exp) {
         imm.exp = wait_imm::unset_counter;
         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock);
      }

      if (counter == counter_vs) {
         imm.vs = wait_imm::unset_counter;
         events &= ~event_vmem_store;
      }

      if (!(counters & counter_lgkm) && !(counters & counter_vm))
         events &= ~event_flat;
   }
};

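/* Per-block state: conservative counts of outstanding operations for each
 * counter, pending-FLAT flags, per-barrier wait values and the per-register
 * wait entries. The contexts of all predecessors are joined at block
 * boundaries. */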
struct wait_ctx {
   Program *program;
   enum chip_class chip_class;
   uint16_t max_vm_cnt;
   uint16_t max_exp_cnt;
   uint16_t max_lgkm_cnt;
   uint16_t max_vs_cnt;
   uint16_t unordered_events = event_smem | event_flat;

   uint8_t vm_cnt = 0;
   uint8_t exp_cnt = 0;
   uint8_t lgkm_cnt = 0;
   uint8_t vs_cnt = 0;
   bool pending_flat_lgkm = false;
   bool pending_flat_vm = false;

   wait_imm barrier_imm[barrier_count];

   std::map<PhysReg,wait_entry> gpr_map;

   wait_ctx() {}
   wait_ctx(Program *program_)
           : program(program_),
             chip_class(program_->chip_class),
             max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14),
             max_exp_cnt(6),
             max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
             max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
             unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}

   void join(const wait_ctx* other, bool logical)
   {
      exp_cnt = std::max(exp_cnt, other->exp_cnt);
      vm_cnt = std::max(vm_cnt, other->vm_cnt);
      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
      vs_cnt = std::max(vs_cnt, other->vs_cnt);
      pending_flat_lgkm |= other->pending_flat_lgkm;
      pending_flat_vm |= other->pending_flat_vm;

      for (std::pair<PhysReg,wait_entry> entry : other->gpr_map)
      {
         std::map<PhysReg,wait_entry>::iterator it = gpr_map.find(entry.first);
         if (entry.second.logical != logical)
            continue;

         if (it != gpr_map.end())
            it->second.join(entry.second);
         else
            gpr_map.insert(entry);
      }

      for (unsigned i = 0; i < barrier_count; i++)
         barrier_imm[i].combine(other->barrier_imm[i]);
   }
};

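/* Determine the wait_imm needed before instr: combine the entries of all
 * registers it reads (where wait_on_read is set) or writes. For written
 * registers, no wait is needed if the outstanding accesses complete in issue
 * order relative to instr (VMEM vs. VMEM, LDS vs. LDS, GDS vs. GDS); such
 * entries are removed instead. */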
wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
{
   wait_imm wait;

   for (const Operand op : instr->operands) {
      if (op.isConstant() || op.isUndefined())
         continue;

      /* check consecutively read gprs */
      for (unsigned j = 0; j < op.size(); j++) {
         PhysReg reg{op.physReg() + j};
         std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
            continue;

         wait.combine(it->second.imm);
      }
   }

   for (const Definition& def : instr->definitions) {
      /* check consecutively written gprs */
      for (unsigned j = 0; j < def.getTemp().size(); j++)
      {
         PhysReg reg{def.physReg() + j};

         std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end())
            continue;

         /* Vector Memory reads and writes return in the order they were issued */
         if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem)) {
            it->second.remove_counter(counter_vm);
            if (!it->second.counters)
               it = ctx.gpr_map.erase(it);
            continue;
         }

         /* LDS reads and writes return in the order they were issued. same for GDS */
         if (instr->format == Format::DS) {
            bool gds = static_cast<DS_instruction*>(instr)->gds;
            if ((it->second.events & lgkm_events) == (gds ? event_gds : event_lds)) {
               it->second.remove_counter(counter_lgkm);
               if (!it->second.counters)
                  it = ctx.gpr_map.erase(it);
               continue;
            }
         }

         wait.combine(it->second.imm);
      }
   }

   return wait;
}

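/* Compute the s_waitcnt immediate that has to be emitted before instr: the
 * register dependencies from check_instr() plus anything required by the
 * p_memory_barrier_* pseudo instructions. The context's counters, barrier
 * wait values and register entries are then updated as if the wait had been
 * executed. */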
wait_imm kill(Instruction* instr, wait_ctx& ctx)
{
   wait_imm imm;
   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
      imm.combine(check_instr(instr, ctx));

   if (instr->format == Format::PSEUDO_BARRIER) {
      unsigned* bsize = ctx.program->info->cs.block_size;
      unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
      switch (instr->opcode) {
      case aco_opcode::p_memory_barrier_all:
         for (unsigned i = 0; i < barrier_count; i++) {
            if ((1 << i) == barrier_shared && workgroup_size <= 64)
               continue;
            imm.combine(ctx.barrier_imm[i]);
         }
         break;
      case aco_opcode::p_memory_barrier_atomic:
         imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
         break;
      /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */
      case aco_opcode::p_memory_barrier_buffer:
      case aco_opcode::p_memory_barrier_image:
         imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
         imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
         break;
      case aco_opcode::p_memory_barrier_shared:
         if (workgroup_size > 64)
            imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
         break;
      default:
         assert(false);
         break;
      }
   }

   if (!imm.empty()) {
      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
         imm.vm = 0;
      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
         imm.lgkm = 0;

      /* reset counters */
      ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
      ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
      ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
      ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);

      /* update barrier wait imms */
      for (unsigned i = 0; i < barrier_count; i++) {
         wait_imm& bar = ctx.barrier_imm[i];
         if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp)
            bar.exp = wait_imm::unset_counter;
         if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm)
            bar.vm = wait_imm::unset_counter;
         if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm)
            bar.lgkm = wait_imm::unset_counter;
         if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs)
            bar.vs = wait_imm::unset_counter;
      }

      /* remove all vgprs with higher counter from map */
      std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end())
      {
         if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
            it->second.remove_counter(counter_exp);
         if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
            it->second.remove_counter(counter_vm);
         if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
            it->second.remove_counter(counter_lgkm);
         if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
            it->second.remove_counter(counter_vs);
         if (!it->second.counters)
            it = ctx.gpr_map.erase(it);
         else
            it++;
      }
   }

   if (imm.vm == 0)
      ctx.pending_flat_vm = false;
   if (imm.lgkm == 0)
      ctx.pending_flat_lgkm = false;

   return imm;
}

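/* Record a new event for the given barrier interaction: the matching barrier
 * now has to wait for a counter value of 0, while the values the other
 * barriers wait for are incremented, since one more operation was issued
 * after theirs. */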
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, barrier_interaction barrier)
{
   unsigned barrier_index = ffs(barrier) - 1;
   for (unsigned i = 0; i < barrier_count; i++) {
      wait_imm& bar = ctx.barrier_imm[i];
      if (i == barrier_index) {
         if (counters & counter_lgkm)
            bar.lgkm = 0;
         if (counters & counter_vm)
            bar.vm = 0;
         if (counters & counter_exp)
            bar.exp = 0;
         if (counters & counter_vs)
            bar.vs = 0;
      } else {
         if (counters & counter_lgkm && bar.lgkm != wait_imm::unset_counter && bar.lgkm < ctx.max_lgkm_cnt)
            bar.lgkm++;
         if (counters & counter_vm && bar.vm != wait_imm::unset_counter && bar.vm < ctx.max_vm_cnt)
            bar.vm++;
         if (counters & counter_exp && bar.exp != wait_imm::unset_counter && bar.exp < ctx.max_exp_cnt)
            bar.exp++;
         if (counters & counter_vs && bar.vs != wait_imm::unset_counter && bar.vs < ctx.max_vs_cnt)
            bar.vs++;
      }
   }
}

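/* Account for a newly issued event: bump the counters it affects, update the
 * barrier wait values and, unless the event is unordered, age the counter
 * values of existing entries that are only waiting on the same event. */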
void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none)
{
   uint8_t counters = get_counters_for_event(event);

   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;
   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
      ctx.exp_cnt++;
   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
      ctx.vs_cnt++;

   update_barrier_imm(ctx, counters, barrier);

   if (ctx.unordered_events & event)
      return;

   if (ctx.pending_flat_lgkm)
      counters &= ~counter_lgkm;
   if (ctx.pending_flat_vm)
      counters &= ~counter_vm;

   for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map) {
      wait_entry& entry = e.second;

      if (entry.events & ctx.unordered_events)
         continue;

      assert(entry.events);

      if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt)
         entry.imm.exp++;
      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt)
         entry.imm.lgkm++;
      if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt)
         entry.imm.vm++;
      if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt)
         entry.imm.vs++;
   }
}

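/* FLAT loads on GFX6-9 use both vmcnt and lgkmcnt and can complete out of
 * order with respect to other memory operations, so every existing entry
 * using these counters has to wait for 0 from now on. The pending_flat_*
 * flags record this for later waits. */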
void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none)
{
   assert(ctx.chip_class < GFX10);

   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;

   update_barrier_imm(ctx, counter_vm | counter_lgkm, barrier);

   for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map)
   {
      if (e.second.counters & counter_vm)
         e.second.imm.vm = 0;
      if (e.second.counters & counter_lgkm)
         e.second.imm.lgkm = 0;
   }
   ctx.pending_flat_lgkm = true;
   ctx.pending_flat_vm = true;
}

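/* Create (or merge into) an entry for every register covered by reg/rc.
 * The counters the event affects start at 0: before the register may be
 * overwritten (or read, if wait_on_read is set), all outstanding operations
 * of that kind have to finish. */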
void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read)
{
   uint16_t counters = get_counters_for_event(event);
   wait_imm imm;
   if (counters & counter_lgkm)
      imm.lgkm = 0;
   if (counters & counter_vm)
      imm.vm = 0;
   if (counters & counter_exp)
      imm.exp = 0;
   if (counters & counter_vs)
      imm.vs = 0;

   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);

   for (unsigned i = 0; i < rc.size(); i++) {
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg+i}, new_entry);
      if (!it.second)
         it.first->second.join(new_entry);
   }
}

void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event)
{
   if (!op.isConstant() && !op.isUndefined())
      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false);
}

void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event)
{
   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true);
}

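/* Record the events instr generates: update the counters and insert wait
 * entries for its definitions, and for operands where the hardware reads the
 * registers asynchronously (exports, GDS accesses, VMEM data on GFX6). */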
void gen(Instruction* instr, wait_ctx& ctx)
{
   switch (instr->format) {
   case Format::EXP: {
      Export_instruction* exp_instr = static_cast<Export_instruction*>(instr);

      wait_event ev;
      if (exp_instr->dest <= 9)
         ev = event_exp_mrt_null;
      else if (exp_instr->dest <= 15)
         ev = event_exp_pos;
      else
         ev = event_exp_param;
      update_counters(ctx, ev);

      /* insert new entries for exported vgprs */
      for (unsigned i = 0; i < 4; i++)
      {
         if (exp_instr->enabled_mask & (1 << i)) {
            unsigned idx = exp_instr->compressed ? i >> 1 : i;
            assert(idx < exp_instr->operands.size());
            insert_wait_entry(ctx, exp_instr->operands[idx], ev);
         }
      }
      insert_wait_entry(ctx, exec, s2, ev, false);
      break;
   }
   case Format::FLAT: {
      if (ctx.chip_class < GFX10 && !instr->definitions.empty())
         update_counters_for_flat_load(ctx, barrier_buffer);
      else
         update_counters(ctx, event_flat, barrier_buffer);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_flat);
      break;
   }
   case Format::SMEM: {
      update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_smem);
      break;
   }
   case Format::DS: {
      bool gds = static_cast<DS_instruction*>(instr)->gds;
      update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared);
      if (gds)
         update_counters(ctx, event_gds_gpr_lock);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds);

      if (gds) {
         for (const Operand& op : instr->operands)
            insert_wait_entry(ctx, op, event_gds_gpr_lock);
         insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
      }
      break;
   }
   case Format::MUBUF:
   case Format::MTBUF:
   case Format::MIMG:
   case Format::GLOBAL: {
      wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
      update_counters(ctx, ev, get_barrier_interaction(instr));

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ev);

      if (instr->operands.size() == 4 && ctx.chip_class == GFX6) {
         ctx.exp_cnt++;
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
      }
      break;
   }
   default:
      break;
   }
}

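/* Emit the waits encoded in imm: on GFX10+ the store counter gets its own
 * s_waitcnt_vscnt instruction, everything else is packed into one s_waitcnt. */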
void emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
{
   if (imm.vs != wait_imm::unset_counter) {
      assert(ctx.chip_class >= GFX10);
      SOPK_instruction* waitcnt_vs = create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 0);
      waitcnt_vs->imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
   }
   if (!imm.empty()) {
      SOPP_instruction* waitcnt = create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
      waitcnt->imm = imm.pack(ctx.chip_class);
      waitcnt->block = -1;
      instructions.emplace_back(waitcnt);
   }
}

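/* Rewrite one block: before every instruction, emit the waits kill() deems
 * necessary, then account for the instruction's events with gen(). At the end
 * of a loop (a linear successor with a smaller index), all still-outstanding
 * counters are waited for before the back-edge branch, but only if some load
 * or export is actually pending (see the counter/gpr_map check below). */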
void handle_block(Program *program, Block& block, wait_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> new_instructions;

   for (aco_ptr<Instruction>& instr : block.instructions) {
      wait_imm imm = kill(instr.get(), ctx);

      if (!imm.empty())
         emit_waitcnt(ctx, new_instructions, imm);

      gen(instr.get(), ctx);

      if (instr->format != Format::PSEUDO_BARRIER)
         new_instructions.emplace_back(std::move(instr));
   }

   /* check if this block is at the end of a loop */
   for (unsigned succ_idx : block.linear_succs) {
      /* eliminate any remaining counters */
      if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt) && !ctx.gpr_map.empty()) {
         // TODO: we could do better if we only wait if the regs between the block and other predecessors differ

         aco_ptr<Instruction> branch = std::move(new_instructions.back());
         new_instructions.pop_back();

         wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter,
                      ctx.exp_cnt ? 0 : wait_imm::unset_counter,
                      ctx.lgkm_cnt ? 0 : wait_imm::unset_counter,
                      ctx.vs_cnt ? 0 : wait_imm::unset_counter);
         emit_waitcnt(ctx, new_instructions, imm);

         new_instructions.push_back(std::move(branch));

         ctx = wait_ctx(program);
         break;
      }
   }
   block.instructions.swap(new_instructions);
}

} /* end namespace */

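/* Entry point of the pass: walk the blocks in program order (reverse
 * postorder), join the out-contexts of all predecessors into each block's
 * in-context and insert the required s_waitcnt instructions. */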
void insert_wait_states(Program* program)
{
   wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */
   for (unsigned i = 0; i < program->blocks.size(); i++)
      out_ctx[i] = wait_ctx(program);

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& current = program->blocks[i];
      wait_ctx& in = out_ctx[current.index];

      for (unsigned b : current.linear_preds)
         in.join(&out_ctx[b], false);
      for (unsigned b : current.logical_preds)
         in.join(&out_ctx[b], true);

      if (current.instructions.empty())
         continue;

      handle_block(program, current, in);
   }
}

}