aco: Fix s_dcache_wb on GFX10.
[mesa.git] src/amd/compiler/aco_insert_waitcnt.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <algorithm>
#include <map>

#include "aco_ir.h"
#include "vulkan/radv_shader.h"

namespace aco {

namespace {

/**
 * The general idea of this pass:
 * The CFG is traversed in reverse postorder (forward).
 * One wait_ctx is maintained per basic block.
 * The in-context is the join of the out-contexts of the predecessors.
 * The context contains a map gpr -> wait_entry
 * holding the information about the counter values to be waited for.
 * Note: after merge nodes, multiple counter values may have to be waited
 * for on the same register.
 *
 * The values are updated according to the encountered instructions:
 * - additional events increment the counters of entries waiting on the same type
 * - wait instructions erase gprs whose counters are already covered by the wait.
 * See the worked example below.
 */
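/* Illustrative example (a sketch of how the data structures interact, not
 * part of the algorithm description above):
 * a VMEM load writing v0 inserts a gpr_map entry for v0 with imm.vm == 0;
 * a second, independent VMEM instruction bumps that entry's vm counter to 1;
 * a later instruction reading v0 then combines the entry into its wait_imm,
 * so an "s_waitcnt vmcnt(1)" is emitted right before it.
 */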

// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load

/* Instructions of the same event will finish in-order except for smem
 * and maybe flat. Instructions of different events may not finish in-order. */
enum wait_event : uint16_t {
   event_smem = 1 << 0,
   event_lds = 1 << 1,
   event_gds = 1 << 2,
   event_vmem = 1 << 3,
   event_vmem_store = 1 << 4, /* GFX10+ */
   event_flat = 1 << 5,
   event_exp_pos = 1 << 6,
   event_exp_param = 1 << 7,
   event_exp_mrt_null = 1 << 8,
   event_gds_gpr_lock = 1 << 9,
   event_vmem_gpr_lock = 1 << 10,
};

enum counter_type : uint8_t {
   counter_exp = 1 << 0,
   counter_lgkm = 1 << 1,
   counter_vm = 1 << 2,
   counter_vs = 1 << 3,
};

static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat;
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;

uint8_t get_counters_for_event(wait_event ev)
{
   switch (ev) {
   case event_smem:
   case event_lds:
   case event_gds:
      return counter_lgkm;
   case event_vmem:
      return counter_vm;
   case event_vmem_store:
      return counter_vs;
   case event_flat:
      return counter_vm | counter_lgkm;
   case event_exp_pos:
   case event_exp_param:
   case event_exp_mrt_null:
   case event_gds_gpr_lock:
   case event_vmem_gpr_lock:
      return counter_exp;
   default:
      return 0;
   }
}

struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm() :
      vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {}
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) :
      vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}

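   /* Packs the counters into the raw s_waitcnt immediate. As encoded below:
    * the vmcnt low bits go to [3:0], expcnt to [6:4], lgkmcnt starts at bit 8
    * (4 bits wide before GFX10, 6 bits on GFX10), and on GFX9+ the two extra
    * vmcnt bits go to [15:14]. An unset counter (0xff) masks to all-ones in
    * its field, i.e. the maximum, so the wait leaves that counter alone. */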
   uint16_t pack(enum chip_class chip) const
   {
      uint16_t imm = 0;
      assert(exp == unset_counter || exp <= 0x7);
      switch (chip) {
      case GFX10:
         assert(lgkm == unset_counter || lgkm <= 0x3f);
         assert(vm == unset_counter || vm <= 0x3f);
         imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
         break;
      case GFX9:
         assert(lgkm == unset_counter || lgkm <= 0xf);
         assert(vm == unset_counter || vm <= 0x3f);
         imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
         break;
      default:
         assert(lgkm == unset_counter || lgkm <= 0xf);
         assert(vm == unset_counter || vm <= 0xf);
         imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
         break;
      }
      if (chip < GFX9 && vm == wait_imm::unset_counter)
         imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */
      if (chip < GFX10 && lgkm == wait_imm::unset_counter)
         imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */
      return imm;
   }

   void combine(const wait_imm& other)
   {
      vm = std::min(vm, other.vm);
      exp = std::min(exp, other.exp);
      lgkm = std::min(lgkm, other.lgkm);
      vs = std::min(vs, other.vs);
   }

   bool empty() const
   {
      return vm == unset_counter && exp == unset_counter &&
             lgkm == unset_counter && vs == unset_counter;
   }
};

struct wait_entry {
   wait_imm imm;
   uint16_t events; /* use wait_event notion */
   uint8_t counters; /* use counter_type notion */
   bool wait_on_read:1;
   bool logical:1;

   wait_entry(wait_event event, wait_imm imm, bool logical, bool wait_on_read)
      : imm(imm), events(event), counters(get_counters_for_event(event)),
        wait_on_read(wait_on_read), logical(logical) {}

   void join(const wait_entry& other)
   {
      events |= other.events;
      counters |= other.counters;
      imm.combine(other.imm);
      wait_on_read = wait_on_read || other.wait_on_read;
      assert(logical == other.logical);
   }

   void remove_counter(counter_type counter)
   {
      counters &= ~counter;

      if (counter == counter_lgkm) {
         imm.lgkm = wait_imm::unset_counter;
         events &= ~(event_smem | event_lds | event_gds);
      }

      if (counter == counter_vm) {
         imm.vm = wait_imm::unset_counter;
         events &= ~event_vmem;
      }

      if (counter == counter_exp) {
         imm.exp = wait_imm::unset_counter;
         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock);
      }

      if (counter == counter_vs) {
         imm.vs = wait_imm::unset_counter;
         events &= ~event_vmem_store;
      }

      if (!(counters & counter_lgkm) && !(counters & counter_vm))
         events &= ~event_flat;
   }
};

struct wait_ctx {
   Program *program;
   enum chip_class chip_class;
   uint16_t max_vm_cnt;
   uint16_t max_exp_cnt;
   uint16_t max_lgkm_cnt;
   uint16_t max_vs_cnt;
   uint16_t unordered_events = event_smem | event_flat;

   uint8_t vm_cnt = 0;
   uint8_t exp_cnt = 0;
   uint8_t lgkm_cnt = 0;
   uint8_t vs_cnt = 0;
   bool pending_flat_lgkm = false;
   bool pending_flat_vm = false;

   wait_imm barrier_imm[barrier_count];

   std::map<PhysReg,wait_entry> gpr_map;

   wait_ctx() {}
   wait_ctx(Program *program_)
      : program(program_),
        chip_class(program_->chip_class),
        max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14),
        max_exp_cnt(6),
        max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
        max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
        unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}

   void join(const wait_ctx* other, bool logical)
   {
      exp_cnt = std::max(exp_cnt, other->exp_cnt);
      vm_cnt = std::max(vm_cnt, other->vm_cnt);
      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
      vs_cnt = std::max(vs_cnt, other->vs_cnt);
      pending_flat_lgkm |= other->pending_flat_lgkm;
      pending_flat_vm |= other->pending_flat_vm;

      for (std::pair<PhysReg,wait_entry> entry : other->gpr_map)
      {
         std::map<PhysReg,wait_entry>::iterator it = gpr_map.find(entry.first);
         if (entry.second.logical != logical)
            continue;

         if (it != gpr_map.end())
            it->second.join(entry.second);
         else
            gpr_map.insert(entry);
      }

      for (unsigned i = 0; i < barrier_count; i++)
         barrier_imm[i].combine(other->barrier_imm[i]);
   }
};

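/* Computes the wait_imm needed right before instr: combines the pending
 * counters of every gpr that instr reads (if the entry waits on reads) or
 * writes. Definitions whose pending events complete in issue order relative
 * to instr (VMEM after VMEM, LDS after LDS, GDS after GDS) don't need a wait
 * and have the corresponding counter dropped instead. */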
wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
{
   wait_imm wait;

   for (const Operand op : instr->operands) {
      if (op.isConstant() || op.isUndefined())
         continue;

      /* check consecutively read gprs */
      for (unsigned j = 0; j < op.size(); j++) {
         PhysReg reg{op.physReg() + j};
         std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
            continue;

         wait.combine(it->second.imm);
      }
   }

   for (const Definition& def : instr->definitions) {
      /* check consecutively written gprs */
      for (unsigned j = 0; j < def.getTemp().size(); j++)
      {
         PhysReg reg{def.physReg() + j};

         std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end())
            continue;

         /* Vector Memory reads and writes return in the order they were issued */
         if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem)) {
            it->second.remove_counter(counter_vm);
            if (!it->second.counters)
               it = ctx.gpr_map.erase(it);
            continue;
         }

         /* LDS reads and writes return in the order they were issued. same for GDS */
         if (instr->format == Format::DS) {
            bool gds = static_cast<DS_instruction*>(instr)->gds;
            if ((it->second.events & lgkm_events) == (gds ? event_gds : event_lds)) {
               it->second.remove_counter(counter_lgkm);
               if (!it->second.counters)
                  it = ctx.gpr_map.erase(it);
               continue;
            }
         }

         wait.combine(it->second.imm);
      }
   }

   return wait;
}

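/* Determines the s_waitcnt needed before instr (including barrier pseudo
 * instructions and the GFX10 s_dcache_wb workaround), then updates the
 * context: counters are clamped to the waited-for values, satisfied barrier
 * imms are reset and fully waited-for gpr entries are removed. */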
wait_imm kill(Instruction* instr, wait_ctx& ctx)
{
   wait_imm imm;
   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
      imm.combine(check_instr(instr, ctx));

   if (ctx.chip_class >= GFX10) {
      /* Seems to be required on GFX10 to achieve correct behaviour.
       * It shouldn't cost anything anyways since we're about to do s_endpgm.
       */
      if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb)
         imm.lgkm = 0;
   }

   if (instr->format == Format::PSEUDO_BARRIER) {
      unsigned* bsize = ctx.program->info->cs.block_size;
      unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
      switch (instr->opcode) {
      case aco_opcode::p_memory_barrier_all:
         for (unsigned i = 0; i < barrier_count; i++) {
            if ((1 << i) == barrier_shared && workgroup_size <= 64)
               continue;
            imm.combine(ctx.barrier_imm[i]);
         }
         break;
      case aco_opcode::p_memory_barrier_atomic:
         imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
         break;
      /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */
      case aco_opcode::p_memory_barrier_buffer:
      case aco_opcode::p_memory_barrier_image:
         imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
         imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
         break;
      case aco_opcode::p_memory_barrier_shared:
         if (workgroup_size > 64)
            imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
         break;
      default:
         assert(false);
         break;
      }
   }

   if (!imm.empty()) {
      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
         imm.vm = 0;
      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
         imm.lgkm = 0;

      /* reset counters */
      ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
      ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
      ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
      ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);

      /* update barrier wait imms */
      for (unsigned i = 0; i < barrier_count; i++) {
         wait_imm& bar = ctx.barrier_imm[i];
         if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp)
            bar.exp = wait_imm::unset_counter;
         if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm)
            bar.vm = wait_imm::unset_counter;
         if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm)
            bar.lgkm = wait_imm::unset_counter;
         if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs)
            bar.vs = wait_imm::unset_counter;
      }

      /* remove all gprs with higher counter from map */
      std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end())
      {
         if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
            it->second.remove_counter(counter_exp);
         if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
            it->second.remove_counter(counter_vm);
         if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
            it->second.remove_counter(counter_lgkm);
         if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
            it->second.remove_counter(counter_vs);
         if (!it->second.counters)
            it = ctx.gpr_map.erase(it);
         else
            it++;
      }
   }

   if (imm.vm == 0)
      ctx.pending_flat_vm = false;
   if (imm.lgkm == 0)
      ctx.pending_flat_lgkm = false;

   return imm;
}

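/* A new memory event with the given counters has been issued: record a zero
 * wait value for its own barrier class and age (increment) the still-tracked
 * counters of every other barrier class. */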
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, barrier_interaction barrier)
{
   unsigned barrier_index = ffs(barrier) - 1;
   for (unsigned i = 0; i < barrier_count; i++) {
      wait_imm& bar = ctx.barrier_imm[i];
      if (i == barrier_index) {
         if (counters & counter_lgkm)
            bar.lgkm = 0;
         if (counters & counter_vm)
            bar.vm = 0;
         if (counters & counter_exp)
            bar.exp = 0;
         if (counters & counter_vs)
            bar.vs = 0;
      } else {
         if (counters & counter_lgkm && bar.lgkm != wait_imm::unset_counter && bar.lgkm < ctx.max_lgkm_cnt)
            bar.lgkm++;
         if (counters & counter_vm && bar.vm != wait_imm::unset_counter && bar.vm < ctx.max_vm_cnt)
            bar.vm++;
         if (counters & counter_exp && bar.exp != wait_imm::unset_counter && bar.exp < ctx.max_exp_cnt)
            bar.exp++;
         if (counters & counter_vs && bar.vs != wait_imm::unset_counter && bar.vs < ctx.max_vs_cnt)
            bar.vs++;
      }
   }
}

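/* A new event has been issued: bump the outstanding hardware counters and,
 * if the event completes in order, age the gpr_map entries of previously
 * issued events of the same type so their wait values keep pointing at the
 * right counter slot. */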
void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none)
{
   uint8_t counters = get_counters_for_event(event);

   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;
   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
      ctx.exp_cnt++;
   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
      ctx.vs_cnt++;

   update_barrier_imm(ctx, counters, barrier);

   if (ctx.unordered_events & event)
      return;

   if (ctx.pending_flat_lgkm)
      counters &= ~counter_lgkm;
   if (ctx.pending_flat_vm)
      counters &= ~counter_vm;

   for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map) {
      wait_entry& entry = e.second;

      if (entry.events & ctx.unordered_events)
         continue;

      assert(entry.events);

      if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt)
         entry.imm.exp++;
      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt)
         entry.imm.lgkm++;
      if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt)
         entry.imm.vm++;
      if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt)
         entry.imm.vs++;
   }
}

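/* FLAT loads on pre-GFX10 increment both lgkm_cnt and vm_cnt and may complete
 * out of order, so every tracked gpr has to assume the worst case (a wait
 * value of 0) until both counters drain back to zero. */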
void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none)
{
   assert(ctx.chip_class < GFX10);

   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;

   update_barrier_imm(ctx, counter_vm | counter_lgkm, barrier);

   for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map)
   {
      if (e.second.counters & counter_vm)
         e.second.imm.vm = 0;
      if (e.second.counters & counter_lgkm)
         e.second.imm.lgkm = 0;
   }
   ctx.pending_flat_lgkm = true;
   ctx.pending_flat_vm = true;
}

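/* Records that the registers written (or, for gpr locks, read) by the new
 * event have an outstanding wait value of 0 for the event's counters. */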
void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read)
{
   uint16_t counters = get_counters_for_event(event);
   wait_imm imm;
   if (counters & counter_lgkm)
      imm.lgkm = 0;
   if (counters & counter_vm)
      imm.vm = 0;
   if (counters & counter_exp)
      imm.exp = 0;
   if (counters & counter_vs)
      imm.vs = 0;

   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);

   for (unsigned i = 0; i < rc.size(); i++) {
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg+i}, new_entry);
      if (!it.second)
         it.first->second.join(new_entry);
   }
}

void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event)
{
   if (!op.isConstant() && !op.isUndefined())
      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false);
}

void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event)
{
   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true);
}

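/* Updates the context for the given instruction: classifies it into a
 * wait_event, bumps the counters and inserts wait entries for the registers
 * it leaves pending (export sources, FLAT/SMEM/DS/VMEM destinations, ...). */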
void gen(Instruction* instr, wait_ctx& ctx)
{
   switch (instr->format) {
   case Format::EXP: {
      Export_instruction* exp_instr = static_cast<Export_instruction*>(instr);

      wait_event ev;
      if (exp_instr->dest <= 9)
         ev = event_exp_mrt_null;
      else if (exp_instr->dest <= 15)
         ev = event_exp_pos;
      else
         ev = event_exp_param;
      update_counters(ctx, ev);

      /* insert new entries for exported vgprs */
      for (unsigned i = 0; i < 4; i++)
      {
         if (exp_instr->enabled_mask & (1 << i)) {
            unsigned idx = exp_instr->compressed ? i >> 1 : i;
            assert(idx < exp_instr->operands.size());
            insert_wait_entry(ctx, exp_instr->operands[idx], ev);
         }
      }
      insert_wait_entry(ctx, exec, s2, ev, false);
      break;
   }
   case Format::FLAT: {
      if (ctx.chip_class < GFX10 && !instr->definitions.empty())
         update_counters_for_flat_load(ctx, barrier_buffer);
      else
         update_counters(ctx, event_flat, barrier_buffer);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_flat);
      break;
   }
   case Format::SMEM: {
      update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_smem);
      break;
   }
   case Format::DS: {
      bool gds = static_cast<DS_instruction*>(instr)->gds;
      update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared);
      if (gds)
         update_counters(ctx, event_gds_gpr_lock);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds);

      if (gds) {
         for (const Operand& op : instr->operands)
            insert_wait_entry(ctx, op, event_gds_gpr_lock);
         insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
      }
      break;
   }
   case Format::MUBUF:
   case Format::MTBUF:
   case Format::MIMG:
   case Format::GLOBAL: {
      wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
      update_counters(ctx, ev, get_barrier_interaction(instr));

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ev);

      if (instr->operands.size() == 4 && ctx.chip_class == GFX6) {
         ctx.exp_cnt++;
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
      }
      break;
   }
   default:
      break;
   }
}

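/* Emits the actual wait instructions for a non-empty wait_imm: a separate
 * s_waitcnt_vscnt for the GFX10 store counter and a regular s_waitcnt for
 * the remaining counters. */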
void emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
{
   if (imm.vs != wait_imm::unset_counter) {
      assert(ctx.chip_class >= GFX10);
      SOPK_instruction* waitcnt_vs = create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
      waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
      waitcnt_vs->imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
   }
   if (!imm.empty()) {
      SOPP_instruction* waitcnt = create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
      waitcnt->imm = imm.pack(ctx.chip_class);
      waitcnt->block = -1;
      instructions.emplace_back(waitcnt);
   }
}

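/* Walks one block: before each instruction, the required s_waitcnt (if any)
 * is inserted and the context is updated; at a loop back-edge, all still
 * outstanding counters are drained so the loop header sees a clean state. */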
void handle_block(Program *program, Block& block, wait_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> new_instructions;

   for (aco_ptr<Instruction>& instr : block.instructions) {
      wait_imm imm = kill(instr.get(), ctx);

      if (!imm.empty())
         emit_waitcnt(ctx, new_instructions, imm);

      gen(instr.get(), ctx);

      if (instr->format != Format::PSEUDO_BARRIER)
         new_instructions.emplace_back(std::move(instr));
   }

   /* check if this block is at the end of a loop */
   for (unsigned succ_idx : block.linear_succs) {
      /* eliminate any remaining counters */
      if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt) && !ctx.gpr_map.empty()) {
         // TODO: we could do better if we only wait if the regs between the block and other predecessors differ

         aco_ptr<Instruction> branch = std::move(new_instructions.back());
         new_instructions.pop_back();

         wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter,
                      ctx.exp_cnt ? 0 : wait_imm::unset_counter,
                      ctx.lgkm_cnt ? 0 : wait_imm::unset_counter,
                      ctx.vs_cnt ? 0 : wait_imm::unset_counter);
         emit_waitcnt(ctx, new_instructions, imm);

         new_instructions.push_back(std::move(branch));

         ctx = wait_ctx(program);
         break;
      }
   }
   block.instructions.swap(new_instructions);
}

} /* end namespace */

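/* Entry point of the pass: visits the blocks in reverse postorder (they are
 * already stored in that order), joining each block's in-context from the
 * out-contexts of its linear and logical predecessors before processing it. */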
void insert_wait_states(Program* program)
{
   wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */
   for (unsigned i = 0; i < program->blocks.size(); i++)
      out_ctx[i] = wait_ctx(program);

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& current = program->blocks[i];
      wait_ctx& in = out_ctx[current.index];

      for (unsigned b : current.linear_preds)
         in.join(&out_ctx[b], false);
      for (unsigned b : current.logical_preds)
         in.join(&out_ctx[b], true);

      if (current.instructions.empty())
         continue;

      handle_block(program, current, in);
   }
}

}