aco/gfx10: Wait for pending SMEM stores before loads
[mesa.git] / src / amd / compiler / aco_insert_waitcnt.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <algorithm>
#include <map>

#include "aco_ir.h"
#include "vulkan/radv_shader.h"

namespace aco {

namespace {

/**
 * The general idea of this pass is:
 * The CFG is traversed in reverse postorder (forward).
 * Per BB, one wait_ctx is maintained.
 * The in-context is the joined out-contexts of the predecessors.
 * The context contains a map: gpr -> wait_entry,
 * i.e. the information about the cnt values to be waited for.
 * Note: After merge nodes, multiple cnt values may have to be
 * waited for on the same register.
 *
 * The values are updated according to the encountered instructions:
 * - additional events increment the counter of waits of the same type
 * - waits erase gprs whose counter values are at least as high as the value waited for.
 */

// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load

/* Instructions of the same event will finish in-order except for smem
 * and maybe flat. Instructions of different events may not finish in-order. */
enum wait_event : uint16_t {
   event_smem = 1 << 0,
   event_lds = 1 << 1,
   event_gds = 1 << 2,
   event_vmem = 1 << 3,
   event_vmem_store = 1 << 4, /* GFX10+ */
   event_flat = 1 << 5,
   event_exp_pos = 1 << 6,
   event_exp_param = 1 << 7,
   event_exp_mrt_null = 1 << 8,
   event_gds_gpr_lock = 1 << 9,
   event_vmem_gpr_lock = 1 << 10,
};

enum counter_type : uint8_t {
   counter_exp = 1 << 0,
   counter_lgkm = 1 << 1,
   counter_vm = 1 << 2,
   counter_vs = 1 << 3,
};

static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat;
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;

uint8_t get_counters_for_event(wait_event ev)
{
   switch (ev) {
   case event_smem:
   case event_lds:
   case event_gds:
      return counter_lgkm;
   case event_vmem:
      return counter_vm;
   case event_vmem_store:
      return counter_vs;
   case event_flat:
      return counter_vm | counter_lgkm;
   case event_exp_pos:
   case event_exp_param:
   case event_exp_mrt_null:
   case event_gds_gpr_lock:
   case event_vmem_gpr_lock:
      return counter_exp;
   default:
      return 0;
   }
}

struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm() :
      vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {}
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) :
      vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}

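   /* Pack the counters into the raw s_waitcnt immediate for the target chip.
    * Unset counters are left at their field maximum so the wait has no effect
    * on them. For example, on GFX9 vm=0, lgkm=0 with exp unset packs to 0x0070
    * (the exp field stays at 0x7, i.e. "don't wait on exp"). */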
   uint16_t pack(enum chip_class chip) const
   {
      uint16_t imm = 0;
      assert(exp == unset_counter || exp <= 0x7);
      switch (chip) {
      case GFX10:
         assert(lgkm == unset_counter || lgkm <= 0x3f);
         assert(vm == unset_counter || vm <= 0x3f);
         imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
         break;
      case GFX9:
         assert(lgkm == unset_counter || lgkm <= 0xf);
         assert(vm == unset_counter || vm <= 0x3f);
         imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
         break;
      default:
         assert(lgkm == unset_counter || lgkm <= 0xf);
         assert(vm == unset_counter || vm <= 0xf);
         imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
         break;
      }
      if (chip < GFX9 && vm == wait_imm::unset_counter)
         imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */
      if (chip < GFX10 && lgkm == wait_imm::unset_counter)
         imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */
      return imm;
   }

   void combine(const wait_imm& other)
   {
      vm = std::min(vm, other.vm);
      exp = std::min(exp, other.exp);
      lgkm = std::min(lgkm, other.lgkm);
      vs = std::min(vs, other.vs);
   }

   bool empty() const
   {
      return vm == unset_counter && exp == unset_counter &&
             lgkm == unset_counter && vs == unset_counter;
   }
};

struct wait_entry {
   wait_imm imm;
   uint16_t events; /* use wait_event notion */
   uint8_t counters; /* use counter_type notion */
   bool wait_on_read:1;
   bool logical:1;

   wait_entry(wait_event event, wait_imm imm, bool logical, bool wait_on_read)
           : imm(imm), events(event), counters(get_counters_for_event(event)),
             wait_on_read(wait_on_read), logical(logical) {}

   void join(const wait_entry& other)
   {
      events |= other.events;
      counters |= other.counters;
      imm.combine(other.imm);
      wait_on_read = wait_on_read || other.wait_on_read;
      assert(logical == other.logical);
   }

   void remove_counter(counter_type counter)
   {
      counters &= ~counter;

      if (counter == counter_lgkm) {
         imm.lgkm = wait_imm::unset_counter;
         events &= ~(event_smem | event_lds | event_gds);
      }

      if (counter == counter_vm) {
         imm.vm = wait_imm::unset_counter;
         events &= ~event_vmem;
      }

      if (counter == counter_exp) {
         imm.exp = wait_imm::unset_counter;
         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock);
      }

      if (counter == counter_vs) {
         imm.vs = wait_imm::unset_counter;
         events &= ~event_vmem_store;
      }

      if (!(counters & counter_lgkm) && !(counters & counter_vm))
         events &= ~event_flat;
   }
};

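/* Per-block context: the outstanding counter values, pending flat/SMEM state
 * and the map of registers that still have results in flight. */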
struct wait_ctx {
   Program *program;
   enum chip_class chip_class;
   uint16_t max_vm_cnt;
   uint16_t max_exp_cnt;
   uint16_t max_lgkm_cnt;
   uint16_t max_vs_cnt;
   uint16_t unordered_events = event_smem | event_flat;

   uint8_t vm_cnt = 0;
   uint8_t exp_cnt = 0;
   uint8_t lgkm_cnt = 0;
   uint8_t vs_cnt = 0;
   bool pending_flat_lgkm = false;
   bool pending_flat_vm = false;
   bool pending_s_buffer_store = false; /* GFX10 workaround */

   wait_imm barrier_imm[barrier_count];

   std::map<PhysReg,wait_entry> gpr_map;

   wait_ctx() {}
   wait_ctx(Program *program_)
           : program(program_),
             chip_class(program_->chip_class),
             max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14),
             max_exp_cnt(6),
             max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
             max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
             unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}

   void join(const wait_ctx* other, bool logical)
   {
      exp_cnt = std::max(exp_cnt, other->exp_cnt);
      vm_cnt = std::max(vm_cnt, other->vm_cnt);
      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
      vs_cnt = std::max(vs_cnt, other->vs_cnt);
      pending_flat_lgkm |= other->pending_flat_lgkm;
      pending_flat_vm |= other->pending_flat_vm;
      pending_s_buffer_store |= other->pending_s_buffer_store;

      for (std::pair<PhysReg,wait_entry> entry : other->gpr_map)
      {
         std::map<PhysReg,wait_entry>::iterator it = gpr_map.find(entry.first);
         if (entry.second.logical != logical)
            continue;

         if (it != gpr_map.end())
            it->second.join(entry.second);
         else
            gpr_map.insert(entry);
      }

      for (unsigned i = 0; i < barrier_count; i++)
         barrier_imm[i].combine(other->barrier_imm[i]);
   }
};

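/* Compute the counter values that must be waited for before instr may read or
 * overwrite registers which still have results in flight. In-order VMEM and
 * LDS/GDS results do not need a wait before their destination is overwritten. */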
wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
{
   wait_imm wait;

   for (const Operand op : instr->operands) {
      if (op.isConstant() || op.isUndefined())
         continue;

      /* check consecutively read gprs */
      for (unsigned j = 0; j < op.size(); j++) {
         PhysReg reg{op.physReg() + j};
         std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
            continue;

         wait.combine(it->second.imm);
      }
   }

   for (const Definition& def : instr->definitions) {
      /* check consecutively written gprs */
      for (unsigned j = 0; j < def.getTemp().size(); j++)
      {
         PhysReg reg{def.physReg() + j};

         std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end())
            continue;

         /* Vector Memory reads and writes return in the order they were issued */
         if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem)) {
            it->second.remove_counter(counter_vm);
            if (!it->second.counters)
               it = ctx.gpr_map.erase(it);
            continue;
         }

         /* LDS reads and writes return in the order they were issued. same for GDS */
         if (instr->format == Format::DS) {
            bool gds = static_cast<DS_instruction*>(instr)->gds;
            if ((it->second.events & lgkm_events) == (gds ? event_gds : event_lds)) {
               it->second.remove_counter(counter_lgkm);
               if (!it->second.counters)
                  it = ctx.gpr_map.erase(it);
               continue;
            }
         }

         wait.combine(it->second.imm);
      }
   }

   return wait;
}

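/* Determine the wait that has to be emitted before instr, apply it to the
 * context (reset counters, drop satisfied entries) and return it. */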
wait_imm kill(Instruction* instr, wait_ctx& ctx)
{
   wait_imm imm;
   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
      imm.combine(check_instr(instr, ctx));

   if (ctx.chip_class >= GFX10) {
      /* Seems to be required on GFX10 to achieve correct behaviour.
       * It shouldn't cost anything anyway since we're about to do s_endpgm.
       */
      if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb)
         imm.lgkm = 0;

      /* GFX10: A store followed by a load at the same address causes a problem because
       * the load doesn't load the correct values unless we wait for the store first.
       * This is NOT mitigated by an s_nop.
       *
       * TODO: Refine this when we have proper alias analysis.
       */
      if (ctx.pending_s_buffer_store && instr->format == Format::SMEM) {
         SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr);
         if (!smem->definitions.empty() &&
             !smem->can_reorder && smem->barrier == barrier_buffer) {
            imm.lgkm = 0;
         }
      }
   }

   if (instr->format == Format::PSEUDO_BARRIER) {
      unsigned* bsize = ctx.program->info->cs.block_size;
      unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
      switch (instr->opcode) {
      case aco_opcode::p_memory_barrier_all:
         for (unsigned i = 0; i < barrier_count; i++) {
            if ((1 << i) == barrier_shared && workgroup_size <= 64)
               continue;
            imm.combine(ctx.barrier_imm[i]);
         }
         break;
      case aco_opcode::p_memory_barrier_atomic:
         imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
         break;
      /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */
      case aco_opcode::p_memory_barrier_buffer:
      case aco_opcode::p_memory_barrier_image:
         imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
         imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
         break;
      case aco_opcode::p_memory_barrier_shared:
         if (workgroup_size > 64)
            imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
         break;
      default:
         assert(false);
         break;
      }
   }

   if (!imm.empty()) {
      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
         imm.vm = 0;
      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
         imm.lgkm = 0;

      /* reset counters */
      ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
      ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
      ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
      ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);

      /* update barrier wait imms */
      for (unsigned i = 0; i < barrier_count; i++) {
         wait_imm& bar = ctx.barrier_imm[i];
         if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp)
            bar.exp = wait_imm::unset_counter;
         if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm)
            bar.vm = wait_imm::unset_counter;
         if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm)
            bar.lgkm = wait_imm::unset_counter;
         if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs)
            bar.vs = wait_imm::unset_counter;
      }

      /* remove all gprs with higher counter from map */
      std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end())
      {
         if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
            it->second.remove_counter(counter_exp);
         if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
            it->second.remove_counter(counter_vm);
         if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
            it->second.remove_counter(counter_lgkm);
         if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
            it->second.remove_counter(counter_vs);
         if (!it->second.counters)
            it = ctx.gpr_map.erase(it);
         else
            it++;
      }
   }

   if (imm.vm == 0)
      ctx.pending_flat_vm = false;
   if (imm.lgkm == 0) {
      ctx.pending_flat_lgkm = false;
      ctx.pending_s_buffer_store = false;
   }

   return imm;
}

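/* Record a new outstanding event for the given barrier kind: that kind now
 * needs a wait to 0, while the waits recorded for the other kinds have one more
 * newer event in front of them and are incremented (up to the counter maximum). */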
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, barrier_interaction barrier)
{
   unsigned barrier_index = ffs(barrier) - 1;
   for (unsigned i = 0; i < barrier_count; i++) {
      wait_imm& bar = ctx.barrier_imm[i];
      if (i == barrier_index) {
         if (counters & counter_lgkm)
            bar.lgkm = 0;
         if (counters & counter_vm)
            bar.vm = 0;
         if (counters & counter_exp)
            bar.exp = 0;
         if (counters & counter_vs)
            bar.vs = 0;
      } else {
         if (counters & counter_lgkm && bar.lgkm != wait_imm::unset_counter && bar.lgkm < ctx.max_lgkm_cnt)
            bar.lgkm++;
         if (counters & counter_vm && bar.vm != wait_imm::unset_counter && bar.vm < ctx.max_vm_cnt)
            bar.vm++;
         if (counters & counter_exp && bar.exp != wait_imm::unset_counter && bar.exp < ctx.max_exp_cnt)
            bar.exp++;
         if (counters & counter_vs && bar.vs != wait_imm::unset_counter && bar.vs < ctx.max_vs_cnt)
            bar.vs++;
      }
   }
}

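/* Account for a newly issued event: bump the per-context counters and the
 * barrier waits and, for ordered events, the wait values of entries tracking
 * the same event on the affected counters. */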
void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none)
{
   uint8_t counters = get_counters_for_event(event);

   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;
   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
      ctx.exp_cnt++;
   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
      ctx.vs_cnt++;

   update_barrier_imm(ctx, counters, barrier);

   if (ctx.unordered_events & event)
      return;

   if (ctx.pending_flat_lgkm)
      counters &= ~counter_lgkm;
   if (ctx.pending_flat_vm)
      counters &= ~counter_vm;

   for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map) {
      wait_entry& entry = e.second;

      if (entry.events & ctx.unordered_events)
         continue;

      assert(entry.events);

      if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt)
         entry.imm.exp++;
      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt)
         entry.imm.lgkm++;
      if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt)
         entry.imm.vm++;
      if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt)
         entry.imm.vs++;
   }
}

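/* FLAT loads can return out of order with respect to other LGKM/VM events, so
 * after one is issued (pre-GFX10 handling), every in-flight entry needs a full
 * wait before its registers may be reused. */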
void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none)
{
   assert(ctx.chip_class < GFX10);

   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;

   update_barrier_imm(ctx, counter_vm | counter_lgkm, barrier);

   for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map)
   {
      if (e.second.counters & counter_vm)
         e.second.imm.vm = 0;
      if (e.second.counters & counter_lgkm)
         e.second.imm.lgkm = 0;
   }
   ctx.pending_flat_lgkm = true;
   ctx.pending_flat_vm = true;
}

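/* Add a wait entry (imm 0 for the event's counters) for each register the
 * event leaves in flight; wait_on_read marks registers whose reads must also
 * wait, i.e. load results as opposed to locked source operands. */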
void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read)
{
   uint16_t counters = get_counters_for_event(event);
   wait_imm imm;
   if (counters & counter_lgkm)
      imm.lgkm = 0;
   if (counters & counter_vm)
      imm.vm = 0;
   if (counters & counter_exp)
      imm.exp = 0;
   if (counters & counter_vs)
      imm.vs = 0;

   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);

   for (unsigned i = 0; i < rc.size(); i++) {
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg+i}, new_entry);
      if (!it.second)
         it.first->second.join(new_entry);
   }
}

void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event)
{
   if (!op.isConstant() && !op.isUndefined())
      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false);
}

void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event)
{
   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true);
}

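/* Update the context for the events generated by instr: bump the counters and
 * create wait entries for the registers it leaves in flight. */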
void gen(Instruction* instr, wait_ctx& ctx)
{
   switch (instr->format) {
   case Format::EXP: {
      Export_instruction* exp_instr = static_cast<Export_instruction*>(instr);

      wait_event ev;
      if (exp_instr->dest <= 9)
         ev = event_exp_mrt_null;
      else if (exp_instr->dest <= 15)
         ev = event_exp_pos;
      else
         ev = event_exp_param;
      update_counters(ctx, ev);

      /* insert new entries for exported vgprs */
      for (unsigned i = 0; i < 4; i++)
      {
         if (exp_instr->enabled_mask & (1 << i)) {
            unsigned idx = exp_instr->compressed ? i >> 1 : i;
            assert(idx < exp_instr->operands.size());
            insert_wait_entry(ctx, exp_instr->operands[idx], ev);
         }
      }
      insert_wait_entry(ctx, exec, s2, ev, false);
      break;
   }
   case Format::FLAT: {
      if (ctx.chip_class < GFX10 && !instr->definitions.empty())
         update_counters_for_flat_load(ctx, barrier_buffer);
      else
         update_counters(ctx, event_flat, barrier_buffer);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_flat);
      break;
   }
   case Format::SMEM: {
      SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
      update_counters(ctx, event_smem, smem->barrier);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_smem);
      else if (ctx.chip_class >= GFX10 &&
               !smem->can_reorder &&
               smem->barrier == barrier_buffer)
         ctx.pending_s_buffer_store = true;

      break;
   }
   case Format::DS: {
      bool gds = static_cast<DS_instruction*>(instr)->gds;
      update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared);
      if (gds)
         update_counters(ctx, event_gds_gpr_lock);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds);

      if (gds) {
         for (const Operand& op : instr->operands)
            insert_wait_entry(ctx, op, event_gds_gpr_lock);
         insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
      }
      break;
   }
   case Format::MUBUF:
   case Format::MTBUF:
   case Format::MIMG:
   case Format::GLOBAL: {
      wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
      update_counters(ctx, ev, get_barrier_interaction(instr));

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ev);

      if (instr->operands.size() == 4 && ctx.chip_class == GFX6) {
         ctx.exp_cnt++;
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
      }
      break;
   }
   default:
      break;
   }
}

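/* Emit the s_waitcnt (and, on GFX10, the separate s_waitcnt_vscnt) instructions
 * encoding the given wait immediate. */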
void emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
{
   if (imm.vs != wait_imm::unset_counter) {
      assert(ctx.chip_class >= GFX10);
      SOPK_instruction* waitcnt_vs = create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
      waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
      waitcnt_vs->imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
   }
   if (!imm.empty()) {
      SOPP_instruction* waitcnt = create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
      waitcnt->imm = imm.pack(ctx.chip_class);
      waitcnt->block = -1;
      instructions.emplace_back(waitcnt);
   }
}

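/* Insert the required waits in front of each instruction of the block and flush
 * all still outstanding counters before a backwards branch (end of a loop). */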
void handle_block(Program *program, Block& block, wait_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> new_instructions;

   for (aco_ptr<Instruction>& instr : block.instructions) {
      wait_imm imm = kill(instr.get(), ctx);

      if (!imm.empty())
         emit_waitcnt(ctx, new_instructions, imm);

      gen(instr.get(), ctx);

      if (instr->format != Format::PSEUDO_BARRIER)
         new_instructions.emplace_back(std::move(instr));
   }

   /* check if this block is at the end of a loop */
   for (unsigned succ_idx : block.linear_succs) {
      /* eliminate any remaining counters */
      if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt) && !ctx.gpr_map.empty()) {
         // TODO: we could do better if we only wait if the regs between the block and other predecessors differ

         aco_ptr<Instruction> branch = std::move(new_instructions.back());
         new_instructions.pop_back();

         wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter,
                      ctx.exp_cnt ? 0 : wait_imm::unset_counter,
                      ctx.lgkm_cnt ? 0 : wait_imm::unset_counter,
                      ctx.vs_cnt ? 0 : wait_imm::unset_counter);
         emit_waitcnt(ctx, new_instructions, imm);

         new_instructions.push_back(std::move(branch));

         ctx = wait_ctx(program);
         break;
      }
   }
   block.instructions.swap(new_instructions);
}

} /* end namespace */

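/* Entry point of the pass: for each block, join the out-contexts of its linear
 * and logical predecessors, then insert the waits within the block. */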
void insert_wait_states(Program* program)
{
   wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */
   for (unsigned i = 0; i < program->blocks.size(); i++)
      out_ctx[i] = wait_ctx(program);

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& current = program->blocks[i];
      wait_ctx& in = out_ctx[current.index];

      for (unsigned b : current.linear_preds)
         in.join(&out_ctx[b], false);
      for (unsigned b : current.logical_preds)
         in.join(&out_ctx[b], true);

      if (current.instructions.empty())
         continue;

      handle_block(program, current, in);
   }
}

}