aco: add ACO_DEBUG=force-waitcnt to emit wait-states
mesa.git: src/amd/compiler/aco_ir.h

/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#ifndef ACO_IR_H
#define ACO_IR_H

#include <vector>
#include <set>
#include <unordered_set>
#include <bitset>
#include <memory>

#include "nir.h"
#include "ac_binary.h"
#include "amd_family.h"
#include "aco_opcodes.h"
#include "aco_util.h"

#include "vulkan/radv_shader.h"

struct radv_nir_compiler_options;
struct radv_shader_args;
struct radv_shader_info;

namespace aco {

extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
};
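
/* Usage sketch: debug_flags is populated from the ACO_DEBUG environment
 * variable (presumably during init()), so ACO_DEBUG=force-waitcnt sets
 * DEBUG_FORCE_WAITCNT and a pass such as insert_wait_states() can test:
 *
 *    if (debug_flags & DEBUG_FORCE_WAITCNT) {
 *       // conservatively wait on all outstanding memory operations
 *    }
 */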

/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};

enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8,
};

enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for loads/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * Says nothing about barriers. */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};

enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};

struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
      : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}

   storage_class storage:8;
   memory_semantics semantics:8;
   sync_scope scope:8;

   bool operator == (const memory_sync_info& rhs) const {
      return storage == rhs.storage &&
             semantics == rhs.semantics &&
             scope == rhs.scope;
   }

   bool can_reorder() const {
      if (semantics & semantic_acqrel)
         return false;
      /* Also check storage so that zero-initialized memory_sync_info can be
       * reordered. */
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};
static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
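
/* Usage sketch: a buffer atomic would typically carry
 *
 *    memory_sync_info sync(storage_buffer, semantic_atomicrmw, scope_device);
 *    assert(!sync.can_reorder());
 *
 * while a zero-initialized memory_sync_info() has no storage set and can
 * always be reordered. */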

enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      struct {
         uint8_t round:4;
         uint8_t denorm:4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
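
/* Illustrative sketch: a stricter mode with the same MODE bits may stand in
 * for a laxer one, but not the other way around:
 *
 *    float_mode a = {}, b = {};
 *    a.preserve_signed_zero_inf_nan32 = true;
 *    assert(a.canReplace(b) && !b.canReplace(a));
 */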

constexpr Format asVOP3(Format format) {
   return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
};

constexpr Format asSDWA(Format format) {
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
}
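
/* Formats combine by OR-ing their encoding bits (a sketch):
 *
 *    Format f = asVOP3(Format::VOP2); // VOP2 promoted to the VOP3A encoding
 *    Format g = asSDWA(Format::VOP2); // VOP2 with sub-dword addressing
 *    assert((uint16_t)f & (uint16_t)Format::VOP2);
 */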

enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
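
/* RegClass::get() picks the smallest class that fits a byte count (sketch):
 *
 *    assert(RegClass::get(RegType::vgpr, 6) == v6b); // sub-dword VGPR class
 *    assert(RegClass::get(RegType::vgpr, 8) == v2);  // two full dwords
 *    assert(RegClass::get(RegType::sgpr, 6) == s2);  // SGPRs round up to dwords
 */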

/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};

/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;
   uint32_t reg_class : 8;
};
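
/* Sketch: a Temp is just an SSA id plus a register class:
 *
 *    Temp t(42, v2);
 *    assert(t.size() == 2 && t.bytes() == 8 && t.type() == RegType::vgpr);
 */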

/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }

   uint16_t reg_b = 0;
};

/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
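
/* PhysReg stores a byte-granular address, reg_b = reg * 4 + byte; in ACO's
 * numbering SGPRs start at 0 and VGPRs at 256 (a sketch):
 *
 *    PhysReg v0{256};
 *    assert(v0.advance(6).reg() == 257 && v0.advance(6).byte() == 2);
 */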

/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final
{
public:
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
        isLateKill_(false) {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   explicit Operand(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies, and a copy from any 8-bit
       * constant can be implemented with an SDWA v_mul_u32_u24, so consider
       * them all inline constants. */
      data_.i = v;
      isConstant_ = true;
      constSize = 0;
      setFixed(PhysReg{0u});
   };
   explicit Operand(uint16_t v) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      constSize = 1;
      if (v <= 64)
         setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         setFixed(PhysReg{248});
      else /* Literal Constant */
         setFixed(PhysReg{255});
   };
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      constSize = is64bit ? 3 : 2;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      constSize = 3;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double. */
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      } else if (constSize == 1) {
         return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
      } else if (constSize == 0) {
         return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t constSize:2;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
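
/* Constant operands are fixed to their inline-constant register when one
 * exists, otherwise to 255 (a literal). A sketch:
 *
 *    Operand a(1u);          // inline constant, physReg() == 129
 *    Operand b(0x3f800000u); // 1.0f, physReg() == 242
 *    Operand c(12345u);      // no inline encoding, isLiteral()
 *    assert(!a.isLiteral() && !b.isLiteral() && c.isLiteral());
 */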

/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
                            isKill_(0), isPrecise_(0), isNUW_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

   constexpr void setPrecise(bool precise) noexcept
   {
      isPrecise_ = precise;
   }

   constexpr bool isPrecise() const noexcept
   {
      return isPrecise_;
   }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept
   {
      isNUW_ = nuw;
   }

   constexpr bool isNUW() const noexcept
   {
      return isNUW_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
         uint8_t isPrecise_:1;
         uint8_t isNUW_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
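
/* Sketch (program stands for a Program*): a definition usually carries a
 * fresh Temp, and the register allocator can be guided with a hint instead
 * of a hard assignment:
 *
 *    Definition def(program->allocateId(), s2);
 *    def.setHint(vcc);   // prefer vcc, but RA may still pick another pair
 *    assert(def.hasHint() && !def.isFixed());
 */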

struct Block;

struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
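
/* The predicates above only test encoding bits, so they keep working after
 * promotion; e.g. for some instr whose format is VOP2 | VOP3A (a sketch):
 *
 *    assert(instr->isVALU() && instr->isVOP3() && !instr->isSALU());
 */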

struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint32_t padding: 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
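
/* Sketch of building an SMEM load with create_instruction() (defined further
 * below); base and dst stand for pre-existing s2 Temps:
 *
 *    aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(
 *          aco_opcode::s_load_dwordx2, Format::SMEM, 2, 1)};
 *    load->operands[0] = Operand(base);      // SBASE address pair
 *    load->operands[1] = Operand(0u);        // immediate offset
 *    load->definitions[0] = Definition(dst); // SDATA result
 */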

struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP3A_instruction : public Instruction {
   bool abs[3];
   bool neg[3];
   uint8_t opsel : 4;
   uint8_t omod : 2;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
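
/* Decoding sketch: word selections carry the word number in
 * (sel & sdwa_wordnum), byte selections the byte number in
 * (sel & sdwa_bytenum), and sdwa_sext marks sign extension:
 *
 *    assert((sdwa_uword1 & sdwa_wordnum) == 1);  // bits [31:16]
 *    assert((sdwa_sbyte2 & sdwa_bytenum) == 2);  // bits [23:16], sign-extended
 *    assert(sdwa_sbyte2 & sdwa_sext);
 */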

/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];
   uint8_t dst_sel;
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool swizzled : 1;
   uint32_t padding1 : 18;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 10;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 18;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Export_instruction : public Instruction {
   uint8_t enabled_mask;
   uint8_t dest;
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};

/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
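
/* Typical usage ties the allocation to an aco_ptr so free() runs on
 * destruction; e.g. an s_waitcnt that waits on everything (a sketch,
 * relevant to ACO_DEBUG=force-waitcnt):
 *
 *    aco_ptr<SOPP_instruction> wait{create_instruction<SOPP_instruction>(
 *          aco_opcode::s_waitcnt, Format::SOPP, 0, 0)};
 *    wait->imm = 0;    // all counters at zero: wait for everything
 *    wait->block = -1; // not a branch, no target block
 */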

constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (format == Format::VOP3P) {
      const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
            return true;
      }
      return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
   } else if (isVOP3()) {
      const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3->abs[i] || vop3->neg[i])
            return true;
      }
      return vop3->opsel || vop3->clamp || vop3->omod;
   }
   return false;
}

constexpr bool is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

static inline bool is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}

memory_sync_info get_sync_info(const Instruction* instr);

bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);

bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);

enum block_kind {
   /* uniform indicates that when leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};

struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
      : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   constexpr bool exceeds(const RegisterDemand other) const noexcept {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   constexpr RegisterDemand operator+(const Temp t) const noexcept {
      if (t.type() == RegType::sgpr)
         return RegisterDemand( vgpr, sgpr + t.size() );
      else
         return RegisterDemand( vgpr + t.size(), sgpr );
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   constexpr void update(const RegisterDemand other) noexcept {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }

};
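
/* Demand arithmetic sketch: counts are kept per register file and merged
 * with update():
 *
 *    RegisterDemand d(8, 4); // 8 VGPRs, 4 SGPRs
 *    d += Temp(1, v2);       // now 10 VGPRs
 *    d.update(RegisterDemand(6, 9));
 *    assert(d == RegisterDemand(10, 9));
 */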

/* CFG */
struct Block {
   float_mode fp_mode;
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;
   int logical_idom = -1;
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out is true */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};

using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
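
/* A Stage is a set of software-stage bits OR'd with exactly one hardware
 * stage, so both halves can be tested independently (sketch):
 *
 *    Stage s = vertex_geometry_gs; // merged VS+GS
 *    assert((s & sw_mask) == (sw_vs | sw_gs));
 *    assert((s & hw_mask) == hw_gs);
 */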

enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};

class Program final {
public:
   float_mode next_fp_mode;
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one; must be a power of two */
   uint16_t vgpr_alloc_granule; /* minus one; must be a power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   struct {
      void (*func)(void *private_data,
                   enum radv_compiler_debug_level level,
                   const char *message);
      void *private_data;
   } debug;

   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};

struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
using TempSet = std::unordered_set<Temp, TempHash>;

struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

void init();

void init_program(Program *program, Stage stage, struct radv_shader_info *info,
                  enum chip_class chip_class, enum radeon_family family,
                  ac_shader_config *config);

void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
bool validate_ir(Program* program);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options);
#ifndef NDEBUG
void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

void _aco_perfwarn(Program *program, const char *file, unsigned line,
                   const char *fmt, ...);
void _aco_err(Program *program, const char *file, unsigned line,
              const char *fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of sgprs/vgprs that must be allocated to address a given number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);

typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;

}

#endif /* ACO_IR_H */