radv,aco: report ACO errors/warnings back via VK_EXT_debug_report
[mesa.git] / src/amd/compiler/aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 #include "vulkan/radv_shader.h"
41
42 struct radv_nir_compiler_options;
43 struct radv_shader_args;
44 struct radv_shader_info;
45
46 namespace aco {
47
48 extern uint64_t debug_flags;
49
50 enum {
51 DEBUG_VALIDATE_IR = 0x1,
52 DEBUG_VALIDATE_RA = 0x2,
53 DEBUG_PERFWARN = 0x4,
54 };
55
56 /**
57 * Representation of the instruction's microcode encoding format
58 * Note: Some Vector ALU Formats can be combined, such that:
59 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
60 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
61 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
62 *
63 * (*) The same is applicable for VOP1 and VOPC instructions.
64 */
65 enum class Format : std::uint16_t {
66 /* Pseudo Instruction Format */
67 PSEUDO = 0,
68 /* Scalar ALU & Control Formats */
69 SOP1 = 1,
70 SOP2 = 2,
71 SOPK = 3,
72 SOPP = 4,
73 SOPC = 5,
74 /* Scalar Memory Format */
75 SMEM = 6,
76 /* LDS/GDS Format */
77 DS = 8,
78 /* Vector Memory Buffer Formats */
79 MTBUF = 9,
80 MUBUF = 10,
81 /* Vector Memory Image Format */
82 MIMG = 11,
83 /* Export Format */
84 EXP = 12,
85 /* Flat Formats */
86 FLAT = 13,
87 GLOBAL = 14,
88 SCRATCH = 15,
89
90 PSEUDO_BRANCH = 16,
91 PSEUDO_BARRIER = 17,
92 PSEUDO_REDUCTION = 18,
93
94 /* Vector ALU Formats */
95 VOP3P = 19,
96 VOP1 = 1 << 8,
97 VOP2 = 1 << 9,
98 VOPC = 1 << 10,
99 VOP3 = 1 << 11,
100 VOP3A = 1 << 11,
101 VOP3B = 1 << 11,
102 /* Vector Parameter Interpolation Format */
103 VINTRP = 1 << 12,
104 DPP = 1 << 13,
105 SDWA = 1 << 14,
106 };
107
108 enum storage_class : uint8_t {
109 storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
110 storage_buffer = 0x1, /* SSBOs and global memory */
111 storage_atomic_counter = 0x2, /* not used for Vulkan */
112 storage_image = 0x4,
113 storage_shared = 0x8, /* or TCS output */
114 storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
115 storage_scratch = 0x20,
116 storage_vgpr_spill = 0x40,
117 storage_count = 8,
118 };
119
120 enum memory_semantics : uint8_t {
121 semantic_none = 0x0,
122 /* for loads: don't move any access after this load to before this load (even other loads)
123 * for barriers: don't move any access after the barrier to before any
124 * atomics/control_barriers/sendmsg_gs_done before the barrier */
125 semantic_acquire = 0x1,
126 /* for stores: don't move any access before this store to after this store
127 * for barriers: don't move any access before the barrier to after any
128 * atomics/control_barriers/sendmsg_gs_done after the barrier */
129 semantic_release = 0x2,
130
131 /* the rest are for load/stores/atomics only */
132 /* cannot be DCE'd or CSE'd */
133 semantic_volatile = 0x4,
134 /* does not interact with barriers and assumes this lane is the only lane
135 * accessing this memory */
136 semantic_private = 0x8,
137 /* this operation can be reordered around operations of the same storage. says nothing about barriers */
138 semantic_can_reorder = 0x10,
139 /* this is an atomic instruction (may only read or write memory) */
140 semantic_atomic = 0x20,
141 /* this instruction both reads and writes memory */
142 semantic_rmw = 0x40,
143
144 semantic_acqrel = semantic_acquire | semantic_release,
145 semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
146 };
147
148 enum sync_scope : uint8_t {
149 scope_invocation = 0,
150 scope_subgroup = 1,
151 scope_workgroup = 2,
152 scope_queuefamily = 3,
153 scope_device = 4,
154 };
155
156 struct memory_sync_info {
157 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
158 memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
159 : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
160
161 storage_class storage:8;
162 memory_semantics semantics:8;
163 sync_scope scope:8;
164
165 bool operator == (const memory_sync_info& rhs) const {
166 return storage == rhs.storage &&
167 semantics == rhs.semantics &&
168 scope == rhs.scope;
169 }
170
171 bool can_reorder() const {
172 if (semantics & semantic_acqrel)
173 return false;
174 /* Also check storage so that zero-initialized memory_sync_info can be
175 * reordered. */
176 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
177 }
178 };
179 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
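/* Illustrative sketch (added for this listing, not part of the original
* header): how a pass might build sync info for a device-scope SSBO atomic
* versus a plain reorderable buffer load. The acquire/release semantics make
* can_reorder() return false for the former. */
static inline memory_sync_info example_ssbo_atomic_sync()
{
return memory_sync_info(storage_buffer, semantic_atomicrmw | semantic_acqrel, scope_device);
}
static inline memory_sync_info example_reorderable_load_sync()
{
return memory_sync_info(storage_buffer, semantic_can_reorder);
}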
180
181 enum fp_round {
182 fp_round_ne = 0,
183 fp_round_pi = 1,
184 fp_round_ni = 2,
185 fp_round_tz = 3,
186 };
187
188 enum fp_denorm {
189 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
190 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
191 fp_denorm_flush = 0x0,
192 fp_denorm_keep = 0x3,
193 };
194
195 struct float_mode {
196 /* matches encoding of the MODE register */
197 union {
198 struct {
199 fp_round round32:2;
200 fp_round round16_64:2;
201 unsigned denorm32:2;
202 unsigned denorm16_64:2;
203 };
204 struct {
205 uint8_t round:4;
206 uint8_t denorm:4;
207 };
208 uint8_t val = 0;
209 };
210 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
211 bool preserve_signed_zero_inf_nan32:1;
212 bool preserve_signed_zero_inf_nan16_64:1;
213 /* if false, optimizations which may remove denormal flushing can be done */
214 bool must_flush_denorms32:1;
215 bool must_flush_denorms16_64:1;
216 bool care_about_round32:1;
217 bool care_about_round16_64:1;
218
219 /* Returns true if instructions using the mode "other" can safely use the
220 * current one instead. */
221 bool canReplace(float_mode other) const noexcept {
222 return val == other.val &&
223 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
224 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
225 (must_flush_denorms32 || !other.must_flush_denorms32) &&
226 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
227 (care_about_round32 || !other.care_about_round32) &&
228 (care_about_round16_64 || !other.care_about_round16_64);
229 }
230 };
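/* Illustrative sketch (added for this listing) of the canReplace() rule: a
* stricter mode can stand in for a more relaxed one, but not the reverse. */
static inline bool example_fp_mode_replace()
{
float_mode strict{}, relaxed{};
strict.care_about_round32 = true;
return strict.canReplace(relaxed) && !relaxed.canReplace(strict);
}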
231
232 constexpr Format asVOP3(Format format) {
233 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
234 }
235
236 constexpr Format asSDWA(Format format) {
237 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
238 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
239 }
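/* Compile-time illustrations (added for this listing, not part of the
* original header): the Format bits combine as described above, so a VOP2
* instruction promoted to the VOP3A encoding keeps its VOP2 bit, and the
* VOP3 bit can be tested independently. */
static_assert(((uint16_t)asVOP3(Format::VOP2) & (uint16_t)Format::VOP2) == (uint16_t)Format::VOP2,
"asVOP3 keeps the base VOP2 bit");
static_assert(((uint16_t)asVOP3(Format::VOP2) & (uint16_t)Format::VOP3) == (uint16_t)Format::VOP3,
"asVOP3 sets the VOP3 bit");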
240
241 enum class RegType {
242 none = 0,
243 sgpr,
244 vgpr,
245 linear_vgpr,
246 };
247
248 struct RegClass {
249
250 enum RC : uint8_t {
251 s1 = 1,
252 s2 = 2,
253 s3 = 3,
254 s4 = 4,
255 s6 = 6,
256 s8 = 8,
257 s16 = 16,
258 v1 = s1 | (1 << 5),
259 v2 = s2 | (1 << 5),
260 v3 = s3 | (1 << 5),
261 v4 = s4 | (1 << 5),
262 v5 = 5 | (1 << 5),
263 v6 = 6 | (1 << 5),
264 v7 = 7 | (1 << 5),
265 v8 = 8 | (1 << 5),
266 /* byte-sized register class */
267 v1b = v1 | (1 << 7),
268 v2b = v2 | (1 << 7),
269 v3b = v3 | (1 << 7),
270 v4b = v4 | (1 << 7),
271 v6b = v6 | (1 << 7),
272 v8b = v8 | (1 << 7),
273 /* these are used for WWM and spills to vgpr */
274 v1_linear = v1 | (1 << 6),
275 v2_linear = v2 | (1 << 6),
276 };
277
278 RegClass() = default;
279 constexpr RegClass(RC rc)
280 : rc(rc) {}
281 constexpr RegClass(RegType type, unsigned size)
282 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
283
284 constexpr operator RC() const { return rc; }
285 explicit operator bool() = delete;
286
287 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
288 constexpr bool is_subdword() const { return rc & (1 << 7); }
289 constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
290 //TODO: use size() less in favor of bytes()
291 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
292 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
293 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
294 constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
295
296 static constexpr RegClass get(RegType type, unsigned bytes) {
297 if (type == RegType::sgpr) {
298 return RegClass(type, DIV_ROUND_UP(bytes, 4u));
299 } else {
300 return bytes % 4u ? RegClass(type, bytes).as_subdword() :
301 RegClass(type, bytes / 4u);
302 }
303 }
304
305 private:
306 RC rc;
307 };
308
309 /* transitional helper expressions */
310 static constexpr RegClass s1{RegClass::s1};
311 static constexpr RegClass s2{RegClass::s2};
312 static constexpr RegClass s3{RegClass::s3};
313 static constexpr RegClass s4{RegClass::s4};
314 static constexpr RegClass s8{RegClass::s8};
315 static constexpr RegClass s16{RegClass::s16};
316 static constexpr RegClass v1{RegClass::v1};
317 static constexpr RegClass v2{RegClass::v2};
318 static constexpr RegClass v3{RegClass::v3};
319 static constexpr RegClass v4{RegClass::v4};
320 static constexpr RegClass v5{RegClass::v5};
321 static constexpr RegClass v6{RegClass::v6};
322 static constexpr RegClass v7{RegClass::v7};
323 static constexpr RegClass v8{RegClass::v8};
324 static constexpr RegClass v1b{RegClass::v1b};
325 static constexpr RegClass v2b{RegClass::v2b};
326 static constexpr RegClass v3b{RegClass::v3b};
327 static constexpr RegClass v4b{RegClass::v4b};
328 static constexpr RegClass v6b{RegClass::v6b};
329 static constexpr RegClass v8b{RegClass::v8b};
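/* Compile-time illustrations (added for this listing) of the encoding
* described above: bit 5 marks VGPRs, bit 6 linear VGPRs and bit 7
* sub-dword classes. */
static_assert(v1.type() == RegType::vgpr && v1.bytes() == 4, "v1 is one 32-bit VGPR");
static_assert(s2.type() == RegType::sgpr && s2.size() == 2, "s2 is an SGPR pair");
static_assert(v2b.is_subdword() && v2b.bytes() == 2, "v2b covers two bytes of a VGPR");
static_assert(v1.as_linear() == RegClass::v1_linear, "bit 6 selects the linear variant");
static_assert(RegClass::get(RegType::vgpr, 6) == v6b, "byte counts not divisible by four yield sub-dword classes");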
330
331 /**
332 * Temp Class
333 * Each temporary virtual register has a
334 * register class (i.e. size and type)
335 * and SSA id.
336 */
337 struct Temp {
338 Temp() noexcept : id_(0), reg_class(0) {}
339 constexpr Temp(uint32_t id, RegClass cls) noexcept
340 : id_(id), reg_class(uint8_t(cls)) {}
341
342 constexpr uint32_t id() const noexcept { return id_; }
343 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
344
345 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
346 constexpr unsigned size() const noexcept { return regClass().size(); }
347 constexpr RegType type() const noexcept { return regClass().type(); }
348 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
349
350 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
351 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
352 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
353
354 private:
355 uint32_t id_: 24;
356 uint32_t reg_class : 8;
357 };
358
359 /**
360 * PhysReg
361 * Represents the physical register for each
362 * Operand and Definition.
363 */
364 struct PhysReg {
365 constexpr PhysReg() = default;
366 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
367 constexpr unsigned reg() const { return reg_b >> 2; }
368 constexpr unsigned byte() const { return reg_b & 0x3; }
369 constexpr operator unsigned() const { return reg(); }
370 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
371 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
372 constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
373 constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }
374
375 uint16_t reg_b = 0;
376 };
377
378 /* helper expressions for special registers */
379 static constexpr PhysReg m0{124};
380 static constexpr PhysReg vcc{106};
381 static constexpr PhysReg vcc_hi{107};
382 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
383 static constexpr PhysReg exec{126};
384 static constexpr PhysReg exec_lo{126};
385 static constexpr PhysReg exec_hi{127};
386 static constexpr PhysReg vccz{251};
387 static constexpr PhysReg execz{252};
388 static constexpr PhysReg scc{253};
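/* Compile-time illustration (added for this listing): PhysReg stores a byte
* address, so v0 (register 256) advanced by two bytes stays in register 256
* with byte() == 2, as used for sub-dword operands. */
static_assert(PhysReg(256).advance(2).reg() == 256, "advancing within a dword keeps the register");
static_assert(PhysReg(256).advance(2).byte() == 2, "the low two bits hold the byte offset");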
389
390 /**
391 * Operand Class
392 * Initially, each Operand refers to either
393 * a temporary virtual register
394 * or to a constant value
395 * Temporary registers get mapped to physical register during RA
396 * Constant values are inlined into the instruction sequence.
397 */
398 class Operand final
399 {
400 public:
401 constexpr Operand()
402 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
403 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
404 isLateKill_(false) {}
405
406 explicit Operand(Temp r) noexcept
407 {
408 data_.temp = r;
409 if (r.id()) {
410 isTemp_ = true;
411 } else {
412 isUndef_ = true;
413 setFixed(PhysReg{128});
414 }
415 };
416 explicit Operand(uint8_t v) noexcept
417 {
418 /* 8-bit constants are only used for copies, and a copy from any 8-bit
419 * constant can be implemented with an SDWA v_mul_u32_u24. So consider all
420 * of them to be inline constants. */
421 data_.i = v;
422 isConstant_ = true;
423 constSize = 0;
424 setFixed(PhysReg{0u});
425 };
426 explicit Operand(uint16_t v) noexcept
427 {
428 data_.i = v;
429 isConstant_ = true;
430 constSize = 1;
431 if (v <= 64)
432 setFixed(PhysReg{128u + v});
433 else if (v >= 0xFFF0) /* [-16 .. -1] */
434 setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
435 else if (v == 0x3800) /* 0.5 */
436 setFixed(PhysReg{240});
437 else if (v == 0xB800) /* -0.5 */
438 setFixed(PhysReg{241});
439 else if (v == 0x3C00) /* 1.0 */
440 setFixed(PhysReg{242});
441 else if (v == 0xBC00) /* -1.0 */
442 setFixed(PhysReg{243});
443 else if (v == 0x4000) /* 2.0 */
444 setFixed(PhysReg{244});
445 else if (v == 0xC000) /* -2.0 */
446 setFixed(PhysReg{245});
447 else if (v == 0x4400) /* 4.0 */
448 setFixed(PhysReg{246});
449 else if (v == 0xC400) /* -4.0 */
450 setFixed(PhysReg{247});
451 else if (v == 0x3118) /* 1/2 PI */
452 setFixed(PhysReg{248});
453 else /* Literal Constant */
454 setFixed(PhysReg{255});
455 };
456 explicit Operand(uint32_t v, bool is64bit = false) noexcept
457 {
458 data_.i = v;
459 isConstant_ = true;
460 constSize = is64bit ? 3 : 2;
461 if (v <= 64)
462 setFixed(PhysReg{128 + v});
463 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
464 setFixed(PhysReg{192 - v});
465 else if (v == 0x3f000000) /* 0.5 */
466 setFixed(PhysReg{240});
467 else if (v == 0xbf000000) /* -0.5 */
468 setFixed(PhysReg{241});
469 else if (v == 0x3f800000) /* 1.0 */
470 setFixed(PhysReg{242});
471 else if (v == 0xbf800000) /* -1.0 */
472 setFixed(PhysReg{243});
473 else if (v == 0x40000000) /* 2.0 */
474 setFixed(PhysReg{244});
475 else if (v == 0xc0000000) /* -2.0 */
476 setFixed(PhysReg{245});
477 else if (v == 0x40800000) /* 4.0 */
478 setFixed(PhysReg{246});
479 else if (v == 0xc0800000) /* -4.0 */
480 setFixed(PhysReg{247});
481 else { /* Literal Constant */
482 assert(!is64bit && "attempt to create a 64-bit literal constant");
483 setFixed(PhysReg{255});
484 }
485 };
486 explicit Operand(uint64_t v) noexcept
487 {
488 isConstant_ = true;
489 constSize = 3;
490 if (v <= 64) {
491 data_.i = (uint32_t) v;
492 setFixed(PhysReg{128 + (uint32_t) v});
493 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
494 data_.i = (uint32_t) v;
495 setFixed(PhysReg{192 - (uint32_t) v});
496 } else if (v == 0x3FE0000000000000) { /* 0.5 */
497 data_.i = 0x3f000000;
498 setFixed(PhysReg{240});
499 } else if (v == 0xBFE0000000000000) { /* -0.5 */
500 data_.i = 0xbf000000;
501 setFixed(PhysReg{241});
502 } else if (v == 0x3FF0000000000000) { /* 1.0 */
503 data_.i = 0x3f800000;
504 setFixed(PhysReg{242});
505 } else if (v == 0xBFF0000000000000) { /* -1.0 */
506 data_.i = 0xbf800000;
507 setFixed(PhysReg{243});
508 } else if (v == 0x4000000000000000) { /* 2.0 */
509 data_.i = 0x40000000;
510 setFixed(PhysReg{244});
511 } else if (v == 0xC000000000000000) { /* -2.0 */
512 data_.i = 0xc0000000;
513 setFixed(PhysReg{245});
514 } else if (v == 0x4010000000000000) { /* 4.0 */
515 data_.i = 0x40800000;
516 setFixed(PhysReg{246});
517 } else if (v == 0xC010000000000000) { /* -4.0 */
518 data_.i = 0xc0800000;
519 setFixed(PhysReg{247});
520 } else { /* Literal Constant: we don't know if it is a long or double.*/
521 isConstant_ = 0;
522 assert(false && "attempt to create a 64-bit literal constant");
523 }
524 };
525 explicit Operand(RegClass type) noexcept
526 {
527 isUndef_ = true;
528 data_.temp = Temp(0, type);
529 setFixed(PhysReg{128});
530 };
531 explicit Operand(PhysReg reg, RegClass type) noexcept
532 {
533 data_.temp = Temp(0, type);
534 setFixed(reg);
535 }
536
537 constexpr bool isTemp() const noexcept
538 {
539 return isTemp_;
540 }
541
542 constexpr void setTemp(Temp t) noexcept {
543 assert(!isConstant_);
544 isTemp_ = true;
545 data_.temp = t;
546 }
547
548 constexpr Temp getTemp() const noexcept
549 {
550 return data_.temp;
551 }
552
553 constexpr uint32_t tempId() const noexcept
554 {
555 return data_.temp.id();
556 }
557
558 constexpr bool hasRegClass() const noexcept
559 {
560 return isTemp() || isUndefined();
561 }
562
563 constexpr RegClass regClass() const noexcept
564 {
565 return data_.temp.regClass();
566 }
567
568 constexpr unsigned bytes() const noexcept
569 {
570 if (isConstant())
571 return 1 << constSize;
572 else
573 return data_.temp.bytes();
574 }
575
576 constexpr unsigned size() const noexcept
577 {
578 if (isConstant())
579 return constSize > 2 ? 2 : 1;
580 else
581 return data_.temp.size();
582 }
583
584 constexpr bool isFixed() const noexcept
585 {
586 return isFixed_;
587 }
588
589 constexpr PhysReg physReg() const noexcept
590 {
591 return reg_;
592 }
593
594 constexpr void setFixed(PhysReg reg) noexcept
595 {
596 isFixed_ = reg != unsigned(-1);
597 reg_ = reg;
598 }
599
600 constexpr bool isConstant() const noexcept
601 {
602 return isConstant_;
603 }
604
605 constexpr bool isLiteral() const noexcept
606 {
607 return isConstant() && reg_ == 255;
608 }
609
610 constexpr bool isUndefined() const noexcept
611 {
612 return isUndef_;
613 }
614
615 constexpr uint32_t constantValue() const noexcept
616 {
617 return data_.i;
618 }
619
620 constexpr bool constantEquals(uint32_t cmp) const noexcept
621 {
622 return isConstant() && constantValue() == cmp;
623 }
624
625 constexpr uint64_t constantValue64(bool signext=false) const noexcept
626 {
627 if (constSize == 3) {
628 if (reg_ <= 192)
629 return reg_ - 128;
630 else if (reg_ <= 208)
631 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
632
633 switch (reg_) {
634 case 240:
635 return 0x3FE0000000000000;
636 case 241:
637 return 0xBFE0000000000000;
638 case 242:
639 return 0x3FF0000000000000;
640 case 243:
641 return 0xBFF0000000000000;
642 case 244:
643 return 0x4000000000000000;
644 case 245:
645 return 0xC000000000000000;
646 case 246:
647 return 0x4010000000000000;
648 case 247:
649 return 0xC010000000000000;
650 }
651 } else if (constSize == 1) {
652 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
653 } else if (constSize == 0) {
654 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
655 }
656 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
657 }
658
659 constexpr bool isOfType(RegType type) const noexcept
660 {
661 return hasRegClass() && regClass().type() == type;
662 }
663
664 /* Indicates that the killed operand's live range intersects with the
665 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
666 * not set by liveness analysis. */
667 constexpr void setLateKill(bool flag) noexcept
668 {
669 isLateKill_ = flag;
670 }
671
672 constexpr bool isLateKill() const noexcept
673 {
674 return isLateKill_;
675 }
676
677 constexpr void setKill(bool flag) noexcept
678 {
679 isKill_ = flag;
680 if (!flag)
681 setFirstKill(false);
682 }
683
684 constexpr bool isKill() const noexcept
685 {
686 return isKill_ || isFirstKill();
687 }
688
689 constexpr void setFirstKill(bool flag) noexcept
690 {
691 isFirstKill_ = flag;
692 if (flag)
693 setKill(flag);
694 }
695
696 /* When there are multiple operands killing the same temporary,
697 * isFirstKill() only returns true for the first one. */
698 constexpr bool isFirstKill() const noexcept
699 {
700 return isFirstKill_;
701 }
702
703 constexpr bool isKillBeforeDef() const noexcept
704 {
705 return isKill() && !isLateKill();
706 }
707
708 constexpr bool isFirstKillBeforeDef() const noexcept
709 {
710 return isFirstKill() && !isLateKill();
711 }
712
713 constexpr bool operator == (Operand other) const noexcept
714 {
715 if (other.size() != size())
716 return false;
717 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
718 return false;
719 if (isFixed() && other.isFixed() && physReg() != other.physReg())
720 return false;
721 if (isLiteral())
722 return other.isLiteral() && other.constantValue() == constantValue();
723 else if (isConstant())
724 return other.isConstant() && other.physReg() == physReg();
725 else if (isUndefined())
726 return other.isUndefined() && other.regClass() == regClass();
727 else
728 return other.isTemp() && other.getTemp() == getTemp();
729 }
730 private:
731 union {
732 uint32_t i;
733 float f;
734 Temp temp = Temp(0, s1);
735 } data_;
736 PhysReg reg_;
737 union {
738 struct {
739 uint8_t isTemp_:1;
740 uint8_t isFixed_:1;
741 uint8_t isConstant_:1;
742 uint8_t isKill_:1;
743 uint8_t isUndef_:1;
744 uint8_t isFirstKill_:1;
745 uint8_t constSize:2;
746 uint8_t isLateKill_:1;
747 };
748 /* can't initialize bit-fields in c++11, so work around using a union */
749 uint16_t control_ = 0;
750 };
751 };
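/* Illustrative sketch (added for this listing): the constant constructors
* above map small integers and common float bit-patterns to inline-constant
* registers and fall back to the literal register 255 otherwise. */
static inline void example_operand_constants()
{
Operand a(4u); /* inline constant 4: fixed to PhysReg{132} */
Operand b(0x3f800000u); /* 1.0f: fixed to PhysReg{242} */
Operand c(123456u); /* no inline encoding: literal, PhysReg{255} */
assert(!a.isLiteral() && !b.isLiteral() && c.isLiteral());
(void)a; (void)b; (void)c;
}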
752
753 /**
754 * Definition Class
755 * Definitions are the results of Instructions
756 * and refer to temporary virtual registers
757 * which are later mapped to physical registers
758 */
759 class Definition final
760 {
761 public:
762 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
763 isKill_(0), isPrecise_(0), isNUW_(0) {}
764 Definition(uint32_t index, RegClass type) noexcept
765 : temp(index, type) {}
766 explicit Definition(Temp tmp) noexcept
767 : temp(tmp) {}
768 Definition(PhysReg reg, RegClass type) noexcept
769 : temp(Temp(0, type))
770 {
771 setFixed(reg);
772 }
773 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
774 : temp(Temp(tmpId, type))
775 {
776 setFixed(reg);
777 }
778
779 constexpr bool isTemp() const noexcept
780 {
781 return tempId() > 0;
782 }
783
784 constexpr Temp getTemp() const noexcept
785 {
786 return temp;
787 }
788
789 constexpr uint32_t tempId() const noexcept
790 {
791 return temp.id();
792 }
793
794 constexpr void setTemp(Temp t) noexcept {
795 temp = t;
796 }
797
798 constexpr RegClass regClass() const noexcept
799 {
800 return temp.regClass();
801 }
802
803 constexpr unsigned bytes() const noexcept
804 {
805 return temp.bytes();
806 }
807
808 constexpr unsigned size() const noexcept
809 {
810 return temp.size();
811 }
812
813 constexpr bool isFixed() const noexcept
814 {
815 return isFixed_;
816 }
817
818 constexpr PhysReg physReg() const noexcept
819 {
820 return reg_;
821 }
822
823 constexpr void setFixed(PhysReg reg) noexcept
824 {
825 isFixed_ = 1;
826 reg_ = reg;
827 }
828
829 constexpr void setHint(PhysReg reg) noexcept
830 {
831 hasHint_ = 1;
832 reg_ = reg;
833 }
834
835 constexpr bool hasHint() const noexcept
836 {
837 return hasHint_;
838 }
839
840 constexpr void setKill(bool flag) noexcept
841 {
842 isKill_ = flag;
843 }
844
845 constexpr bool isKill() const noexcept
846 {
847 return isKill_;
848 }
849
850 constexpr void setPrecise(bool precise) noexcept
851 {
852 isPrecise_ = precise;
853 }
854
855 constexpr bool isPrecise() const noexcept
856 {
857 return isPrecise_;
858 }
859
860 /* No Unsigned Wrap */
861 constexpr void setNUW(bool nuw) noexcept
862 {
863 isNUW_ = nuw;
864 }
865
866 constexpr bool isNUW() const noexcept
867 {
868 return isNUW_;
869 }
870
871 private:
872 Temp temp = Temp(0, s1);
873 PhysReg reg_;
874 union {
875 struct {
876 uint8_t isFixed_:1;
877 uint8_t hasHint_:1;
878 uint8_t isKill_:1;
879 uint8_t isPrecise_:1;
880 uint8_t isNUW_:1;
881 };
882 /* can't initialize bit-fields in c++11, so work around using a union */
883 uint8_t control_ = 0;
884 };
885 };
886
887 struct Block;
888
889 struct Instruction {
890 aco_opcode opcode;
891 Format format;
892 uint32_t pass_flags;
893
894 aco::span<Operand> operands;
895 aco::span<Definition> definitions;
896
897 constexpr bool isVALU() const noexcept
898 {
899 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
900 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
901 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
902 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
903 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
904 || format == Format::VOP3P;
905 }
906
907 constexpr bool isSALU() const noexcept
908 {
909 return format == Format::SOP1 ||
910 format == Format::SOP2 ||
911 format == Format::SOPC ||
912 format == Format::SOPK ||
913 format == Format::SOPP;
914 }
915
916 constexpr bool isVMEM() const noexcept
917 {
918 return format == Format::MTBUF ||
919 format == Format::MUBUF ||
920 format == Format::MIMG;
921 }
922
923 constexpr bool isDPP() const noexcept
924 {
925 return (uint16_t) format & (uint16_t) Format::DPP;
926 }
927
928 constexpr bool isVOP3() const noexcept
929 {
930 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
931 ((uint16_t) format & (uint16_t) Format::VOP3B);
932 }
933
934 constexpr bool isSDWA() const noexcept
935 {
936 return (uint16_t) format & (uint16_t) Format::SDWA;
937 }
938
939 constexpr bool isFlatOrGlobal() const noexcept
940 {
941 return format == Format::FLAT || format == Format::GLOBAL;
942 }
943
944 constexpr bool usesModifiers() const noexcept;
945
946 constexpr bool reads_exec() const noexcept
947 {
948 for (const Operand& op : operands) {
949 if (op.isFixed() && op.physReg() == exec)
950 return true;
951 }
952 return false;
953 }
954 };
955 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
956
957 struct SOPK_instruction : public Instruction {
958 uint16_t imm;
959 uint16_t padding;
960 };
961 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
962
963 struct SOPP_instruction : public Instruction {
964 uint32_t imm;
965 int block;
966 };
967 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
968
969 struct SOPC_instruction : public Instruction {
970 };
971 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
972
973 struct SOP1_instruction : public Instruction {
974 };
975 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
976
977 struct SOP2_instruction : public Instruction {
978 };
979 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
980
981 /**
982 * Scalar Memory Format:
983 * For s_(buffer_)load_dword*:
984 * Operand(0): SBASE - SGPR-pair which provides base address
985 * Operand(1): Offset - immediate (un)signed offset or SGPR
986 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
987 * Operand(n-1): SOffset - SGPR offset (Vega only)
988 *
989 * Having no operands is also valid for instructions such as s_dcache_inv.
990 *
991 */
992 struct SMEM_instruction : public Instruction {
993 memory_sync_info sync;
994 bool glc : 1; /* VI+: globally coherent */
995 bool dlc : 1; /* NAVI: device level coherent */
996 bool nv : 1; /* VEGA only: Non-volatile */
997 bool disable_wqm : 1;
998 bool prevent_overflow : 1; /* avoid overflow when combining additions */
999 uint32_t padding: 3;
1000 };
1001 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1002
1003 struct VOP1_instruction : public Instruction {
1004 };
1005 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1006
1007 struct VOP2_instruction : public Instruction {
1008 };
1009 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1010
1011 struct VOPC_instruction : public Instruction {
1012 };
1013 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1014
1015 struct VOP3A_instruction : public Instruction {
1016 bool abs[3];
1017 bool neg[3];
1018 uint8_t opsel : 4;
1019 uint8_t omod : 2;
1020 bool clamp : 1;
1021 uint32_t padding : 9;
1022 };
1023 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1024
1025 struct VOP3P_instruction : public Instruction {
1026 bool neg_lo[3];
1027 bool neg_hi[3];
1028 uint8_t opsel_lo : 3;
1029 uint8_t opsel_hi : 3;
1030 bool clamp : 1;
1031 uint32_t padding : 9;
1032 };
1033 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1034
1035 /**
1036 * Data Parallel Primitives Format:
1037 * This format can be used for VOP1, VOP2 or VOPC instructions.
1038 * The swizzle applies to the src0 operand.
1039 *
1040 */
1041 struct DPP_instruction : public Instruction {
1042 bool abs[2];
1043 bool neg[2];
1044 uint16_t dpp_ctrl;
1045 uint8_t row_mask : 4;
1046 uint8_t bank_mask : 4;
1047 bool bound_ctrl : 1;
1048 uint32_t padding : 7;
1049 };
1050 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1051
1052 enum sdwa_sel : uint8_t {
1053 /* masks */
1054 sdwa_wordnum = 0x1,
1055 sdwa_bytenum = 0x3,
1056 sdwa_asuint = 0x7 | 0x10,
1057 sdwa_rasize = 0x3,
1058
1059 /* flags */
1060 sdwa_isword = 0x4,
1061 sdwa_sext = 0x8,
1062 sdwa_isra = 0x10,
1063
1064 /* specific values */
1065 sdwa_ubyte0 = 0,
1066 sdwa_ubyte1 = 1,
1067 sdwa_ubyte2 = 2,
1068 sdwa_ubyte3 = 3,
1069 sdwa_uword0 = sdwa_isword | 0,
1070 sdwa_uword1 = sdwa_isword | 1,
1071 sdwa_udword = 6,
1072
1073 sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
1074 sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
1075 sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
1076 sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
1077 sdwa_sword0 = sdwa_uword0 | sdwa_sext,
1078 sdwa_sword1 = sdwa_uword1 | sdwa_sext,
1079 sdwa_sdword = sdwa_udword | sdwa_sext,
1080
1081 /* register-allocated */
1082 sdwa_ubyte = 1 | sdwa_isra,
1083 sdwa_uword = 2 | sdwa_isra,
1084 sdwa_sbyte = sdwa_ubyte | sdwa_sext,
1085 sdwa_sword = sdwa_uword | sdwa_sext,
1086 };
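/* Compile-time illustrations (added for this listing): the selectors compose
* from the masks and flags above, e.g. a signed word selector is the unsigned
* one plus the sign-extension flag. */
static_assert(sdwa_sword1 == (sdwa_isword | sdwa_wordnum | sdwa_sext), "sword1 = word #1 + sext");
static_assert((sdwa_sbyte2 & sdwa_asuint) == sdwa_ubyte2, "masking with sdwa_asuint drops the sext flag");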
1087
1088 /**
1089 * Sub-Dword Addressing Format:
1090 * This format can be used for VOP1, VOP2 or VOPC instructions.
1091 *
1092 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1093 * the definition doesn't have to be VCC on GFX9+.
1094 *
1095 */
1096 struct SDWA_instruction : public Instruction {
1097 /* these destination modifiers aren't available with VOPC except for
1098 * clamp on GFX8 */
1099 uint8_t sel[2];
1100 uint8_t dst_sel;
1101 bool neg[2];
1102 bool abs[2];
1103 bool dst_preserve : 1;
1104 bool clamp : 1;
1105 uint8_t omod : 2; /* GFX9+ */
1106 uint32_t padding : 4;
1107 };
1108 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1109
1110 struct Interp_instruction : public Instruction {
1111 uint8_t attribute;
1112 uint8_t component;
1113 uint16_t padding;
1114 };
1115 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1116
1117 /**
1118 * Local and Global Data Sharing instructions
1119 * Operand(0): ADDR - VGPR which supplies the address.
1120 * Operand(1): DATA0 - First data VGPR.
1121 * Operand(2): DATA1 - Second data VGPR.
1122 * Operand(n-1): M0 - LDS size.
1123 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1124 *
1125 */
1126 struct DS_instruction : public Instruction {
1127 memory_sync_info sync;
1128 bool gds;
1129 int16_t offset0;
1130 int8_t offset1;
1131 uint8_t padding;
1132 };
1133 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1134
1135 /**
1136 * Vector Memory Untyped-buffer Instructions
1137 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1138 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1139 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1140 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1141 *
1142 */
1143 struct MUBUF_instruction : public Instruction {
1144 memory_sync_info sync;
1145 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1146 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1147 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1148 bool glc : 1; /* globally coherent */
1149 bool dlc : 1; /* NAVI: device level coherent */
1150 bool slc : 1; /* system level coherent */
1151 bool tfe : 1; /* texture fail enable */
1152 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1153 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1154 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1155 bool swizzled : 1;
1156 uint32_t padding1 : 18;
1157 };
1158 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1159
1160 /**
1161 * Vector Memory Typed-buffer Instructions
1162 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1163 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1164 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1165 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1166 *
1167 */
1168 struct MTBUF_instruction : public Instruction {
1169 memory_sync_info sync;
1170 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1171 uint8_t nfmt : 3; /* Numeric format of data in memory */
1172 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1173 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1174 bool glc : 1; /* globally coherent */
1175 bool dlc : 1; /* NAVI: device level coherent */
1176 bool slc : 1; /* system level coherent */
1177 bool tfe : 1; /* texture fail enable */
1178 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1179 uint32_t padding : 10;
1180 uint16_t offset; /* Unsigned byte offset - 12 bit */
1181 };
1182 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1183
1184 /**
1185 * Vector Memory Image Instructions
1186 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1187 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1188 * or VDATA - Vector GPR for write data.
1189 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1190 * Definition(0): VDATA - Vector GPR for read result.
1191 *
1192 */
1193 struct MIMG_instruction : public Instruction {
1194 memory_sync_info sync;
1195 uint8_t dmask; /* Data VGPR enable mask */
1196 uint8_t dim : 3; /* NAVI: dimensionality */
1197 bool unrm : 1; /* Force address to be un-normalized */
1198 bool dlc : 1; /* NAVI: device level coherent */
1199 bool glc : 1; /* globally coherent */
1200 bool slc : 1; /* system level coherent */
1201 bool tfe : 1; /* texture fail enable */
1202 bool da : 1; /* declare an array */
1203 bool lwe : 1; /* LOD warning enable */
1204 bool r128 : 1; /* NAVI: Texture resource size */
1205 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1206 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1207 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1208 uint32_t padding : 18;
1209 };
1210 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1211
1212 /**
1213 * Flat/Scratch/Global Instructions
1214 * Operand(0): ADDR
1215 * Operand(1): SADDR
1216 * Operand(2) / Definition(0): DATA/VDST
1217 *
1218 */
1219 struct FLAT_instruction : public Instruction {
1220 memory_sync_info sync;
1221 bool slc : 1; /* system level coherent */
1222 bool glc : 1; /* globally coherent */
1223 bool dlc : 1; /* NAVI: device level coherent */
1224 bool lds : 1;
1225 bool nv : 1;
1226 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1227 uint32_t padding0 : 2;
1228 uint16_t offset; /* Vega/Navi only */
1229 uint16_t padding1;
1230 };
1231 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1232
1233 struct Export_instruction : public Instruction {
1234 uint8_t enabled_mask;
1235 uint8_t dest;
1236 bool compressed : 1;
1237 bool done : 1;
1238 bool valid_mask : 1;
1239 uint32_t padding : 13;
1240 };
1241 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1242
1243 struct Pseudo_instruction : public Instruction {
1244 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1245 bool tmp_in_scc;
1246 uint8_t padding;
1247 };
1248 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1249
1250 struct Pseudo_branch_instruction : public Instruction {
1251 /* target[0] is the block index of the branch target.
1252 * For conditional branches, target[1] contains the fall-through alternative.
1253 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1254 */
1255 uint32_t target[2];
1256 };
1257 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1258
1259 struct Pseudo_barrier_instruction : public Instruction {
1260 memory_sync_info sync;
1261 sync_scope exec_scope;
1262 };
1263 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1264
1265 enum ReduceOp : uint16_t {
1266 iadd8, iadd16, iadd32, iadd64,
1267 imul8, imul16, imul32, imul64,
1268 fadd16, fadd32, fadd64,
1269 fmul16, fmul32, fmul64,
1270 imin8, imin16, imin32, imin64,
1271 imax8, imax16, imax32, imax64,
1272 umin8, umin16, umin32, umin64,
1273 umax8, umax16, umax32, umax64,
1274 fmin16, fmin32, fmin64,
1275 fmax16, fmax32, fmax64,
1276 iand8, iand16, iand32, iand64,
1277 ior8, ior16, ior32, ior64,
1278 ixor8, ixor16, ixor32, ixor64,
1279 };
1280
1281 /**
1282 * Subgroup Reduction Instructions: every operand except the data to be
1283 * reduced and the result is inserted by setup_reduce_temp().
1284 * Operand(0): data to be reduced
1285 * Operand(1): reduce temporary
1286 * Operand(2): vector temporary
1287 * Definition(0): result
1288 * Definition(1): scalar temporary
1289 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1290 * Definition(3): scc clobber
1291 * Definition(4): vcc clobber
1292 *
1293 */
1294 struct Pseudo_reduction_instruction : public Instruction {
1295 ReduceOp reduce_op;
1296 uint16_t cluster_size; // must be 0 for scans
1297 };
1298 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1299
1300 struct instr_deleter_functor {
1301 void operator()(void* p) {
1302 free(p);
1303 }
1304 };
1305
1306 template<typename T>
1307 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1308
1309 template<typename T>
1310 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1311 {
1312 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1313 char *data = (char*) calloc(1, size);
1314 T* inst = (T*) data;
1315
1316 inst->opcode = opcode;
1317 inst->format = format;
1318
1319 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1320 inst->operands = aco::span<Operand>(operands_offset, num_operands);
1321 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1322 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1323
1324 return inst;
1325 }
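/* Illustrative usage (added for this listing, not part of the original
* header): building a v_add_f32 with two operands and one definition. The
* destination and source temporaries are assumed to come from the caller. */
static inline aco_ptr<Instruction> example_create_v_add(Temp dst, Temp src0, Temp src1)
{
VOP2_instruction *add = create_instruction<VOP2_instruction>(aco_opcode::v_add_f32, Format::VOP2, 2, 1);
add->operands[0] = Operand(src0);
add->operands[1] = Operand(src1);
add->definitions[0] = Definition(dst);
return aco_ptr<Instruction>(add);
}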
1326
1327 constexpr bool Instruction::usesModifiers() const noexcept
1328 {
1329 if (isDPP() || isSDWA())
1330 return true;
1331
1332 if (format == Format::VOP3P) {
1333 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1334 for (unsigned i = 0; i < operands.size(); i++) {
1335 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1336 return true;
1337 }
1338 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1339 } else if (isVOP3()) {
1340 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1341 for (unsigned i = 0; i < operands.size(); i++) {
1342 if (vop3->abs[i] || vop3->neg[i])
1343 return true;
1344 }
1345 return vop3->opsel || vop3->clamp || vop3->omod;
1346 }
1347 return false;
1348 }
1349
1350 constexpr bool is_phi(Instruction* instr)
1351 {
1352 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1353 }
1354
1355 static inline bool is_phi(aco_ptr<Instruction>& instr)
1356 {
1357 return is_phi(instr.get());
1358 }
1359
1360 memory_sync_info get_sync_info(const Instruction* instr);
1361
1362 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1363
1364 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1365 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1366 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1367 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1368
1369 enum block_kind {
1370 /* uniform indicates that leaving this block,
1371 * all active lanes stay active */
1372 block_kind_uniform = 1 << 0,
1373 block_kind_top_level = 1 << 1,
1374 block_kind_loop_preheader = 1 << 2,
1375 block_kind_loop_header = 1 << 3,
1376 block_kind_loop_exit = 1 << 4,
1377 block_kind_continue = 1 << 5,
1378 block_kind_break = 1 << 6,
1379 block_kind_continue_or_break = 1 << 7,
1380 block_kind_discard = 1 << 8,
1381 block_kind_branch = 1 << 9,
1382 block_kind_merge = 1 << 10,
1383 block_kind_invert = 1 << 11,
1384 block_kind_uses_discard_if = 1 << 12,
1385 block_kind_needs_lowering = 1 << 13,
1386 block_kind_uses_demote = 1 << 14,
1387 block_kind_export_end = 1 << 15,
1388 };
1389
1390
1391 struct RegisterDemand {
1392 constexpr RegisterDemand() = default;
1393 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1394 : vgpr{v}, sgpr{s} {}
1395 int16_t vgpr = 0;
1396 int16_t sgpr = 0;
1397
1398 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1399 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1400 }
1401
1402 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1403 return vgpr > other.vgpr || sgpr > other.sgpr;
1404 }
1405
1406 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1407 if (t.type() == RegType::sgpr)
1408 return RegisterDemand( vgpr, sgpr + t.size() );
1409 else
1410 return RegisterDemand( vgpr + t.size(), sgpr );
1411 }
1412
1413 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1414 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1415 }
1416
1417 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1418 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1419 }
1420
1421 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1422 vgpr += other.vgpr;
1423 sgpr += other.sgpr;
1424 return *this;
1425 }
1426
1427 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1428 vgpr -= other.vgpr;
1429 sgpr -= other.sgpr;
1430 return *this;
1431 }
1432
1433 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1434 if (t.type() == RegType::sgpr)
1435 sgpr += t.size();
1436 else
1437 vgpr += t.size();
1438 return *this;
1439 }
1440
1441 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1442 if (t.type() == RegType::sgpr)
1443 sgpr -= t.size();
1444 else
1445 vgpr -= t.size();
1446 return *this;
1447 }
1448
1449 constexpr void update(const RegisterDemand other) noexcept {
1450 vgpr = std::max(vgpr, other.vgpr);
1451 sgpr = std::max(sgpr, other.sgpr);
1452 }
1453
1454 };
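/* Compile-time illustrations (added for this listing): demand arithmetic
* accounts for the register file a temporary lives in, so adding a v2
* temporary raises only the VGPR component. */
static_assert((RegisterDemand(4, 2) + Temp(1, v2)).vgpr == 6, "a v2 temp costs two VGPRs");
static_assert((RegisterDemand(4, 2) + Temp(1, v2)).sgpr == 2, "and no SGPRs");
static_assert(RegisterDemand(4, 2).exceeds(RegisterDemand(4, 1)), "higher SGPR demand exceeds");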
1455
1456 /* CFG */
1457 struct Block {
1458 float_mode fp_mode;
1459 unsigned index;
1460 unsigned offset = 0;
1461 std::vector<aco_ptr<Instruction>> instructions;
1462 std::vector<unsigned> logical_preds;
1463 std::vector<unsigned> linear_preds;
1464 std::vector<unsigned> logical_succs;
1465 std::vector<unsigned> linear_succs;
1466 RegisterDemand register_demand = RegisterDemand();
1467 uint16_t loop_nest_depth = 0;
1468 uint16_t kind = 0;
1469 int logical_idom = -1;
1470 int linear_idom = -1;
1471 Temp live_out_exec = Temp();
1472
1473 /* this information is needed for predecessors to blocks with phis when
1474 * moving out of ssa */
1475 bool scc_live_out = false;
1476 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1477
1478 Block(unsigned idx) : index(idx) {}
1479 Block() : index(0) {}
1480 };
1481
1482 using Stage = uint16_t;
1483
1484 /* software stages */
1485 static constexpr Stage sw_vs = 1 << 0;
1486 static constexpr Stage sw_gs = 1 << 1;
1487 static constexpr Stage sw_tcs = 1 << 2;
1488 static constexpr Stage sw_tes = 1 << 3;
1489 static constexpr Stage sw_fs = 1 << 4;
1490 static constexpr Stage sw_cs = 1 << 5;
1491 static constexpr Stage sw_gs_copy = 1 << 6;
1492 static constexpr Stage sw_mask = 0x7f;
1493
1494 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1495 static constexpr Stage hw_vs = 1 << 7;
1496 static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1497 static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
1498 static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
1499 static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1500 static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1501 static constexpr Stage hw_fs = 1 << 13;
1502 static constexpr Stage hw_cs = 1 << 14;
1503 static constexpr Stage hw_mask = 0xff << 7;
1504
1505 /* possible settings of Program::stage */
1506 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1507 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1508 static constexpr Stage compute_cs = sw_cs | hw_cs;
1509 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1510 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1511 /* GFX10/NGG */
1512 static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
1513 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
1514 static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
1515 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
1516 /* GFX9 (and GFX10 if NGG isn't used) */
1517 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1518 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1519 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1520 /* pre-GFX9 */
1521 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1522 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1523 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1524 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1525 static constexpr Stage geometry_gs = sw_gs | hw_gs;
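/* Compile-time illustration (added for this listing): a merged GFX9 stage
* combines both software stages with a single hardware stage bit. */
static_assert((vertex_geometry_gs & sw_mask) == (sw_vs | sw_gs), "merged VS+GS software stages");
static_assert((vertex_geometry_gs & hw_mask) == hw_gs, "runs on the hardware GS stage");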
1526
1527 enum statistic {
1528 statistic_hash,
1529 statistic_instructions,
1530 statistic_copies,
1531 statistic_branches,
1532 statistic_cycles,
1533 statistic_vmem_clauses,
1534 statistic_smem_clauses,
1535 statistic_vmem_score,
1536 statistic_smem_score,
1537 statistic_sgpr_presched,
1538 statistic_vgpr_presched,
1539 num_statistics
1540 };
1541
1542 class Program final {
1543 public:
1544 float_mode next_fp_mode;
1545 std::vector<Block> blocks;
1546 RegisterDemand max_reg_demand = RegisterDemand();
1547 uint16_t num_waves = 0;
1548 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1549 ac_shader_config* config;
1550 struct radv_shader_info *info;
1551 enum chip_class chip_class;
1552 enum radeon_family family;
1553 unsigned wave_size;
1554 RegClass lane_mask;
1555 Stage stage; /* Stage */
1556 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1557 bool needs_wqm = false; /* there exists a p_wqm instruction */
1558 bool wb_smem_l1_on_end = false;
1559
1560 std::vector<uint8_t> constant_data;
1561 Temp private_segment_buffer;
1562 Temp scratch_offset;
1563
1564 uint16_t min_waves = 0;
1565 uint16_t lds_alloc_granule;
1566 uint32_t lds_limit; /* in bytes */
1567 bool has_16bank_lds;
1568 uint16_t vgpr_limit;
1569 uint16_t sgpr_limit;
1570 uint16_t physical_sgprs;
1571 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1572 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1573 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1574
1575 bool xnack_enabled = false;
1576 bool sram_ecc_enabled = false;
1577 bool has_fast_fma32 = false;
1578
1579 bool needs_vcc = false;
1580 bool needs_flat_scr = false;
1581
1582 bool collect_statistics = false;
1583 uint32_t statistics[num_statistics];
1584
1585 struct {
1586 void (*func)(void *private_data,
1587 enum radv_compiler_debug_level level,
1588 const char *message);
1589 void *private_data;
1590 } debug;
1591
1592 uint32_t allocateId()
1593 {
1594 assert(allocationID <= 16777215);
1595 return allocationID++;
1596 }
1597
1598 uint32_t peekAllocationId()
1599 {
1600 return allocationID;
1601 }
1602
1603 void setAllocationId(uint32_t id)
1604 {
1605 allocationID = id;
1606 }
1607
1608 Block* create_and_insert_block() {
1609 blocks.emplace_back(blocks.size());
1610 blocks.back().fp_mode = next_fp_mode;
1611 return &blocks.back();
1612 }
1613
1614 Block* insert_block(Block&& block) {
1615 block.index = blocks.size();
1616 block.fp_mode = next_fp_mode;
1617 blocks.emplace_back(std::move(block));
1618 return &blocks.back();
1619 }
1620
1621 private:
1622 uint32_t allocationID = 1;
1623 };
1624
1625 struct TempHash {
1626 std::size_t operator()(Temp t) const {
1627 return t.id();
1628 }
1629 };
1630 using TempSet = std::unordered_set<Temp, TempHash>;
1631
1632 struct live {
1633 /* live temps out per block */
1634 std::vector<TempSet> live_out;
1635 /* register demand (sgpr/vgpr) per instruction per block */
1636 std::vector<std::vector<RegisterDemand>> register_demand;
1637 };
1638
1639 void init();
1640
1641 void init_program(Program *program, Stage stage, struct radv_shader_info *info,
1642 enum chip_class chip_class, enum radeon_family family,
1643 ac_shader_config *config);
1644
1645 void select_program(Program *program,
1646 unsigned shader_count,
1647 struct nir_shader *const *shaders,
1648 ac_shader_config* config,
1649 struct radv_shader_args *args);
1650 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1651 ac_shader_config* config,
1652 struct radv_shader_args *args);
1653
1654 void lower_wqm(Program* program, live& live_vars,
1655 const struct radv_nir_compiler_options *options);
1656 void lower_phis(Program* program);
1657 void calc_min_waves(Program* program);
1658 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1659 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1660 std::vector<uint16_t> dead_code_analysis(Program *program);
1661 void dominator_tree(Program* program);
1662 void insert_exec_mask(Program *program);
1663 void value_numbering(Program* program);
1664 void optimize(Program* program);
1665 void setup_reduce_temp(Program* program);
1666 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1667 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1668 void ssa_elimination(Program* program);
1669 void lower_to_hw_instr(Program* program);
1670 void schedule_program(Program* program, live& live_vars);
1671 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1672 void insert_wait_states(Program* program);
1673 void insert_NOPs(Program* program);
1674 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1675 void print_asm(Program *program, std::vector<uint32_t>& binary,
1676 unsigned exec_size, std::ostream& out);
1677 bool validate_ir(Program* program);
1678 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options);
1679 #ifndef NDEBUG
1680 void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
1681 #else
1682 #define perfwarn(program, cond, msg, ...) do {} while(0)
1683 #endif
1684
1685 void collect_presched_stats(Program *program);
1686 void collect_preasm_stats(Program *program);
1687 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1688
1689 void aco_print_instr(const Instruction *instr, FILE *output);
1690 void aco_print_program(const Program *program, FILE *output);
1691
1692 void _aco_perfwarn(Program *program, const char *file, unsigned line,
1693 const char *fmt, ...);
1694 void _aco_err(Program *program, const char *file, unsigned line,
1695 const char *fmt, ...);
1696
1697 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
1698 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
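/* Illustrative usage (added for this listing): passes report problems through
* these macros, which forward file/line information to Program::debug so that
* RADV can surface the message, e.g. via VK_EXT_debug_report. The condition
* below is a made-up example. */
static inline void example_report_error(Program *program, unsigned num_temps)
{
if (num_temps > 16777215)
aco_err(program, "too many temporaries: %u", num_temps);
}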
1699
1700 /* utilities for dealing with register demand */
1701 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1702 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1703 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1704
1705 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1706 uint16_t get_extra_sgprs(Program *program);
1707
1708 /* get the number of sgprs/vgprs that must be allocated to address a number of sgprs/vgprs */
1709 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1710 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1711
1712 /* return number of addressable sgprs/vgprs for max_waves */
1713 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1714 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1715
1716 typedef struct {
1717 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1718 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1719 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1720 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1721 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1722 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1723 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1724 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1725 /* sizes used for input/output modifiers and constants */
1726 const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
1727 const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
1728 } Info;
1729
1730 extern const Info instr_info;
1731
1732 }
1733
1734 #endif /* ACO_IR_H */
1735