aco: make validate() usable in tests
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
/* Bitmask of debug options enabled for the compiler (DEBUG_* values below). */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,    /* run the IR validator */
   DEBUG_VALIDATE_RA = 0x2, /* validate the result of register allocation */
   DEBUG_PERFWARN = 0x4,    /* emit performance warnings */
};
53
/**
 * Microcode encoding format of an instruction.
 *
 * The Vector ALU formats VOP1/VOP2/VOPC are single-bit values so they can be
 * combined with an encoding modifier:
 * - VOP2 | VOP3A : a VOP2 instruction in the VOP3A encoding
 * - VOP2 | DPP   : a VOP2 instruction with data parallel primitives
 * - VOP2 | SDWA  : a VOP2 instruction with sub-dword addressing
 * (likewise for VOP1 and VOPC).
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11, /* alias of VOP3 */
   VOP3B = 1 << 11, /* alias of VOP3 */
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   /* encoding modifiers, OR'ed into the VOP1/VOP2/VOPC bit */
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kinds of memory a barrier/instruction interacts
 * with; barrier_count is the number of distinct flag bits. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   barrier_count = 7, /* number of flag bits above */
};
121
/* Float rounding modes, matching the hardware MODE register encoding:
 * nearest-even, toward +inf, toward -inf, toward zero. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
128
/* Denormal handling modes (2-bit fields in the MODE register). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
135
136 struct float_mode {
137 /* matches encoding of the MODE register */
138 union {
139 struct {
140 fp_round round32:2;
141 fp_round round16_64:2;
142 unsigned denorm32:2;
143 unsigned denorm16_64:2;
144 };
145 struct {
146 uint8_t round:4;
147 uint8_t denorm:4;
148 };
149 uint8_t val = 0;
150 };
151 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
152 bool preserve_signed_zero_inf_nan32:1;
153 bool preserve_signed_zero_inf_nan16_64:1;
154 /* if false, optimizations which may remove denormal flushing can be done */
155 bool must_flush_denorms32:1;
156 bool must_flush_denorms16_64:1;
157 bool care_about_round32:1;
158 bool care_about_round16_64:1;
159
160 /* Returns true if instructions using the mode "other" can safely use the
161 * current one instead. */
162 bool canReplace(float_mode other) const noexcept {
163 return val == other.val &&
164 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
165 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
166 (must_flush_denorms32 || !other.must_flush_denorms32) &&
167 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
168 (care_about_round32 || !other.care_about_round32) &&
169 (care_about_round16_64 || !other.care_about_round16_64);
170 }
171 };
172
173 constexpr Format asVOP3(Format format) {
174 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
175 };
176
177 constexpr Format asSDWA(Format format) {
178 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
179 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
180 }
181
/* Which register file a value lives in. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
188
189 struct RegClass {
190
191 enum RC : uint8_t {
192 s1 = 1,
193 s2 = 2,
194 s3 = 3,
195 s4 = 4,
196 s6 = 6,
197 s8 = 8,
198 s16 = 16,
199 v1 = s1 | (1 << 5),
200 v2 = s2 | (1 << 5),
201 v3 = s3 | (1 << 5),
202 v4 = s4 | (1 << 5),
203 v5 = 5 | (1 << 5),
204 v6 = 6 | (1 << 5),
205 v7 = 7 | (1 << 5),
206 v8 = 8 | (1 << 5),
207 /* byte-sized register class */
208 v1b = v1 | (1 << 7),
209 v2b = v2 | (1 << 7),
210 v3b = v3 | (1 << 7),
211 v4b = v4 | (1 << 7),
212 v6b = v6 | (1 << 7),
213 v8b = v8 | (1 << 7),
214 /* these are used for WWM and spills to vgpr */
215 v1_linear = v1 | (1 << 6),
216 v2_linear = v2 | (1 << 6),
217 };
218
219 RegClass() = default;
220 constexpr RegClass(RC rc)
221 : rc(rc) {}
222 constexpr RegClass(RegType type, unsigned size)
223 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
224
225 constexpr operator RC() const { return rc; }
226 explicit operator bool() = delete;
227
228 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
229 constexpr bool is_subdword() const { return rc & (1 << 7); }
230 constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
231 //TODO: use size() less in favor of bytes()
232 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
233 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
234 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
235 constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
236
237 static constexpr RegClass get(RegType type, unsigned bytes) {
238 if (type == RegType::sgpr) {
239 return RegClass(type, DIV_ROUND_UP(bytes, 4u));
240 } else {
241 return bytes % 4u ? RegClass(type, bytes).as_subdword() :
242 RegClass(type, bytes / 4u);
243 }
244 }
245
246 private:
247 RC rc;
248 };
249
250 /* transitional helper expressions */
251 static constexpr RegClass s1{RegClass::s1};
252 static constexpr RegClass s2{RegClass::s2};
253 static constexpr RegClass s3{RegClass::s3};
254 static constexpr RegClass s4{RegClass::s4};
255 static constexpr RegClass s8{RegClass::s8};
256 static constexpr RegClass s16{RegClass::s16};
257 static constexpr RegClass v1{RegClass::v1};
258 static constexpr RegClass v2{RegClass::v2};
259 static constexpr RegClass v3{RegClass::v3};
260 static constexpr RegClass v4{RegClass::v4};
261 static constexpr RegClass v5{RegClass::v5};
262 static constexpr RegClass v6{RegClass::v6};
263 static constexpr RegClass v7{RegClass::v7};
264 static constexpr RegClass v8{RegClass::v8};
265 static constexpr RegClass v1b{RegClass::v1b};
266 static constexpr RegClass v2b{RegClass::v2b};
267 static constexpr RegClass v3b{RegClass::v3b};
268 static constexpr RegClass v4b{RegClass::v4b};
269 static constexpr RegClass v6b{RegClass::v6b};
270 static constexpr RegClass v8b{RegClass::v8b};
271
272 /**
273 * Temp Class
274 * Each temporary virtual register has a
275 * register class (i.e. size and type)
276 * and SSA id.
277 */
278 struct Temp {
279 Temp() noexcept : id_(0), reg_class(0) {}
280 constexpr Temp(uint32_t id, RegClass cls) noexcept
281 : id_(id), reg_class(uint8_t(cls)) {}
282
283 constexpr uint32_t id() const noexcept { return id_; }
284 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
285
286 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
287 constexpr unsigned size() const noexcept { return regClass().size(); }
288 constexpr RegType type() const noexcept { return regClass().type(); }
289 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
290
291 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
292 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
293 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
294
295 private:
296 uint32_t id_: 24;
297 uint32_t reg_class : 8;
298 };
299
/**
 * PhysReg
 * The physical register of an Operand or Definition, stored as a byte
 * address: reg_b = register index * 4 + byte offset (for sub-dword access).
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* return a copy advanced by the given number of bytes */
   constexpr PhysReg advance(int bytes) const { PhysReg copy = *this; copy.reg_b += bytes; return copy; }

   uint16_t reg_b = 0;
};
318
319 /* helper expressions for special registers */
320 static constexpr PhysReg m0{124};
321 static constexpr PhysReg vcc{106};
322 static constexpr PhysReg vcc_hi{107};
323 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
324 static constexpr PhysReg exec{126};
325 static constexpr PhysReg exec_lo{126};
326 static constexpr PhysReg exec_hi{127};
327 static constexpr PhysReg vccz{251};
328 static constexpr PhysReg execz{252};
329 static constexpr PhysReg scc{253};
330
331 /**
332 * Operand Class
333 * Initially, each Operand refers to either
334 * a temporary virtual register
335 * or to a constant value
336 * Temporary registers get mapped to physical register during RA
337 * Constant values are inlined into the instruction sequence.
338 */
339 class Operand final
340 {
341 public:
342 constexpr Operand()
343 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
344 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
345 isLateKill_(false) {}
346
347 explicit Operand(Temp r) noexcept
348 {
349 data_.temp = r;
350 if (r.id()) {
351 isTemp_ = true;
352 } else {
353 isUndef_ = true;
354 setFixed(PhysReg{128});
355 }
356 };
357 explicit Operand(uint8_t v) noexcept
358 {
359 /* 8-bit constants are only used for copies and copies from any 8-bit
360 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
361 * to be inline constants. */
362 data_.i = v;
363 isConstant_ = true;
364 constSize = 0;
365 setFixed(PhysReg{0u});
366 };
367 explicit Operand(uint16_t v) noexcept
368 {
369 data_.i = v;
370 isConstant_ = true;
371 constSize = 1;
372 if (v <= 64)
373 setFixed(PhysReg{128u + v});
374 else if (v >= 0xFFF0) /* [-16 .. -1] */
375 setFixed(PhysReg{192u + (0xFFFF - v)});
376 else if (v == 0x3800) /* 0.5 */
377 setFixed(PhysReg{240});
378 else if (v == 0xB800) /* -0.5 */
379 setFixed(PhysReg{241});
380 else if (v == 0x3C00) /* 1.0 */
381 setFixed(PhysReg{242});
382 else if (v == 0xBC00) /* -1.0 */
383 setFixed(PhysReg{243});
384 else if (v == 0x4000) /* 2.0 */
385 setFixed(PhysReg{244});
386 else if (v == 0xC000) /* -2.0 */
387 setFixed(PhysReg{245});
388 else if (v == 0x4400) /* 4.0 */
389 setFixed(PhysReg{246});
390 else if (v == 0xC400) /* -4.0 */
391 setFixed(PhysReg{247});
392 else if (v == 0x3118) /* 1/2 PI */
393 setFixed(PhysReg{248});
394 else /* Literal Constant */
395 setFixed(PhysReg{255});
396 };
397 explicit Operand(uint32_t v, bool is64bit = false) noexcept
398 {
399 data_.i = v;
400 isConstant_ = true;
401 constSize = is64bit ? 3 : 2;
402 if (v <= 64)
403 setFixed(PhysReg{128 + v});
404 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
405 setFixed(PhysReg{192 - v});
406 else if (v == 0x3f000000) /* 0.5 */
407 setFixed(PhysReg{240});
408 else if (v == 0xbf000000) /* -0.5 */
409 setFixed(PhysReg{241});
410 else if (v == 0x3f800000) /* 1.0 */
411 setFixed(PhysReg{242});
412 else if (v == 0xbf800000) /* -1.0 */
413 setFixed(PhysReg{243});
414 else if (v == 0x40000000) /* 2.0 */
415 setFixed(PhysReg{244});
416 else if (v == 0xc0000000) /* -2.0 */
417 setFixed(PhysReg{245});
418 else if (v == 0x40800000) /* 4.0 */
419 setFixed(PhysReg{246});
420 else if (v == 0xc0800000) /* -4.0 */
421 setFixed(PhysReg{247});
422 else { /* Literal Constant */
423 assert(!is64bit && "attempt to create a 64-bit literal constant");
424 setFixed(PhysReg{255});
425 }
426 };
427 explicit Operand(uint64_t v) noexcept
428 {
429 isConstant_ = true;
430 constSize = 3;
431 if (v <= 64) {
432 data_.i = (uint32_t) v;
433 setFixed(PhysReg{128 + (uint32_t) v});
434 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
435 data_.i = (uint32_t) v;
436 setFixed(PhysReg{192 - (uint32_t) v});
437 } else if (v == 0x3FE0000000000000) { /* 0.5 */
438 data_.i = 0x3f000000;
439 setFixed(PhysReg{240});
440 } else if (v == 0xBFE0000000000000) { /* -0.5 */
441 data_.i = 0xbf000000;
442 setFixed(PhysReg{241});
443 } else if (v == 0x3FF0000000000000) { /* 1.0 */
444 data_.i = 0x3f800000;
445 setFixed(PhysReg{242});
446 } else if (v == 0xBFF0000000000000) { /* -1.0 */
447 data_.i = 0xbf800000;
448 setFixed(PhysReg{243});
449 } else if (v == 0x4000000000000000) { /* 2.0 */
450 data_.i = 0x40000000;
451 setFixed(PhysReg{244});
452 } else if (v == 0xC000000000000000) { /* -2.0 */
453 data_.i = 0xc0000000;
454 setFixed(PhysReg{245});
455 } else if (v == 0x4010000000000000) { /* 4.0 */
456 data_.i = 0x40800000;
457 setFixed(PhysReg{246});
458 } else if (v == 0xC010000000000000) { /* -4.0 */
459 data_.i = 0xc0800000;
460 setFixed(PhysReg{247});
461 } else { /* Literal Constant: we don't know if it is a long or double.*/
462 isConstant_ = 0;
463 assert(false && "attempt to create a 64-bit literal constant");
464 }
465 };
466 explicit Operand(RegClass type) noexcept
467 {
468 isUndef_ = true;
469 data_.temp = Temp(0, type);
470 setFixed(PhysReg{128});
471 };
472 explicit Operand(PhysReg reg, RegClass type) noexcept
473 {
474 data_.temp = Temp(0, type);
475 setFixed(reg);
476 }
477
478 constexpr bool isTemp() const noexcept
479 {
480 return isTemp_;
481 }
482
483 constexpr void setTemp(Temp t) noexcept {
484 assert(!isConstant_);
485 isTemp_ = true;
486 data_.temp = t;
487 }
488
489 constexpr Temp getTemp() const noexcept
490 {
491 return data_.temp;
492 }
493
494 constexpr uint32_t tempId() const noexcept
495 {
496 return data_.temp.id();
497 }
498
499 constexpr bool hasRegClass() const noexcept
500 {
501 return isTemp() || isUndefined();
502 }
503
504 constexpr RegClass regClass() const noexcept
505 {
506 return data_.temp.regClass();
507 }
508
509 constexpr unsigned bytes() const noexcept
510 {
511 if (isConstant())
512 return 1 << constSize;
513 else
514 return data_.temp.bytes();
515 }
516
517 constexpr unsigned size() const noexcept
518 {
519 if (isConstant())
520 return constSize > 2 ? 2 : 1;
521 else
522 return data_.temp.size();
523 }
524
525 constexpr bool isFixed() const noexcept
526 {
527 return isFixed_;
528 }
529
530 constexpr PhysReg physReg() const noexcept
531 {
532 return reg_;
533 }
534
535 constexpr void setFixed(PhysReg reg) noexcept
536 {
537 isFixed_ = reg != unsigned(-1);
538 reg_ = reg;
539 }
540
541 constexpr bool isConstant() const noexcept
542 {
543 return isConstant_;
544 }
545
546 constexpr bool isLiteral() const noexcept
547 {
548 return isConstant() && reg_ == 255;
549 }
550
551 constexpr bool isUndefined() const noexcept
552 {
553 return isUndef_;
554 }
555
556 constexpr uint32_t constantValue() const noexcept
557 {
558 return data_.i;
559 }
560
561 constexpr bool constantEquals(uint32_t cmp) const noexcept
562 {
563 return isConstant() && constantValue() == cmp;
564 }
565
566 constexpr uint64_t constantValue64(bool signext=false) const noexcept
567 {
568 if (constSize == 3) {
569 if (reg_ <= 192)
570 return reg_ - 128;
571 else if (reg_ <= 208)
572 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
573
574 switch (reg_) {
575 case 240:
576 return 0x3FE0000000000000;
577 case 241:
578 return 0xBFE0000000000000;
579 case 242:
580 return 0x3FF0000000000000;
581 case 243:
582 return 0xBFF0000000000000;
583 case 244:
584 return 0x4000000000000000;
585 case 245:
586 return 0xC000000000000000;
587 case 246:
588 return 0x4010000000000000;
589 case 247:
590 return 0xC010000000000000;
591 }
592 } else if (constSize == 1) {
593 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
594 } else if (constSize == 0) {
595 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
596 }
597 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
598 }
599
600 constexpr bool isOfType(RegType type) const noexcept
601 {
602 return hasRegClass() && regClass().type() == type;
603 }
604
605 /* Indicates that the killed operand's live range intersects with the
606 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
607 * not set by liveness analysis. */
608 constexpr void setLateKill(bool flag) noexcept
609 {
610 isLateKill_ = flag;
611 }
612
613 constexpr bool isLateKill() const noexcept
614 {
615 return isLateKill_;
616 }
617
618 constexpr void setKill(bool flag) noexcept
619 {
620 isKill_ = flag;
621 if (!flag)
622 setFirstKill(false);
623 }
624
625 constexpr bool isKill() const noexcept
626 {
627 return isKill_ || isFirstKill();
628 }
629
630 constexpr void setFirstKill(bool flag) noexcept
631 {
632 isFirstKill_ = flag;
633 if (flag)
634 setKill(flag);
635 }
636
637 /* When there are multiple operands killing the same temporary,
638 * isFirstKill() is only returns true for the first one. */
639 constexpr bool isFirstKill() const noexcept
640 {
641 return isFirstKill_;
642 }
643
644 constexpr bool isKillBeforeDef() const noexcept
645 {
646 return isKill() && !isLateKill();
647 }
648
649 constexpr bool isFirstKillBeforeDef() const noexcept
650 {
651 return isFirstKill() && !isLateKill();
652 }
653
654 constexpr bool operator == (Operand other) const noexcept
655 {
656 if (other.size() != size())
657 return false;
658 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
659 return false;
660 if (isFixed() && other.isFixed() && physReg() != other.physReg())
661 return false;
662 if (isLiteral())
663 return other.isLiteral() && other.constantValue() == constantValue();
664 else if (isConstant())
665 return other.isConstant() && other.physReg() == physReg();
666 else if (isUndefined())
667 return other.isUndefined() && other.regClass() == regClass();
668 else
669 return other.isTemp() && other.getTemp() == getTemp();
670 }
671 private:
672 union {
673 uint32_t i;
674 float f;
675 Temp temp = Temp(0, s1);
676 } data_;
677 PhysReg reg_;
678 union {
679 struct {
680 uint8_t isTemp_:1;
681 uint8_t isFixed_:1;
682 uint8_t isConstant_:1;
683 uint8_t isKill_:1;
684 uint8_t isUndef_:1;
685 uint8_t isFirstKill_:1;
686 uint8_t constSize:2;
687 uint8_t isLateKill_:1;
688 };
689 /* can't initialize bit-fields in c++11, so work around using a union */
690 uint16_t control_ = 0;
691 };
692 };
693
694 /**
695 * Definition Class
696 * Definitions are the results of Instructions
697 * and refer to temporary virtual registers
698 * which are later mapped to physical registers
699 */
700 class Definition final
701 {
702 public:
703 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
704 isKill_(0), isPrecise_(0), isNUW_(0) {}
705 Definition(uint32_t index, RegClass type) noexcept
706 : temp(index, type) {}
707 explicit Definition(Temp tmp) noexcept
708 : temp(tmp) {}
709 Definition(PhysReg reg, RegClass type) noexcept
710 : temp(Temp(0, type))
711 {
712 setFixed(reg);
713 }
714 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
715 : temp(Temp(tmpId, type))
716 {
717 setFixed(reg);
718 }
719
720 constexpr bool isTemp() const noexcept
721 {
722 return tempId() > 0;
723 }
724
725 constexpr Temp getTemp() const noexcept
726 {
727 return temp;
728 }
729
730 constexpr uint32_t tempId() const noexcept
731 {
732 return temp.id();
733 }
734
735 constexpr void setTemp(Temp t) noexcept {
736 temp = t;
737 }
738
739 constexpr RegClass regClass() const noexcept
740 {
741 return temp.regClass();
742 }
743
744 constexpr unsigned bytes() const noexcept
745 {
746 return temp.bytes();
747 }
748
749 constexpr unsigned size() const noexcept
750 {
751 return temp.size();
752 }
753
754 constexpr bool isFixed() const noexcept
755 {
756 return isFixed_;
757 }
758
759 constexpr PhysReg physReg() const noexcept
760 {
761 return reg_;
762 }
763
764 constexpr void setFixed(PhysReg reg) noexcept
765 {
766 isFixed_ = 1;
767 reg_ = reg;
768 }
769
770 constexpr void setHint(PhysReg reg) noexcept
771 {
772 hasHint_ = 1;
773 reg_ = reg;
774 }
775
776 constexpr bool hasHint() const noexcept
777 {
778 return hasHint_;
779 }
780
781 constexpr void setKill(bool flag) noexcept
782 {
783 isKill_ = flag;
784 }
785
786 constexpr bool isKill() const noexcept
787 {
788 return isKill_;
789 }
790
791 constexpr void setPrecise(bool precise) noexcept
792 {
793 isPrecise_ = precise;
794 }
795
796 constexpr bool isPrecise() const noexcept
797 {
798 return isPrecise_;
799 }
800
801 /* No Unsigned Wrap */
802 constexpr void setNUW(bool nuw) noexcept
803 {
804 isNUW_ = nuw;
805 }
806
807 constexpr bool isNUW() const noexcept
808 {
809 return isNUW_;
810 }
811
812 private:
813 Temp temp = Temp(0, s1);
814 PhysReg reg_;
815 union {
816 struct {
817 uint8_t isFixed_:1;
818 uint8_t hasHint_:1;
819 uint8_t isKill_:1;
820 uint8_t isPrecise_:1;
821 uint8_t isNUW_:1;
822 };
823 /* can't initialize bit-fields in c++11, so work around using a union */
824 uint8_t control_ = 0;
825 };
826 };
827
828 struct Block;
829
830 struct Instruction {
831 aco_opcode opcode;
832 Format format;
833 uint32_t pass_flags;
834
835 aco::span<Operand> operands;
836 aco::span<Definition> definitions;
837
838 constexpr bool isVALU() const noexcept
839 {
840 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
841 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
842 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
843 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
844 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
845 || format == Format::VOP3P;
846 }
847
848 constexpr bool isSALU() const noexcept
849 {
850 return format == Format::SOP1 ||
851 format == Format::SOP2 ||
852 format == Format::SOPC ||
853 format == Format::SOPK ||
854 format == Format::SOPP;
855 }
856
857 constexpr bool isVMEM() const noexcept
858 {
859 return format == Format::MTBUF ||
860 format == Format::MUBUF ||
861 format == Format::MIMG;
862 }
863
864 constexpr bool isDPP() const noexcept
865 {
866 return (uint16_t) format & (uint16_t) Format::DPP;
867 }
868
869 constexpr bool isVOP3() const noexcept
870 {
871 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
872 ((uint16_t) format & (uint16_t) Format::VOP3B);
873 }
874
875 constexpr bool isSDWA() const noexcept
876 {
877 return (uint16_t) format & (uint16_t) Format::SDWA;
878 }
879
880 constexpr bool isFlatOrGlobal() const noexcept
881 {
882 return format == Format::FLAT || format == Format::GLOBAL;
883 }
884
885 constexpr bool usesModifiers() const noexcept;
886
887 constexpr bool reads_exec() const noexcept
888 {
889 for (const Operand& op : operands) {
890 if (op.isFixed() && op.physReg() == exec)
891 return true;
892 }
893 return false;
894 }
895 };
896 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
897
898 struct SOPK_instruction : public Instruction {
899 uint16_t imm;
900 uint16_t padding;
901 };
902 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
903
904 struct SOPP_instruction : public Instruction {
905 uint32_t imm;
906 int block;
907 };
908 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
909
910 struct SOPC_instruction : public Instruction {
911 };
912 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
913
914 struct SOP1_instruction : public Instruction {
915 };
916 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
917
918 struct SOP2_instruction : public Instruction {
919 };
920 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
921
922 /**
923 * Scalar Memory Format:
924 * For s_(buffer_)load_dword*:
925 * Operand(0): SBASE - SGPR-pair which provides base address
926 * Operand(1): Offset - immediate (un)signed offset or SGPR
927 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
928 * Operand(n-1): SOffset - SGPR offset (Vega only)
929 *
930 * Having no operands is also valid for instructions such as s_dcache_inv.
931 *
932 */
933 struct SMEM_instruction : public Instruction {
934 barrier_interaction barrier;
935 bool glc : 1; /* VI+: globally coherent */
936 bool dlc : 1; /* NAVI: device level coherent */
937 bool nv : 1; /* VEGA only: Non-volatile */
938 bool can_reorder : 1;
939 bool disable_wqm : 1;
940 bool prevent_overflow : 1; /* avoid overflow when combining additions */
941 uint32_t padding: 18;
942 };
943 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
944
945 struct VOP1_instruction : public Instruction {
946 };
947 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
948
949 struct VOP2_instruction : public Instruction {
950 };
951 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
952
953 struct VOPC_instruction : public Instruction {
954 };
955 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
956
957 struct VOP3A_instruction : public Instruction {
958 bool abs[3];
959 bool neg[3];
960 uint8_t opsel : 4;
961 uint8_t omod : 2;
962 bool clamp : 1;
963 uint32_t padding : 9;
964 };
965 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
966
967 struct VOP3P_instruction : public Instruction {
968 bool neg_lo[3];
969 bool neg_hi[3];
970 uint8_t opsel_lo : 3;
971 uint8_t opsel_hi : 3;
972 bool clamp : 1;
973 uint32_t padding : 9;
974 };
975 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
976
977 /**
978 * Data Parallel Primitives Format:
979 * This format can be used for VOP1, VOP2 or VOPC instructions.
980 * The swizzle applies to the src0 operand.
981 *
982 */
983 struct DPP_instruction : public Instruction {
984 bool abs[2];
985 bool neg[2];
986 uint16_t dpp_ctrl;
987 uint8_t row_mask : 4;
988 uint8_t bank_mask : 4;
989 bool bound_ctrl : 1;
990 uint32_t padding : 7;
991 };
992 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
993
/* SDWA sub-dword selection: which byte/word of a dword-sized source or
 * destination an instruction reads/writes, plus sign-extension. The value
 * packs a position (masked by sdwa_wordnum/sdwa_bytenum), size flags
 * (sdwa_isword/sdwa_isra) and the sign-extension flag (sdwa_sext). */
enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
1029
1030 /**
1031 * Sub-Dword Addressing Format:
1032 * This format can be used for VOP1, VOP2 or VOPC instructions.
1033 *
1034 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1035 * the definition doesn't have to be VCC on GFX9+.
1036 *
1037 */
1038 struct SDWA_instruction : public Instruction {
1039 /* these destination modifiers aren't available with VOPC except for
1040 * clamp on GFX8 */
1041 uint8_t sel[2];
1042 uint8_t dst_sel;
1043 bool neg[2];
1044 bool abs[2];
1045 bool dst_preserve : 1;
1046 bool clamp : 1;
1047 uint8_t omod : 2; /* GFX9+ */
1048 uint32_t padding : 4;
1049 };
1050 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1051
1052 struct Interp_instruction : public Instruction {
1053 uint8_t attribute;
1054 uint8_t component;
1055 uint16_t padding;
1056 };
1057 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1058
1059 /**
1060 * Local and Global Data Sharing instructions
1061 * Operand(0): ADDR - VGPR which supplies the address.
1062 * Operand(1): DATA0 - First data VGPR.
1063 * Operand(2): DATA1 - Second data VGPR.
1064 * Operand(n-1): M0 - LDS size.
1065 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1066 *
1067 */
1068 struct DS_instruction : public Instruction {
1069 int16_t offset0;
1070 int8_t offset1;
1071 bool gds;
1072 };
1073 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1074
1075 /**
1076 * Vector Memory Untyped-buffer Instructions
1077 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1078 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1079 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1080 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1081 *
1082 */
1083 struct MUBUF_instruction : public Instruction {
1084 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1085 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1086 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1087 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1088 bool glc : 1; /* globally coherent */
1089 bool dlc : 1; /* NAVI: device level coherent */
1090 bool slc : 1; /* system level coherent */
1091 bool tfe : 1; /* texture fail enable */
1092 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1093 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1094 bool can_reorder : 1;
1095 bool swizzled:1;
1096 uint8_t padding : 1;
1097 barrier_interaction barrier;
1098 };
1099 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1100
1101 /**
1102 * Vector Memory Typed-buffer Instructions
1103 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1104 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1105 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1106 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1107 *
1108 */
1109 struct MTBUF_instruction : public Instruction {
1110 uint16_t offset; /* Unsigned byte offset - 12 bit */
1111 barrier_interaction barrier;
1112 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1113 uint8_t nfmt : 3; /* Numeric format of data in memory */
1114 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1115 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1116 bool glc : 1; /* globally coherent */
1117 bool dlc : 1; /* NAVI: device level coherent */
1118 bool slc : 1; /* system level coherent */
1119 bool tfe : 1; /* texture fail enable */
1120 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1121 bool can_reorder : 1;
1122 uint32_t padding : 25;
1123 };
1124 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1125
1126 /**
1127 * Vector Memory Image Instructions
1128 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1129 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1130 * or VDATA - Vector GPR for write data.
1131 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1132 * Definition(0): VDATA - Vector GPR for read result.
1133 *
1134 */
1135 struct MIMG_instruction : public Instruction {
1136 uint8_t dmask; /* Data VGPR enable mask */
1137 uint8_t dim : 3; /* NAVI: dimensionality */
1138 bool unrm : 1; /* Force address to be un-normalized */
1139 bool dlc : 1; /* NAVI: device level coherent */
1140 bool glc : 1; /* globally coherent */
1141 bool slc : 1; /* system level coherent */
1142 bool tfe : 1; /* texture fail enable */
1143 bool da : 1; /* declare an array */
1144 bool lwe : 1; /* Force data to be un-normalized */
1145 bool r128 : 1; /* NAVI: Texture resource size */
1146 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1147 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1148 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1149 bool can_reorder : 1;
1150 uint8_t padding : 1;
1151 barrier_interaction barrier;
1152 };
1153 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1154
1155 /**
1156 * Flat/Scratch/Global Instructions
1157 * Operand(0): ADDR
1158 * Operand(1): SADDR
1159 * Operand(2) / Definition(0): DATA/VDST
1160 *
1161 */
1162 struct FLAT_instruction : public Instruction {
1163 uint16_t offset; /* Vega/Navi only */
1164 bool slc : 1; /* system level coherent */
1165 bool glc : 1; /* globally coherent */
1166 bool dlc : 1; /* NAVI: device level coherent */
1167 bool lds : 1;
1168 bool nv : 1;
1169 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1170 bool can_reorder : 1;
1171 uint8_t padding : 1;
1172 barrier_interaction barrier;
1173 };
1174 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1175
/* EXP: exports vertex position/parameter or fragment color data. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which components of the operands are written */
   uint8_t dest;         /* export target (MRT / Z / position / parameter) */
   bool compressed : 1;  /* data is 16-bit compressed (two components per VGPR) */
   bool done : 1;        /* this is the final export of the shader */
   bool valid_mask : 1;
   uint32_t padding : 13; /* explicit padding to keep the size assertion below */
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1185
/* Generic pseudo instruction, lowered to hardware instructions late. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* whether scc holds a live value the lowered sequence must preserve — see lower_to_hw_instr; TODO confirm */
   uint8_t padding; /* explicit padding to keep the size assertion below */
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1192
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1201
/* Barrier pseudo instruction; carries no fields beyond the base Instruction. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1205
/* Operations for Pseudo_reduction_instruction; the numeric suffix is the
 * bit width the reduction operates on. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1221
1222 /**
1223 * Subgroup Reduction Instructions, everything except for the data to be
1224 * reduced and the result as inserted by setup_reduce_temp().
1225 * Operand(0): data to be reduced
1226 * Operand(1): reduce temporary
1227 * Operand(2): vector temporary
1228 * Definition(0): result
1229 * Definition(1): scalar temporary
1230 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1231 * Definition(3): scc clobber
1232 * Definition(4): vcc clobber
1233 *
1234 */
1235 struct Pseudo_reduction_instruction : public Instruction {
1236 ReduceOp reduce_op;
1237 uint16_t cluster_size; // must be 0 for scans
1238 };
1239 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1240
/* Deleter for instructions: they are allocated with calloc() in
 * create_instruction(), so they must be released with free() rather
 * than operator delete. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};

/* Owning pointer to an instruction, using the calloc/free deleter above. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1249
/* Allocates a zero-initialized instruction of type T with trailing storage
 * laid out as:  [ T | num_operands * Operand | num_definitions * Definition ]
 * The operand/definition spans store 16-bit offsets relative to the span
 * members themselves, pointing into that trailing storage.
 * The result must be freed with free() (see instr_deleter_functor/aco_ptr). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* calloc zero-initializes, so all flag/padding fields start in a known state */
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* offsets are measured from the address of each span member itself */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1267
1268 constexpr bool Instruction::usesModifiers() const noexcept
1269 {
1270 if (isDPP() || isSDWA())
1271 return true;
1272
1273 if (format == Format::VOP3P) {
1274 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1275 for (unsigned i = 0; i < operands.size(); i++) {
1276 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1277 return true;
1278 }
1279 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1280 } else if (isVOP3()) {
1281 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1282 for (unsigned i = 0; i < operands.size(); i++) {
1283 if (vop3->abs[i] || vop3->neg[i])
1284 return true;
1285 }
1286 return vop3->opsel || vop3->clamp || vop3->omod;
1287 }
1288 return false;
1289 }
1290
1291 constexpr bool is_phi(Instruction* instr)
1292 {
1293 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1294 }
1295
1296 static inline bool is_phi(aco_ptr<Instruction>& instr)
1297 {
1298 return is_phi(instr.get());
1299 }
1300
/* Returns the barrier classes this instruction interacts with. */
barrier_interaction get_barrier_interaction(const Instruction* instr);
/* Whether the instruction can be removed given per-temporary use counts. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);

/* Whether opsel can select the high half of operand/definition "idx" on this chip. */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1308
/* Flags describing a basic block's role in the CFG; combined as a
 * bitmask in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1329
1330
1331 struct RegisterDemand {
1332 constexpr RegisterDemand() = default;
1333 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1334 : vgpr{v}, sgpr{s} {}
1335 int16_t vgpr = 0;
1336 int16_t sgpr = 0;
1337
1338 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1339 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1340 }
1341
1342 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1343 return vgpr > other.vgpr || sgpr > other.sgpr;
1344 }
1345
1346 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1347 if (t.type() == RegType::sgpr)
1348 return RegisterDemand( vgpr, sgpr + t.size() );
1349 else
1350 return RegisterDemand( vgpr + t.size(), sgpr );
1351 }
1352
1353 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1354 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1355 }
1356
1357 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1358 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1359 }
1360
1361 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1362 vgpr += other.vgpr;
1363 sgpr += other.sgpr;
1364 return *this;
1365 }
1366
1367 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1368 vgpr -= other.vgpr;
1369 sgpr -= other.sgpr;
1370 return *this;
1371 }
1372
1373 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1374 if (t.type() == RegType::sgpr)
1375 sgpr += t.size();
1376 else
1377 vgpr += t.size();
1378 return *this;
1379 }
1380
1381 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1382 if (t.type() == RegType::sgpr)
1383 sgpr -= t.size();
1384 else
1385 vgpr -= t.size();
1386 return *this;
1387 }
1388
1389 constexpr void update(const RegisterDemand other) noexcept {
1390 vgpr = std::max(vgpr, other.vgpr);
1391 sgpr = std::max(sgpr, other.sgpr);
1392 }
1393
1394 };
1395
1396 /* CFG */
1397 struct Block {
1398 float_mode fp_mode;
1399 unsigned index;
1400 unsigned offset = 0;
1401 std::vector<aco_ptr<Instruction>> instructions;
1402 std::vector<unsigned> logical_preds;
1403 std::vector<unsigned> linear_preds;
1404 std::vector<unsigned> logical_succs;
1405 std::vector<unsigned> linear_succs;
1406 RegisterDemand register_demand = RegisterDemand();
1407 uint16_t loop_nest_depth = 0;
1408 uint16_t kind = 0;
1409 int logical_idom = -1;
1410 int linear_idom = -1;
1411 Temp live_out_exec = Temp();
1412
1413 /* this information is needed for predecessors to blocks with phis when
1414 * moving out of ssa */
1415 bool scc_live_out = false;
1416 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1417
1418 Block(unsigned idx) : index(idx) {}
1419 Block() : index(0) {}
1420 };
1421
/* Bitmask combining one or more software stages with one hardware stage. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f; /* all software-stage bits */

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7; /* all hardware-stage bits */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1466
/* Indices into Program::statistics, filled by collect_*_stats(). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1481
/* A complete shader program: the CFG plus all target/limit information
 * needed by the compilation passes. */
class Program final {
public:
   float_mode next_fp_mode; /* fp mode assigned to blocks created from now on */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand(); /* maximum demand over all blocks */
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;  /* 32 or 64 lanes per wave */
   RegClass lane_mask;  /* register class of an exec-sized mask (s1 or s2) */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   /* hardware limits and allocation granularities for this target */
   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false; /* whether collect_*_stats() should fill statistics[] */
   uint32_t statistics[num_statistics];

   /* Returns a fresh SSA id. Ids are limited to 24 bits (16777215 = 2^24 - 1). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would produce, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new block, inheriting next_fp_mode. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Appends an existing block, re-indexing it and inheriting next_fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* 0 is reserved (ids start at 1) */
};
1557
1558 struct TempHash {
1559 std::size_t operator()(Temp t) const {
1560 return t.id();
1561 }
1562 };
1563 using TempSet = std::unordered_set<Temp, TempHash>;
1564
/* Result of live_var_analysis(). */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1571
/* One-time global initialization of the compiler. */
void init();

void init_program(Program *program, Stage stage, struct radv_shader_info *info,
                  enum chip_class chip_class, enum radeon_family family,
                  ac_shader_config *config);

/* Instruction selection: NIR -> ACO IR. */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* Compilation passes, in roughly pipeline order. */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
/* IR validation; diagnostics go to "output", returns false on failure. */
bool validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
/* NOTE(review): the release-build macro below takes a leading "program"
 * argument that this debug function does not — a call site can only match
 * one of the two signatures; confirm which is intended. */
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

/* Fill Program::statistics at the corresponding pipeline points. */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs that need to be allocated to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1640
/* Static per-opcode information tables, indexed by aco_opcode. */
typedef struct {
   /* hardware encodings per generation; presumably a negative value marks
    * "not available on this generation" — TODO confirm against the generator */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* The single instance of the tables above (generated code). */
extern const Info instr_info;
1656
1657 }
1658
1659 #endif /* ACO_IR_H */
1660