amd: fix empty-body issues
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
/* Debug option bits; each enumerator is a distinct power of two
 * (presumably tested against debug_flags -- confirm at the use sites). */
enum {
   DEBUG_VALIDATE = 1 << 0,
   DEBUG_VALIDATE_RA = 1 << 1,
   DEBUG_PERFWARN = 1 << 2,
};
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,

   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,

   /* Scalar Memory Format */
   SMEM = 6,

   /* LDS/GDS Format (the value 7 is not assigned) */
   DS = 8,

   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,

   /* Vector Memory Image Format */
   MIMG = 11,

   /* Export Format */
   EXP = 12,

   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   /* Remaining pseudo formats */
   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats: each occupies its own bit so that they can be
    * OR'ed together. VOP3, VOP3A and VOP3B share the same bit. */
   VOP1 = 0x0100,
   VOP2 = 0x0200,
   VOPC = 0x0400,
   VOP3 = 0x0800,
   VOP3A = VOP3,
   VOP3B = VOP3,
   VOP3P = 0x1000,

   /* Vector Parameter Interpolation Format */
   VINTRP = 0x2000,

   /* Encoding modifiers that are combined with a VOP1/VOP2/VOPC format */
   DPP = 0x4000,
   SDWA = 0x8000,
};
104
/* Bit mask of the kinds of memory an instruction interacts with. */
enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 1 << 0,
   barrier_image = 1 << 1,
   barrier_atomic = 1 << 2,
   barrier_shared = 1 << 3,
   /* number of distinct barrier bits defined above (not itself a bit) */
   barrier_count = 4,
};
113
/* Floating-point rounding mode selector; values run from 0 to 3. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi,
   fp_round_ni,
   fp_round_tz,
};
120
/* Denormal handling selector. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0,
   fp_denorm_keep = 3,
};
127
128 struct float_mode {
129 /* matches encoding of the MODE register */
130 union {
131 struct {
132 fp_round round32:2;
133 fp_round round16_64:2;
134 unsigned denorm32:2;
135 unsigned denorm16_64:2;
136 };
137 uint8_t val = 0;
138 };
139 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
140 bool preserve_signed_zero_inf_nan32:1;
141 bool preserve_signed_zero_inf_nan16_64:1;
142 /* if false, optimizations which may remove denormal flushing can be done */
143 bool must_flush_denorms32:1;
144 bool must_flush_denorms16_64:1;
145 bool care_about_round32:1;
146 bool care_about_round16_64:1;
147
148 /* Returns true if instructions using the mode "other" can safely use the
149 * current one instead. */
150 bool canReplace(float_mode other) const noexcept {
151 return val == other.val &&
152 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
153 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
154 (must_flush_denorms32 || !other.must_flush_denorms32) &&
155 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
156 (care_about_round32 || !other.care_about_round32) &&
157 (care_about_round16_64 || !other.care_about_round16_64);
158 }
159 };
160
161 constexpr Format asVOP3(Format format) {
162 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
163 };
164
/* Kind of register file a register class belongs to. */
enum class RegType {
   none = 0,
   sgpr = 1,
   vgpr = 2,
   linear_vgpr = 3,
};
171
172 struct RegClass {
173
174 enum RC : uint8_t {
175 s1 = 1,
176 s2 = 2,
177 s3 = 3,
178 s4 = 4,
179 s6 = 6,
180 s8 = 8,
181 s16 = 16,
182 v1 = s1 | (1 << 5),
183 v2 = s2 | (1 << 5),
184 v3 = s3 | (1 << 5),
185 v4 = s4 | (1 << 5),
186 v5 = 5 | (1 << 5),
187 v6 = 6 | (1 << 5),
188 v7 = 7 | (1 << 5),
189 v8 = 8 | (1 << 5),
190 /* these are used for WWM and spills to vgpr */
191 v1_linear = v1 | (1 << 6),
192 v2_linear = v2 | (1 << 6),
193 };
194
195 RegClass() = default;
196 constexpr RegClass(RC rc)
197 : rc(rc) {}
198 constexpr RegClass(RegType type, unsigned size)
199 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
200
201 constexpr operator RC() const { return rc; }
202 explicit operator bool() = delete;
203
204 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
205 constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
206 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
207 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
208
209 private:
210 RC rc;
211 };
212
213 /* transitional helper expressions */
214 static constexpr RegClass s1{RegClass::s1};
215 static constexpr RegClass s2{RegClass::s2};
216 static constexpr RegClass s3{RegClass::s3};
217 static constexpr RegClass s4{RegClass::s4};
218 static constexpr RegClass s8{RegClass::s8};
219 static constexpr RegClass s16{RegClass::s16};
220 static constexpr RegClass v1{RegClass::v1};
221 static constexpr RegClass v2{RegClass::v2};
222 static constexpr RegClass v3{RegClass::v3};
223 static constexpr RegClass v4{RegClass::v4};
224 static constexpr RegClass v5{RegClass::v5};
225 static constexpr RegClass v6{RegClass::v6};
226 static constexpr RegClass v7{RegClass::v7};
227 static constexpr RegClass v8{RegClass::v8};
228
229 /**
230 * Temp Class
231 * Each temporary virtual register has a
232 * register class (i.e. size and type)
233 * and SSA id.
234 */
235 struct Temp {
236 Temp() = default;
237 constexpr Temp(uint32_t id, RegClass cls) noexcept
238 : id_(id), reg_class(cls) {}
239
240 constexpr uint32_t id() const noexcept { return id_; }
241 constexpr RegClass regClass() const noexcept { return reg_class; }
242
243 constexpr unsigned size() const noexcept { return reg_class.size(); }
244 constexpr RegType type() const noexcept { return reg_class.type(); }
245 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
246
247 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
248 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
249 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
250
251 private:
252 uint32_t id_:24;
253 RegClass reg_class;
254 };
255
256 /**
257 * PhysReg
258 * Represents the physical register for each
259 * Operand and Definition.
260 */
struct PhysReg {
   /* Register index, kept in 16 bits; defaults to 0. */
   uint16_t reg = 0;

   constexpr PhysReg() = default;

   /* Explicit so that plain integers do not silently become registers. */
   explicit constexpr PhysReg(unsigned r) : reg(static_cast<uint16_t>(r)) {}

   /* Implicit read-back of the register index. */
   constexpr operator unsigned() const { return reg; }
};
268
269 /* helper expressions for special registers */
270 static constexpr PhysReg m0{124};
271 static constexpr PhysReg vcc{106};
272 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
273 static constexpr PhysReg exec{126};
274 static constexpr PhysReg exec_lo{126};
275 static constexpr PhysReg exec_hi{127};
276 static constexpr PhysReg scc{253};
277
278 /**
279 * Operand Class
280 * Initially, each Operand refers to either
281 * a temporary virtual register
282 * or to a constant value
283 * Temporary registers get mapped to physical register during RA
284 * Constant values are inlined into the instruction sequence.
285 */
286 class Operand final
287 {
288 public:
289 constexpr Operand()
290 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
291 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
292
293 explicit Operand(Temp r) noexcept
294 {
295 data_.temp = r;
296 if (r.id()) {
297 isTemp_ = true;
298 } else {
299 isUndef_ = true;
300 setFixed(PhysReg{128});
301 }
302 };
303 explicit Operand(uint32_t v) noexcept
304 {
305 data_.i = v;
306 isConstant_ = true;
307 if (v <= 64)
308 setFixed(PhysReg{128 + v});
309 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
310 setFixed(PhysReg{192 - v});
311 else if (v == 0x3f000000) /* 0.5 */
312 setFixed(PhysReg{240});
313 else if (v == 0xbf000000) /* -0.5 */
314 setFixed(PhysReg{241});
315 else if (v == 0x3f800000) /* 1.0 */
316 setFixed(PhysReg{242});
317 else if (v == 0xbf800000) /* -1.0 */
318 setFixed(PhysReg{243});
319 else if (v == 0x40000000) /* 2.0 */
320 setFixed(PhysReg{244});
321 else if (v == 0xc0000000) /* -2.0 */
322 setFixed(PhysReg{245});
323 else if (v == 0x40800000) /* 4.0 */
324 setFixed(PhysReg{246});
325 else if (v == 0xc0800000) /* -4.0 */
326 setFixed(PhysReg{247});
327 else /* Literal Constant */
328 setFixed(PhysReg{255});
329 };
330 explicit Operand(uint64_t v) noexcept
331 {
332 isConstant_ = true;
333 is64BitConst_ = true;
334 if (v <= 64)
335 setFixed(PhysReg{128 + (uint32_t) v});
336 else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
337 setFixed(PhysReg{192 - (uint32_t) v});
338 else if (v == 0x3FE0000000000000) /* 0.5 */
339 setFixed(PhysReg{240});
340 else if (v == 0xBFE0000000000000) /* -0.5 */
341 setFixed(PhysReg{241});
342 else if (v == 0x3FF0000000000000) /* 1.0 */
343 setFixed(PhysReg{242});
344 else if (v == 0xBFF0000000000000) /* -1.0 */
345 setFixed(PhysReg{243});
346 else if (v == 0x4000000000000000) /* 2.0 */
347 setFixed(PhysReg{244});
348 else if (v == 0xC000000000000000) /* -2.0 */
349 setFixed(PhysReg{245});
350 else if (v == 0x4010000000000000) /* 4.0 */
351 setFixed(PhysReg{246});
352 else if (v == 0xC010000000000000) /* -4.0 */
353 setFixed(PhysReg{247});
354 else { /* Literal Constant: we don't know if it is a long or double.*/
355 isConstant_ = 0;
356 assert(false && "attempt to create a 64-bit literal constant");
357 }
358 };
359 explicit Operand(RegClass type) noexcept
360 {
361 isUndef_ = true;
362 data_.temp = Temp(0, type);
363 setFixed(PhysReg{128});
364 };
365 explicit Operand(PhysReg reg, RegClass type) noexcept
366 {
367 data_.temp = Temp(0, type);
368 setFixed(reg);
369 }
370
371 constexpr bool isTemp() const noexcept
372 {
373 return isTemp_;
374 }
375
376 constexpr void setTemp(Temp t) noexcept {
377 assert(!isConstant_);
378 isTemp_ = true;
379 data_.temp = t;
380 }
381
382 constexpr Temp getTemp() const noexcept
383 {
384 return data_.temp;
385 }
386
387 constexpr uint32_t tempId() const noexcept
388 {
389 return data_.temp.id();
390 }
391
392 constexpr bool hasRegClass() const noexcept
393 {
394 return isTemp() || isUndefined();
395 }
396
397 constexpr RegClass regClass() const noexcept
398 {
399 return data_.temp.regClass();
400 }
401
402 constexpr unsigned size() const noexcept
403 {
404 if (isConstant())
405 return is64BitConst_ ? 2 : 1;
406 else
407 return data_.temp.size();
408 }
409
410 constexpr bool isFixed() const noexcept
411 {
412 return isFixed_;
413 }
414
415 constexpr PhysReg physReg() const noexcept
416 {
417 return reg_;
418 }
419
420 constexpr void setFixed(PhysReg reg) noexcept
421 {
422 isFixed_ = reg != unsigned(-1);
423 reg_ = reg;
424 }
425
426 constexpr bool isConstant() const noexcept
427 {
428 return isConstant_;
429 }
430
431 constexpr bool isLiteral() const noexcept
432 {
433 return isConstant() && reg_ == 255;
434 }
435
436 constexpr bool isUndefined() const noexcept
437 {
438 return isUndef_;
439 }
440
441 constexpr uint32_t constantValue() const noexcept
442 {
443 return data_.i;
444 }
445
446 constexpr bool constantEquals(uint32_t cmp) const noexcept
447 {
448 return isConstant() && constantValue() == cmp;
449 }
450
451 constexpr void setKill(bool flag) noexcept
452 {
453 isKill_ = flag;
454 if (!flag)
455 setFirstKill(false);
456 }
457
458 constexpr bool isKill() const noexcept
459 {
460 return isKill_ || isFirstKill();
461 }
462
463 constexpr void setFirstKill(bool flag) noexcept
464 {
465 isFirstKill_ = flag;
466 if (flag)
467 setKill(flag);
468 }
469
470 /* When there are multiple operands killing the same temporary,
471 * isFirstKill() is only returns true for the first one. */
472 constexpr bool isFirstKill() const noexcept
473 {
474 return isFirstKill_;
475 }
476
477 private:
478 union {
479 uint32_t i;
480 float f;
481 Temp temp = Temp(0, s1);
482 } data_;
483 PhysReg reg_;
484 union {
485 struct {
486 uint8_t isTemp_:1;
487 uint8_t isFixed_:1;
488 uint8_t isConstant_:1;
489 uint8_t isKill_:1;
490 uint8_t isUndef_:1;
491 uint8_t isFirstKill_:1;
492 uint8_t is64BitConst_:1;
493 };
494 /* can't initialize bit-fields in c++11, so work around using a union */
495 uint8_t control_ = 0;
496 };
497 };
498
499 /**
500 * Definition Class
501 * Definitions are the results of Instructions
502 * and refer to temporary virtual registers
503 * which are later mapped to physical registers
504 */
505 class Definition final
506 {
507 public:
508 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
509 Definition(uint32_t index, RegClass type) noexcept
510 : temp(index, type) {}
511 explicit Definition(Temp tmp) noexcept
512 : temp(tmp) {}
513 Definition(PhysReg reg, RegClass type) noexcept
514 : temp(Temp(0, type))
515 {
516 setFixed(reg);
517 }
518 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
519 : temp(Temp(tmpId, type))
520 {
521 setFixed(reg);
522 }
523
524 constexpr bool isTemp() const noexcept
525 {
526 return tempId() > 0;
527 }
528
529 constexpr Temp getTemp() const noexcept
530 {
531 return temp;
532 }
533
534 constexpr uint32_t tempId() const noexcept
535 {
536 return temp.id();
537 }
538
539 constexpr void setTemp(Temp t) noexcept {
540 temp = t;
541 }
542
543 constexpr RegClass regClass() const noexcept
544 {
545 return temp.regClass();
546 }
547
548 constexpr unsigned size() const noexcept
549 {
550 return temp.size();
551 }
552
553 constexpr bool isFixed() const noexcept
554 {
555 return isFixed_;
556 }
557
558 constexpr PhysReg physReg() const noexcept
559 {
560 return reg_;
561 }
562
563 constexpr void setFixed(PhysReg reg) noexcept
564 {
565 isFixed_ = 1;
566 reg_ = reg;
567 }
568
569 constexpr void setHint(PhysReg reg) noexcept
570 {
571 hasHint_ = 1;
572 reg_ = reg;
573 }
574
575 constexpr bool hasHint() const noexcept
576 {
577 return hasHint_;
578 }
579
580 constexpr void setKill(bool flag) noexcept
581 {
582 isKill_ = flag;
583 }
584
585 constexpr bool isKill() const noexcept
586 {
587 return isKill_;
588 }
589
590 private:
591 Temp temp = Temp(0, s1);
592 PhysReg reg_;
593 union {
594 struct {
595 uint8_t isFixed_:1;
596 uint8_t hasHint_:1;
597 uint8_t isKill_:1;
598 };
599 /* can't initialize bit-fields in c++11, so work around using a union */
600 uint8_t control_ = 0;
601 };
602 };
603
604 class Block;
605
606 struct Instruction {
607 aco_opcode opcode;
608 Format format;
609 uint32_t pass_flags;
610
611 aco::span<Operand> operands;
612 aco::span<Definition> definitions;
613
614 constexpr bool isVALU() const noexcept
615 {
616 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
617 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
618 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
619 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
620 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
621 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
622 }
623
624 constexpr bool isSALU() const noexcept
625 {
626 return format == Format::SOP1 ||
627 format == Format::SOP2 ||
628 format == Format::SOPC ||
629 format == Format::SOPK ||
630 format == Format::SOPP;
631 }
632
633 constexpr bool isVMEM() const noexcept
634 {
635 return format == Format::MTBUF ||
636 format == Format::MUBUF ||
637 format == Format::MIMG;
638 }
639
640 constexpr bool isDPP() const noexcept
641 {
642 return (uint16_t) format & (uint16_t) Format::DPP;
643 }
644
645 constexpr bool isVOP3() const noexcept
646 {
647 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
648 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
649 format == Format::VOP3P;
650 }
651
652 constexpr bool isSDWA() const noexcept
653 {
654 return (uint16_t) format & (uint16_t) Format::SDWA;
655 }
656
657 constexpr bool isFlatOrGlobal() const noexcept
658 {
659 return format == Format::FLAT || format == Format::GLOBAL;
660 }
661
662 constexpr bool usesModifiers() const noexcept;
663
664 constexpr bool reads_exec() const noexcept
665 {
666 for (const Operand& op : operands) {
667 if (op.isFixed() && op.physReg() == exec)
668 return true;
669 }
670 return false;
671 }
672 };
673
674 struct SOPK_instruction : public Instruction {
675 uint16_t imm;
676 };
677
678 struct SOPP_instruction : public Instruction {
679 uint32_t imm;
680 int block;
681 };
682
683 struct SOPC_instruction : public Instruction {
684 };
685
686 struct SOP1_instruction : public Instruction {
687 };
688
689 struct SOP2_instruction : public Instruction {
690 };
691
692 /**
693 * Scalar Memory Format:
694 * For s_(buffer_)load_dword*:
695 * Operand(0): SBASE - SGPR-pair which provides base address
696 * Operand(1): Offset - immediate (un)signed offset or SGPR
697 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
698 * Operand(n-1): SOffset - SGPR offset (Vega only)
699 *
700 * Having no operands is also valid for instructions such as s_dcache_inv.
701 *
702 */
703 struct SMEM_instruction : public Instruction {
704 bool glc; /* VI+: globally coherent */
705 bool dlc; /* NAVI: device level coherent */
706 bool nv; /* VEGA only: Non-volatile */
707 bool can_reorder;
708 bool disable_wqm;
709 barrier_interaction barrier;
710 };
711
712 struct VOP1_instruction : public Instruction {
713 };
714
715 struct VOP2_instruction : public Instruction {
716 };
717
718 struct VOPC_instruction : public Instruction {
719 };
720
721 struct VOP3A_instruction : public Instruction {
722 bool abs[3];
723 bool opsel[4];
724 bool clamp;
725 unsigned omod;
726 bool neg[3];
727 };
728
729 /**
730 * Data Parallel Primitives Format:
731 * This format can be used for VOP1, VOP2 or VOPC instructions.
732 * The swizzle applies to the src0 operand.
733 *
734 */
735 struct DPP_instruction : public Instruction {
736 uint16_t dpp_ctrl;
737 uint8_t row_mask;
738 uint8_t bank_mask;
739 bool abs[2];
740 bool neg[2];
741 bool bound_ctrl;
742 };
743
744 struct Interp_instruction : public Instruction {
745 unsigned attribute;
746 unsigned component;
747 };
748
749 /**
750 * Local and Global Data Sharing instructions
751 * Operand(0): ADDR - VGPR which supplies the address.
752 * Operand(1): DATA0 - First data VGPR.
753 * Operand(2): DATA1 - Second data VGPR.
754 * Operand(n-1): M0 - LDS size.
755 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
756 *
757 */
758 struct DS_instruction : public Instruction {
759 int16_t offset0;
760 int8_t offset1;
761 bool gds;
762 };
763
764 /**
765 * Vector Memory Untyped-buffer Instructions
766 * Operand(0): VADDR - Address source. Can carry an index and/or offset
767 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
768 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
769 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
770 *
771 */
772 struct MUBUF_instruction : public Instruction {
773 unsigned offset; /* Unsigned byte offset - 12 bit */
774 bool offen; /* Supply an offset from VGPR (VADDR) */
775 bool idxen; /* Supply an index from VGPR (VADDR) */
776 bool glc; /* globally coherent */
777 bool dlc; /* NAVI: device level coherent */
778 bool slc; /* system level coherent */
779 bool tfe; /* texture fail enable */
780 bool lds; /* Return read-data to LDS instead of VGPRs */
781 bool disable_wqm; /* Require an exec mask without helper invocations */
782 bool can_reorder;
783 barrier_interaction barrier;
784 };
785
786 /**
787 * Vector Memory Typed-buffer Instructions
788 * Operand(0): VADDR - Address source. Can carry an index and/or offset
789 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
790 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
791 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
792 *
793 */
794 struct MTBUF_instruction : public Instruction {
795 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
796 uint8_t nfmt : 3; /* Numeric format of data in memory */
797 unsigned offset; /* Unsigned byte offset - 12 bit */
798 bool offen; /* Supply an offset from VGPR (VADDR) */
799 bool idxen; /* Supply an index from VGPR (VADDR) */
800 bool glc; /* globally coherent */
801 bool dlc; /* NAVI: device level coherent */
802 bool slc; /* system level coherent */
803 bool tfe; /* texture fail enable */
804 bool disable_wqm; /* Require an exec mask without helper invocations */
805 bool can_reorder;
806 barrier_interaction barrier;
807 };
808
809 /**
810 * Vector Memory Image Instructions
811 * Operand(0): VADDR - Address source. Can carry an offset or an index.
812 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
813 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
814 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
815 *
816 */
817 struct MIMG_instruction : public Instruction {
818 unsigned dmask; /* Data VGPR enable mask */
819 unsigned dim; /* NAVI: dimensionality */
820 bool unrm; /* Force address to be un-normalized */
821 bool dlc; /* NAVI: device level coherent */
822 bool glc; /* globally coherent */
823 bool slc; /* system level coherent */
824 bool tfe; /* texture fail enable */
825 bool da; /* declare an array */
826 bool lwe; /* Force data to be un-normalized */
827 bool r128; /* NAVI: Texture resource size */
828 bool a16; /* VEGA, NAVI: Address components are 16-bits */
829 bool d16; /* Convert 32-bit data to 16-bit data */
830 bool disable_wqm; /* Require an exec mask without helper invocations */
831 bool can_reorder;
832 barrier_interaction barrier;
833 };
834
835 /**
836 * Flat/Scratch/Global Instructions
837 * Operand(0): ADDR
838 * Operand(1): SADDR
839 * Operand(2) / Definition(0): DATA/VDST
840 *
841 */
842 struct FLAT_instruction : public Instruction {
843 uint16_t offset; /* Vega/Navi only */
844 bool slc; /* system level coherent */
845 bool glc; /* globally coherent */
846 bool dlc; /* NAVI: device level coherent */
847 bool lds;
848 bool nv;
849 bool disable_wqm; /* Require an exec mask without helper invocations */
850 bool can_reorder;
851 barrier_interaction barrier;
852 };
853
854 struct Export_instruction : public Instruction {
855 unsigned enabled_mask;
856 unsigned dest;
857 bool compressed;
858 bool done;
859 bool valid_mask;
860 };
861
862 struct Pseudo_instruction : public Instruction {
863 bool tmp_in_scc;
864 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
865 };
866
867 struct Pseudo_branch_instruction : public Instruction {
868 /* target[0] is the block index of the branch target.
869 * For conditional branches, target[1] contains the fall-through alternative.
870 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
871 */
872 uint32_t target[2];
873 };
874
875 struct Pseudo_barrier_instruction : public Instruction {
876 };
877
/* Reduction operations; every arithmetic/logic operation comes as a
 * 32-bit/64-bit pair, numbered consecutively starting at 0. */
enum ReduceOp {
   /* integer and float add / multiply */
   iadd32,
   iadd64,
   imul32,
   imul64,
   fadd32,
   fadd64,
   fmul32,
   fmul64,
   /* signed, unsigned and float min / max */
   imin32,
   imin64,
   imax32,
   imax64,
   umin32,
   umin64,
   umax32,
   umax64,
   fmin32,
   fmin64,
   fmax32,
   fmax64,
   /* bitwise operations */
   iand32,
   iand64,
   ior32,
   ior64,
   ixor32,
   ixor64,
   /* not a pair: single trailing entry */
   gfx10_wave64_bpermute
};
894
895 /**
896 * Subgroup Reduction Instructions, everything except for the data to be
897 * reduced and the result as inserted by setup_reduce_temp().
898 * Operand(0): data to be reduced
899 * Operand(1): reduce temporary
900 * Operand(2): vector temporary
901 * Definition(0): result
902 * Definition(1): scalar temporary
903 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
904 * Definition(3): scc clobber
905 * Definition(4): vcc clobber
906 *
907 */
908 struct Pseudo_reduction_instruction : public Instruction {
909 ReduceOp reduce_op;
910 unsigned cluster_size; // must be 0 for scans
911 };
912
/* Deleter that releases its argument with free(). */
struct instr_deleter_functor {
   void operator()(void* p) { free(p); }
};
918
919 template<typename T>
920 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
921
922 template<typename T>
923 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
924 {
925 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
926 char *data = (char*) calloc(1, size);
927 T* inst = (T*) data;
928
929 inst->opcode = opcode;
930 inst->format = format;
931
932 inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
933 inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);
934
935 return inst;
936 }
937
938 constexpr bool Instruction::usesModifiers() const noexcept
939 {
940 if (isDPP() || isSDWA())
941 return true;
942 if (!isVOP3())
943 return false;
944 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
945 for (unsigned i = 0; i < operands.size(); i++) {
946 if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
947 return true;
948 }
949 return vop3->opsel[3] || vop3->clamp || vop3->omod;
950 }
951
952 constexpr bool is_phi(Instruction* instr)
953 {
954 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
955 }
956
957 static inline bool is_phi(aco_ptr<Instruction>& instr)
958 {
959 return is_phi(instr.get());
960 }
961
962 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
963 {
964 switch (instr->format) {
965 case Format::SMEM:
966 return static_cast<SMEM_instruction*>(instr)->barrier;
967 case Format::MUBUF:
968 return static_cast<MUBUF_instruction*>(instr)->barrier;
969 case Format::MIMG:
970 return static_cast<MIMG_instruction*>(instr)->barrier;
971 case Format::FLAT:
972 case Format::GLOBAL:
973 case Format::SCRATCH:
974 return static_cast<FLAT_instruction*>(instr)->barrier;
975 case Format::DS:
976 return barrier_shared;
977 default:
978 return barrier_none;
979 }
980 }
981
/* Block property bits; may be OR'ed together. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 0x0001,
   block_kind_top_level = 0x0002,
   block_kind_loop_preheader = 0x0004,
   block_kind_loop_header = 0x0008,
   block_kind_loop_exit = 0x0010,
   block_kind_continue = 0x0020,
   block_kind_break = 0x0040,
   block_kind_continue_or_break = 0x0080,
   block_kind_discard = 0x0100,
   block_kind_branch = 0x0200,
   block_kind_merge = 0x0400,
   block_kind_invert = 0x0800,
   block_kind_uses_discard_if = 0x1000,
   block_kind_needs_lowering = 0x2000,
   block_kind_uses_demote = 0x4000,
};
1001
1002
1003 struct RegisterDemand {
1004 constexpr RegisterDemand() = default;
1005 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1006 : vgpr{v}, sgpr{s} {}
1007 int16_t vgpr = 0;
1008 int16_t sgpr = 0;
1009
1010 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1011 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1012 }
1013
1014 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1015 return vgpr > other.vgpr || sgpr > other.sgpr;
1016 }
1017
1018 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1019 if (t.type() == RegType::sgpr)
1020 return RegisterDemand( vgpr, sgpr + t.size() );
1021 else
1022 return RegisterDemand( vgpr + t.size(), sgpr );
1023 }
1024
1025 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1026 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1027 }
1028
1029 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1030 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1031 }
1032
1033 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1034 vgpr += other.vgpr;
1035 sgpr += other.sgpr;
1036 return *this;
1037 }
1038
1039 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1040 vgpr -= other.vgpr;
1041 sgpr -= other.sgpr;
1042 return *this;
1043 }
1044
1045 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1046 if (t.type() == RegType::sgpr)
1047 sgpr += t.size();
1048 else
1049 vgpr += t.size();
1050 return *this;
1051 }
1052
1053 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1054 if (t.type() == RegType::sgpr)
1055 sgpr -= t.size();
1056 else
1057 vgpr -= t.size();
1058 return *this;
1059 }
1060
1061 constexpr void update(const RegisterDemand other) noexcept {
1062 vgpr = std::max(vgpr, other.vgpr);
1063 sgpr = std::max(sgpr, other.sgpr);
1064 }
1065
1066 };
1067
1068 /* CFG */
1069 struct Block {
1070 float_mode fp_mode;
1071 unsigned index;
1072 unsigned offset = 0;
1073 std::vector<aco_ptr<Instruction>> instructions;
1074 std::vector<unsigned> logical_preds;
1075 std::vector<unsigned> linear_preds;
1076 std::vector<unsigned> logical_succs;
1077 std::vector<unsigned> linear_succs;
1078 RegisterDemand register_demand = RegisterDemand();
1079 uint16_t loop_nest_depth = 0;
1080 uint16_t kind = 0;
1081 int logical_idom = -1;
1082 int linear_idom = -1;
1083 Temp live_out_exec = Temp();
1084
1085 /* this information is needed for predecessors to blocks with phis when
1086 * moving out of ssa */
1087 bool scc_live_out = false;
1088 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1089
1090 Block(unsigned idx) : index(idx) {}
1091 Block() : index(0) {}
1092 };
1093
/* A Stage value combines one or more software stage bits with exactly one
 * hardware stage bit. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
/* Runs on the ES hardware stage, like vertex_es (this used to say hw_gs,
 * which contradicted both the name and the "before geometry" role). */
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1135
1136 class Program final {
1137 public:
1138 float_mode next_fp_mode;
1139 std::vector<Block> blocks;
1140 RegisterDemand max_reg_demand = RegisterDemand();
1141 uint16_t num_waves = 0;
1142 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1143 ac_shader_config* config;
1144 struct radv_shader_info *info;
1145 enum chip_class chip_class;
1146 enum radeon_family family;
1147 unsigned wave_size;
1148 RegClass lane_mask;
1149 Stage stage; /* Stage */
1150 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1151 bool needs_wqm = false; /* there exists a p_wqm instruction */
1152 bool wb_smem_l1_on_end = false;
1153
1154 std::vector<uint8_t> constant_data;
1155 Temp private_segment_buffer;
1156 Temp scratch_offset;
1157
1158 uint16_t lds_alloc_granule;
1159 uint32_t lds_limit; /* in bytes */
1160 uint16_t vgpr_limit;
1161 uint16_t sgpr_limit;
1162 uint16_t physical_sgprs;
1163 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1164 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1165
1166 bool needs_vcc = false;
1167 bool needs_xnack_mask = false;
1168 bool needs_flat_scr = false;
1169
1170 uint32_t allocateId()
1171 {
1172 assert(allocationID <= 16777215);
1173 return allocationID++;
1174 }
1175
1176 uint32_t peekAllocationId()
1177 {
1178 return allocationID;
1179 }
1180
1181 void setAllocationId(uint32_t id)
1182 {
1183 allocationID = id;
1184 }
1185
1186 Block* create_and_insert_block() {
1187 blocks.emplace_back(blocks.size());
1188 blocks.back().fp_mode = next_fp_mode;
1189 return &blocks.back();
1190 }
1191
1192 Block* insert_block(Block&& block) {
1193 block.index = blocks.size();
1194 block.fp_mode = next_fp_mode;
1195 blocks.emplace_back(std::move(block));
1196 return &blocks.back();
1197 }
1198
1199 private:
1200 uint32_t allocationID = 1;
1201 };
1202
1203 struct live {
1204 /* live temps out per block */
1205 std::vector<std::set<Temp>> live_out;
1206 /* register demand (sgpr/vgpr) per instruction per block */
1207 std::vector<std::vector<RegisterDemand>> register_demand;
1208 };
1209
1210 void select_program(Program *program,
1211 unsigned shader_count,
1212 struct nir_shader *const *shaders,
1213 ac_shader_config* config,
1214 struct radv_shader_args *args);
1215
1216 void lower_wqm(Program* program, live& live_vars,
1217 const struct radv_nir_compiler_options *options);
1218 void lower_bool_phis(Program* program);
1219 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1220 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1221 std::vector<uint16_t> dead_code_analysis(Program *program);
1222 void dominator_tree(Program* program);
1223 void insert_exec_mask(Program *program);
1224 void value_numbering(Program* program);
1225 void optimize(Program* program);
1226 void setup_reduce_temp(Program* program);
1227 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1228 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1229 void ssa_elimination(Program* program);
1230 void lower_to_hw_instr(Program* program);
1231 void schedule_program(Program* program, live& live_vars);
1232 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1233 void insert_wait_states(Program* program);
1234 void insert_NOPs(Program* program);
1235 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1236 void print_asm(Program *program, std::vector<uint32_t>& binary,
1237 unsigned exec_size, std::ostream& out);
1238 void validate(Program* program, FILE *output);
1239 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1240 #ifndef NDEBUG
1241 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1242 #else
1243 #define perfwarn(program, cond, msg, ...) do {} while(0)
1244 #endif
1245
1246 void aco_print_instr(Instruction *instr, FILE *output);
1247 void aco_print_program(Program *program, FILE *output);
1248
1249 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1250 uint16_t get_extra_sgprs(Program *program);
1251
1252 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1253 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1254 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1255
1256 /* return number of addressable sgprs/vgprs for max_waves */
1257 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1258 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1259
1260 typedef struct {
1261 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1262 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1263 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1264 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1265 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1266 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1267 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1268 } Info;
1269
1270 extern const Info instr_info;
1271
1272 }
1273
1274 #endif /* ACO_IR_H */
1275