aco: add new addr64 bit to MUBUF instructions on GFX6-GFX7
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
/* Bitmask values for the global debug_flags variable declared above. */
enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2, /* RA = register allocation */
   DEBUG_PERFWARN = 0x4,
};
52
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats: single bits so they can be OR'd with the base
    * encodings above (see the note at the top of this enum). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   /* VOP3, VOP3A and VOP3B deliberately share one bit; they are
    * distinguished by opcode, not by format. */
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
104
/* Bitmask describing which kinds of memory a memory instruction may
 * interact with; used to limit instruction reordering across barriers. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* number of barrier bits above */
   barrier_count = 4,
};
113
/* Floating-point rounding modes; values match the hardware MODE register
 * encoding (see float_mode below). */
enum fp_round {
   fp_round_ne = 0, /* nearest even */
   fp_round_pi = 1, /* toward +infinity */
   fp_round_ni = 2, /* toward -infinity */
   fp_round_tz = 3, /* toward zero */
};
120
/* Denormal handling modes; values match the hardware MODE register encoding. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
127
/* Per-block floating-point state: the hardware MODE register bits plus
 * flags describing which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead. That requires an identical MODE register value and
    * that this mode is at least as strict in every flag "other" sets. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
160
161 constexpr Format asVOP3(Format format) {
162 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
163 };
164
/* The register file a temporary lives in. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr, /* VGPR with linear (non-divergent) lifetime, e.g. for WWM */
};
171
/* Register class: size in dwords plus register type, packed into one byte.
 * Encoding: bits 0-4 = size, bit 5 = vgpr, bit 6 = linear vgpr. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* all values <= s16 have the vgpr bit clear, so they are sgprs */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   /* sgprs are always linear; vgprs only if the linear bit is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
212
213 /* transitional helper expressions */
214 static constexpr RegClass s1{RegClass::s1};
215 static constexpr RegClass s2{RegClass::s2};
216 static constexpr RegClass s3{RegClass::s3};
217 static constexpr RegClass s4{RegClass::s4};
218 static constexpr RegClass s8{RegClass::s8};
219 static constexpr RegClass s16{RegClass::s16};
220 static constexpr RegClass v1{RegClass::v1};
221 static constexpr RegClass v2{RegClass::v2};
222 static constexpr RegClass v3{RegClass::v3};
223 static constexpr RegClass v4{RegClass::v4};
224 static constexpr RegClass v5{RegClass::v5};
225 static constexpr RegClass v6{RegClass::v6};
226 static constexpr RegClass v7{RegClass::v7};
227 static constexpr RegClass v8{RegClass::v8};
228
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   /* convenience forwarders to the register class */
   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   /* comparisons are by SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_:24; /* ids are limited to 24 bits so a Temp fits in 4 bytes */
   RegClass reg_class;
};
255
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg(r) {}
   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};
268
/* helper expressions for special registers (SGPR encoding values) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126}; /* same encoding as exec */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};
277
278 /**
279 * Operand Class
280 * Initially, each Operand refers to either
281 * a temporary virtual register
282 * or to a constant value
283 * Temporary registers get mapped to physical register during RA
284 * Constant values are inlined into the instruction sequence.
285 */
class Operand final
{
public:
   /* default: an undefined operand fixed to the inline-constant-zero
    * register (128 encodes the inline constant 0) */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         /* id 0 means "no temporary": this is an undefined operand */
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant operand. The fixed register encodes the hardware's
    * inline constants: 128+v for integers 0..64, 193..208 for -1..-16,
    * 240..247 for the common float values, 255 for a literal that must be
    * emitted as an extra dword. */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         /* unsigned wrap-around: 192 - v yields 193..208 */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant operand: only representable if the value maps to one of
    * the hardware's 64-bit inline constants (small integers or the listed
    * double bit-patterns); 64-bit literals do not exist. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* undefined operand of a given register class */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* fixed-register operand without an SSA temporary (e.g. exec, m0) */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   /* size in dwords; for constants derived from the 64-bit flag */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a literal is a constant that doesn't fit an inline encoding (reg 255) */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
511
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition pre-assigned to a physical register (SSA id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* SSA id 0 is reserved for "no temporary" */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* a hint is a preferred register for RA; note it shares reg_ with
    * setFixed(), so a later setFixed() overwrites the hint register */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
616
617 class Block;
618
/* Base class of all instructions; the operand/definition arrays live in the
 * same allocation directly after the most-derived struct
 * (see create_instruction()). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch space for individual passes */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* The VALU checks test single format bits because VOP1/VOP2/VOPC can be
    * combined with VOP3/DPP/SDWA bits (see the Format enum). */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory only; FLAT/GLOBAL/SCRATCH are tested separately */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* only detects operands explicitly fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
686
/* SOPK: scalar op with 16-bit immediate constant */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
};

/* SOPP: scalar program-control op (branches, waitcnt, ...) */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* branch target block index, if any */
};

struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
704
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1; /* whether the scheduler may move this instruction */
   bool disable_wqm : 1;
   barrier_interaction barrier;
};
724
/* plain VOP1/VOP2/VOPC encodings carry no extra fields */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};
733
/* VOP3 encoding: adds input/output modifiers to a VALU instruction */
struct VOP3A_instruction : public Instruction {
   bool abs[3]; /* per-operand absolute-value modifier */
   bool neg[3]; /* per-operand negate modifier */
   uint8_t opsel : 4;
   uint8_t omod : 2; /* output modifier */
   bool clamp : 1;
};
741
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl; /* lane-swizzle control */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
};
756
/* VINTRP: vertex parameter interpolation */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
};
761
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   /* NOTE(review): offsets are signed here although the hardware fields look
    * like unsigned byte offsets — confirm against the emitter before use */
   int16_t offset0;
   int8_t offset1;
   bool gds; /* use GDS instead of LDS */
};
776
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   barrier_interaction barrier;
};
799
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   barrier_interaction barrier;
};
822
/**
 * Vector Memory Image Instructions
 * Operand(0): VADDR - Address source. Can carry an offset or an index.
 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* Force data to be un-normalized */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   barrier_interaction barrier;
};
848
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   barrier_interaction barrier;
};
867
/* EXP: export to a render target, z-buffer or parameter */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which of the four data components are written */
   uint8_t dest; /* export target */
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
};
875
/* generic pseudo instruction, lowered before final emission */
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
880
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
888
/* barrier pseudo instruction; carries no extra fields */
struct Pseudo_barrier_instruction : public Instruction {
};
891
/* Reduction operations, each in a 32-bit and a 64-bit variant. */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute
};
908
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
926
/* Instructions are allocated with calloc() (see create_instruction() below),
 * so they must be released with free() rather than delete. */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* owning instruction pointer with the matching free()-based deleter */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
935
/* Allocates a zero-initialized instruction of type T with its operand and
 * definition arrays placed in the same allocation, directly after T.
 * The result must be freed with free() (see instr_deleter_functor/aco_ptr). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   /* layout: [ T | num_operands * Operand | num_definitions * Definition ] */
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span takes a 16-bit offset relative to the span object itself
    * rather than a pointer — presumably to keep Instruction small; confirm
    * against aco_util.h */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
953
/* Returns true if the instruction uses any input/output modifiers
 * (DPP/SDWA always count; VOP3 counts if abs/neg/opsel/clamp/omod is set). */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   /* NOTE(review): abs[]/neg[] have 3 entries — assumes VOP3 instructions
    * never have more than 3 operands; confirm at call sites */
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->neg[i])
         return true;
   }
   return vop3->opsel || vop3->clamp || vop3->omod;
}
967
968 constexpr bool is_phi(Instruction* instr)
969 {
970 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
971 }
972
/* convenience overload for owning pointers */
static inline bool is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}
977
/* Returns which memory types the instruction interacts with; formats that
 * carry no barrier field default to barrier_none (DS always counts as
 * shared memory). */
constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   default:
      return barrier_none;
   }
}
997
998 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
999
/* Bitmask of properties of a basic block (stored in Block::kind). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};
1019
1020
/* Number of vgprs and sgprs required at some program point; supports
 * element-wise arithmetic and adding/removing single temporaries. */
struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
      : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   /* true if either register file exceeds the other demand */
   constexpr bool exceeds(const RegisterDemand other) const noexcept {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   /* add the size of a temporary to the demand of its register file */
   constexpr RegisterDemand operator+(const Temp t) const noexcept {
      if (t.type() == RegType::sgpr)
         return RegisterDemand( vgpr, sgpr + t.size() );
      else
         return RegisterDemand( vgpr + t.size(), sgpr );
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   /* element-wise maximum */
   constexpr void update(const RegisterDemand other) noexcept {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }

};
1085
/* CFG */
/* A basic block: instruction list plus predecessor/successor edges of both
 * the logical (per-lane) and the linear (whole-wave) CFG. */
struct Block {
   float_mode fp_mode;
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* mask of block_kind bits */
   int logical_idom = -1; /* immediate dominator index; -1 = none */
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1111
/* A Stage is a bitmask combining the API-level (software) shader stages merged
 * into one program with the hardware stage the program runs as. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f; /* all six sw_* bits */

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6; /* all seven hw_* bits */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
/* Fixed: was sw_tes | hw_gs, but a TES feeding the geometry stage runs as the
 * hardware ES (export) stage, exactly like vertex_es above. */
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1153
/* A whole shader program: the CFG plus all per-program compilation state. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode given to blocks created by create_and_insert_block()/insert_block() */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand(); /* peak demand over the program -- TODO confirm with update_vgpr_sgpr_demand */
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size; /* lanes per wave (32 or 64 on these chips -- TODO confirm at init site) */
   RegClass lane_mask; /* register class sized to hold one bit per lane */
   Stage stage; /* combination of sw_* and hw_* bits, see the Stage constants above */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */

   bool needs_vcc = false;
   bool needs_xnack_mask = false;
   bool needs_flat_scr = false;

   /* Hand out a fresh id; ids are capped at 2^24-1 (16777215). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* The id allocateId() would return next, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Reset the id counter to a known value. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Append a new empty block; its index and fp_mode are filled in here. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Append an existing block (taking ownership); its index and fp_mode are
    * overwritten to match this program. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* ids start at 1; 0 is presumably reserved -- TODO confirm */
};
1221
/* Result of liveness analysis (returned by live_var_analysis below). */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1228
/* instruction selection: build an ACO Program from the NIR shaders */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);

/* compilation passes and analyses operating on a Program */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
/* encode the program into machine code; returns the size written */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
/* NOTE(review): this release-mode macro takes a leading 'program' argument that
 * the debug declaration above does not -- confirm call sites pass matching
 * arguments for both build modes */
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1279
/* Per-opcode property tables, indexed by aco_opcode. Presumably generated
 * alongside aco_opcodes.h -- confirm against the opcode generator. */
typedef struct {
   /* hardware encodings per generation */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)]; /* human-readable mnemonic */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; /* encoding format, see aco::Format */
} Info;

/* the single shared instance of the opcode tables */
extern const Info instr_info;
1292
1293 }
1294
1295 #endif /* ACO_IR_H */
1296