aco: Initial GFX7 Support
src/amd/compiler/aco_ir.h (mesa.git)
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
47 enum {
48 DEBUG_VALIDATE = 0x1,
49 DEBUG_VALIDATE_RA = 0x2,
50 DEBUG_PERFWARN = 0x4,
51 };
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
62 enum class Format : std::uint16_t {
63 /* Pseudo Instruction Format */
64 PSEUDO = 0,
65 /* Scalar ALU & Control Formats */
66 SOP1 = 1,
67 SOP2 = 2,
68 SOPK = 3,
69 SOPP = 4,
70 SOPC = 5,
71 /* Scalar Memory Format */
72 SMEM = 6,
73 /* LDS/GDS Format */
74 DS = 8,
75 /* Vector Memory Buffer Formats */
76 MTBUF = 9,
77 MUBUF = 10,
78 /* Vector Memory Image Format */
79 MIMG = 11,
80 /* Export Format */
81 EXP = 12,
82 /* Flat Formats */
83 FLAT = 13,
84 GLOBAL = 14,
85 SCRATCH = 15,
86
87 PSEUDO_BRANCH = 16,
88 PSEUDO_BARRIER = 17,
89 PSEUDO_REDUCTION = 18,
90
91 /* Vector ALU Formats */
92 VOP1 = 1 << 8,
93 VOP2 = 1 << 9,
94 VOPC = 1 << 10,
95 VOP3 = 1 << 11,
96 VOP3A = 1 << 11,
97 VOP3B = 1 << 11,
98 VOP3P = 1 << 12,
99 /* Vector Parameter Interpolation Format */
100 VINTRP = 1 << 13,
101 DPP = 1 << 14,
102 SDWA = 1 << 15,
103 };
104
105 enum barrier_interaction {
106 barrier_none = 0,
107 barrier_buffer = 0x1,
108 barrier_image = 0x2,
109 barrier_atomic = 0x4,
110 barrier_shared = 0x8,
111 barrier_count = 4,
112 };
113
114 enum fp_round {
115 fp_round_ne = 0,
116 fp_round_pi = 1,
117 fp_round_ni = 2,
118 fp_round_tz = 3,
119 };
120
121 enum fp_denorm {
122 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
123 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
124 fp_denorm_flush = 0x0,
125 fp_denorm_keep = 0x3,
126 };
127
128 struct float_mode {
129 /* matches encoding of the MODE register */
130 union {
131 struct {
132 fp_round round32:2;
133 fp_round round16_64:2;
134 unsigned denorm32:2;
135 unsigned denorm16_64:2;
136 };
137 uint8_t val = 0;
138 };
139 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
140 bool preserve_signed_zero_inf_nan32:1;
141 bool preserve_signed_zero_inf_nan16_64:1;
142 /* if false, optimizations which may remove denormal flushing can be done */
143 bool must_flush_denorms32:1;
144 bool must_flush_denorms16_64:1;
145 bool care_about_round32:1;
146 bool care_about_round16_64:1;
147
148 /* Returns true if instructions using the mode "other" can safely use the
149 * current one instead. */
150 bool canReplace(float_mode other) const noexcept {
151 return val == other.val &&
152 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
153 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
154 (must_flush_denorms32 || !other.must_flush_denorms32) &&
155 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
156 (care_about_round32 || !other.care_about_round32) &&
157 (care_about_round16_64 || !other.care_about_round16_64);
158 }
159 };
160
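/* A minimal usage sketch for canReplace() (the helper name below is illustrative):
 * a mode that preserves more properties can stand in for one that preserves fewer,
 * as long as the MODE register bits are identical, but not the other way around. */
inline bool float_mode_can_replace_example()
{
   float_mode strict = float_mode();
   strict.preserve_signed_zero_inf_nan32 = true;
   float_mode relaxed = float_mode();
   return strict.canReplace(relaxed) && !relaxed.canReplace(strict);
}
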
161 constexpr Format asVOP3(Format format) {
162 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
163 };
164
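/* Illustrative checks of the combined-format encoding described above: promoting a
 * VOP2/VOP1 instruction to VOP3 keeps the original format bit set, so tests against
 * either format still match. */
static_assert((uint32_t) asVOP3(Format::VOP2) ==
              ((uint32_t) Format::VOP2 | (uint32_t) Format::VOP3), "");
static_assert(((uint32_t) asVOP3(Format::VOP1) & (uint32_t) Format::VOP1) != 0, "");
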
165 enum class RegType {
166 none = 0,
167 sgpr,
168 vgpr,
169 linear_vgpr,
170 };
171
172 struct RegClass {
173
174 enum RC : uint8_t {
175 s1 = 1,
176 s2 = 2,
177 s3 = 3,
178 s4 = 4,
179 s6 = 6,
180 s8 = 8,
181 s16 = 16,
182 v1 = s1 | (1 << 5),
183 v2 = s2 | (1 << 5),
184 v3 = s3 | (1 << 5),
185 v4 = s4 | (1 << 5),
186 v5 = 5 | (1 << 5),
187 v6 = 6 | (1 << 5),
188 v7 = 7 | (1 << 5),
189 v8 = 8 | (1 << 5),
190 /* these are used for WWM and spills to vgpr */
191 v1_linear = v1 | (1 << 6),
192 v2_linear = v2 | (1 << 6),
193 };
194
195 RegClass() = default;
196 constexpr RegClass(RC rc)
197 : rc(rc) {}
198 constexpr RegClass(RegType type, unsigned size)
199 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
200
201 constexpr operator RC() const { return rc; }
202 explicit operator bool() = delete;
203
204 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
205 constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
206 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
207 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
208
209 private:
210 RC rc;
211 };
212
213 /* transitional helper expressions */
214 static constexpr RegClass s1{RegClass::s1};
215 static constexpr RegClass s2{RegClass::s2};
216 static constexpr RegClass s3{RegClass::s3};
217 static constexpr RegClass s4{RegClass::s4};
218 static constexpr RegClass s8{RegClass::s8};
219 static constexpr RegClass s16{RegClass::s16};
220 static constexpr RegClass v1{RegClass::v1};
221 static constexpr RegClass v2{RegClass::v2};
222 static constexpr RegClass v3{RegClass::v3};
223 static constexpr RegClass v4{RegClass::v4};
224 static constexpr RegClass v5{RegClass::v5};
225 static constexpr RegClass v6{RegClass::v6};
226 static constexpr RegClass v7{RegClass::v7};
227 static constexpr RegClass v8{RegClass::v8};
228
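/* Illustrative checks of the RegClass encoding: the low five bits hold the size in
 * 32-bit registers, bit 5 marks VGPRs and bit 6 marks linear (WWM/spill) VGPRs. */
static_assert(RegClass(RegType::vgpr, 2) == v2, "");
static_assert(v2.size() == 2 && v2.type() == RegType::vgpr, "");
static_assert(s2.is_linear() && !v2.is_linear() && v2.as_linear() == RegClass::v2_linear, "");
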
229 /**
230 * Temp Class
231 * Each temporary virtual register has a
232 * register class (i.e. size and type)
233 * and an SSA id.
234 */
235 struct Temp {
236 Temp() = default;
237 constexpr Temp(uint32_t id, RegClass cls) noexcept
238 : id_(id), reg_class(cls) {}
239
240 constexpr uint32_t id() const noexcept { return id_; }
241 constexpr RegClass regClass() const noexcept { return reg_class; }
242
243 constexpr unsigned size() const noexcept { return reg_class.size(); }
244 constexpr RegType type() const noexcept { return reg_class.type(); }
245 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
246
247 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
248 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
249 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
250
251 private:
252 uint32_t id_:24;
253 RegClass reg_class;
254 };
255
256 /**
257 * PhysReg
258 * Represents the physical register for each
259 * Operand and Definition.
260 */
261 struct PhysReg {
262 constexpr PhysReg() = default;
263 explicit constexpr PhysReg(unsigned r) : reg(r) {}
264 constexpr operator unsigned() const { return reg; }
265
266 uint16_t reg = 0;
267 };
268
269 /* helper expressions for special registers */
270 static constexpr PhysReg m0{124};
271 static constexpr PhysReg vcc{106};
272 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
273 static constexpr PhysReg exec{126};
274 static constexpr PhysReg exec_lo{126};
275 static constexpr PhysReg exec_hi{127};
276 static constexpr PhysReg scc{253};
277
278 /**
279 * Operand Class
280 * Initially, each Operand refers to either
281 * a temporary virtual register
282 * or to a constant value.
283 * Temporary registers get mapped to physical registers during RA.
284 * Constant values are inlined into the instruction sequence.
285 */
286 class Operand final
287 {
288 public:
289 constexpr Operand()
290 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
291 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
292
293 explicit Operand(Temp r) noexcept
294 {
295 data_.temp = r;
296 if (r.id()) {
297 isTemp_ = true;
298 } else {
299 isUndef_ = true;
300 setFixed(PhysReg{128});
301 }
302 };
303 explicit Operand(uint32_t v) noexcept
304 {
305 data_.i = v;
306 isConstant_ = true;
307 if (v <= 64)
308 setFixed(PhysReg{128 + v});
309 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
310 setFixed(PhysReg{192 - v});
311 else if (v == 0x3f000000) /* 0.5 */
312 setFixed(PhysReg{240});
313 else if (v == 0xbf000000) /* -0.5 */
314 setFixed(PhysReg{241});
315 else if (v == 0x3f800000) /* 1.0 */
316 setFixed(PhysReg{242});
317 else if (v == 0xbf800000) /* -1.0 */
318 setFixed(PhysReg{243});
319 else if (v == 0x40000000) /* 2.0 */
320 setFixed(PhysReg{244});
321 else if (v == 0xc0000000) /* -2.0 */
322 setFixed(PhysReg{245});
323 else if (v == 0x40800000) /* 4.0 */
324 setFixed(PhysReg{246});
325 else if (v == 0xc0800000) /* -4.0 */
326 setFixed(PhysReg{247});
327 else if (v == 0x3e22f983) /* 1/(2*PI) */
328 setFixed(PhysReg{248});
329 else /* Literal Constant */
330 setFixed(PhysReg{255});
331 };
332 explicit Operand(uint64_t v) noexcept
333 {
334 isConstant_ = true;
335 is64BitConst_ = true;
336 if (v <= 64)
337 setFixed(PhysReg{128 + (uint32_t) v});
338 else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
339 setFixed(PhysReg{192 - (uint32_t) v});
340 else if (v == 0x3FE0000000000000) /* 0.5 */
341 setFixed(PhysReg{240});
342 else if (v == 0xBFE0000000000000) /* -0.5 */
343 setFixed(PhysReg{241});
344 else if (v == 0x3FF0000000000000) /* 1.0 */
345 setFixed(PhysReg{242});
346 else if (v == 0xBFF0000000000000) /* -1.0 */
347 setFixed(PhysReg{243});
348 else if (v == 0x4000000000000000) /* 2.0 */
349 setFixed(PhysReg{244});
350 else if (v == 0xC000000000000000) /* -2.0 */
351 setFixed(PhysReg{245});
352 else if (v == 0x4010000000000000) /* 4.0 */
353 setFixed(PhysReg{246});
354 else if (v == 0xC010000000000000) /* -4.0 */
355 setFixed(PhysReg{247});
356 else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
357 setFixed(PhysReg{248});
358 else { /* Literal Constant: we don't know if it is a long or double.*/
359 isConstant_ = 0;
360 assert(false && "attempt to create a 64-bit literal constant");
361 }
362 };
363 explicit Operand(RegClass type) noexcept
364 {
365 isUndef_ = true;
366 data_.temp = Temp(0, type);
367 setFixed(PhysReg{128});
368 };
369 explicit Operand(PhysReg reg, RegClass type) noexcept
370 {
371 data_.temp = Temp(0, type);
372 setFixed(reg);
373 }
374
375 constexpr bool isTemp() const noexcept
376 {
377 return isTemp_;
378 }
379
380 constexpr void setTemp(Temp t) noexcept {
381 assert(!isConstant_);
382 isTemp_ = true;
383 data_.temp = t;
384 }
385
386 constexpr Temp getTemp() const noexcept
387 {
388 return data_.temp;
389 }
390
391 constexpr uint32_t tempId() const noexcept
392 {
393 return data_.temp.id();
394 }
395
396 constexpr bool hasRegClass() const noexcept
397 {
398 return isTemp() || isUndefined();
399 }
400
401 constexpr RegClass regClass() const noexcept
402 {
403 return data_.temp.regClass();
404 }
405
406 constexpr unsigned size() const noexcept
407 {
408 if (isConstant())
409 return is64BitConst_ ? 2 : 1;
410 else
411 return data_.temp.size();
412 }
413
414 constexpr bool isFixed() const noexcept
415 {
416 return isFixed_;
417 }
418
419 constexpr PhysReg physReg() const noexcept
420 {
421 return reg_;
422 }
423
424 constexpr void setFixed(PhysReg reg) noexcept
425 {
426 isFixed_ = reg != unsigned(-1);
427 reg_ = reg;
428 }
429
430 constexpr bool isConstant() const noexcept
431 {
432 return isConstant_;
433 }
434
435 constexpr bool isLiteral() const noexcept
436 {
437 return isConstant() && reg_ == 255;
438 }
439
440 constexpr bool isUndefined() const noexcept
441 {
442 return isUndef_;
443 }
444
445 constexpr uint32_t constantValue() const noexcept
446 {
447 return data_.i;
448 }
449
450 constexpr bool constantEquals(uint32_t cmp) const noexcept
451 {
452 return isConstant() && constantValue() == cmp;
453 }
454
455 constexpr void setKill(bool flag) noexcept
456 {
457 isKill_ = flag;
458 if (!flag)
459 setFirstKill(false);
460 }
461
462 constexpr bool isKill() const noexcept
463 {
464 return isKill_ || isFirstKill();
465 }
466
467 constexpr void setFirstKill(bool flag) noexcept
468 {
469 isFirstKill_ = flag;
470 if (flag)
471 setKill(flag);
472 }
473
474 /* When there are multiple operands killing the same temporary,
475 * isFirstKill() only returns true for the first one. */
476 constexpr bool isFirstKill() const noexcept
477 {
478 return isFirstKill_;
479 }
480
481 private:
482 union {
483 uint32_t i;
484 float f;
485 Temp temp = Temp(0, s1);
486 } data_;
487 PhysReg reg_;
488 union {
489 struct {
490 uint8_t isTemp_:1;
491 uint8_t isFixed_:1;
492 uint8_t isConstant_:1;
493 uint8_t isKill_:1;
494 uint8_t isUndef_:1;
495 uint8_t isFirstKill_:1;
496 uint8_t is64BitConst_:1;
497 };
498 /* can't initialize bit-fields in c++11, so work around using a union */
499 uint8_t control_ = 0;
500 };
501 };
502
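/* Usage sketch for constant Operands (the helper name is illustrative): values with a
 * hardware inline encoding are fixed to one of the constant registers, anything else
 * becomes a literal (register 255) that occupies an extra dword in the encoding. */
inline bool operand_constant_example()
{
   Operand small(uint32_t(4));        /* inline constant 4    -> fixed to reg 132 */
   Operand one(uint32_t(0x3f800000)); /* inline constant 1.0f -> fixed to reg 242 */
   Operand lit(uint32_t(0x12345678)); /* no inline encoding   -> literal          */
   return small.physReg() == 132 && one.physReg() == 242 && lit.isLiteral();
}
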
503 /**
504 * Definition Class
505 * Definitions are the results of Instructions
506 * and refer to temporary virtual registers
507 * which are later mapped to physical registers
508 */
509 class Definition final
510 {
511 public:
512 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
513 Definition(uint32_t index, RegClass type) noexcept
514 : temp(index, type) {}
515 explicit Definition(Temp tmp) noexcept
516 : temp(tmp) {}
517 Definition(PhysReg reg, RegClass type) noexcept
518 : temp(Temp(0, type))
519 {
520 setFixed(reg);
521 }
522 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
523 : temp(Temp(tmpId, type))
524 {
525 setFixed(reg);
526 }
527
528 constexpr bool isTemp() const noexcept
529 {
530 return tempId() > 0;
531 }
532
533 constexpr Temp getTemp() const noexcept
534 {
535 return temp;
536 }
537
538 constexpr uint32_t tempId() const noexcept
539 {
540 return temp.id();
541 }
542
543 constexpr void setTemp(Temp t) noexcept {
544 temp = t;
545 }
546
547 constexpr RegClass regClass() const noexcept
548 {
549 return temp.regClass();
550 }
551
552 constexpr unsigned size() const noexcept
553 {
554 return temp.size();
555 }
556
557 constexpr bool isFixed() const noexcept
558 {
559 return isFixed_;
560 }
561
562 constexpr PhysReg physReg() const noexcept
563 {
564 return reg_;
565 }
566
567 constexpr void setFixed(PhysReg reg) noexcept
568 {
569 isFixed_ = 1;
570 reg_ = reg;
571 }
572
573 constexpr void setHint(PhysReg reg) noexcept
574 {
575 hasHint_ = 1;
576 reg_ = reg;
577 }
578
579 constexpr bool hasHint() const noexcept
580 {
581 return hasHint_;
582 }
583
584 constexpr void setKill(bool flag) noexcept
585 {
586 isKill_ = flag;
587 }
588
589 constexpr bool isKill() const noexcept
590 {
591 return isKill_;
592 }
593
594 private:
595 Temp temp = Temp(0, s1);
596 PhysReg reg_;
597 union {
598 struct {
599 uint8_t isFixed_:1;
600 uint8_t hasHint_:1;
601 uint8_t isKill_:1;
602 };
603 /* can't initialize bit-fields in c++11, so work around using a union */
604 uint8_t control_ = 0;
605 };
606 };
607
608 class Block;
609
610 struct Instruction {
611 aco_opcode opcode;
612 Format format;
613 uint32_t pass_flags;
614
615 aco::span<Operand> operands;
616 aco::span<Definition> definitions;
617
618 constexpr bool isVALU() const noexcept
619 {
620 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
621 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
622 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
623 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
624 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
625 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
626 }
627
628 constexpr bool isSALU() const noexcept
629 {
630 return format == Format::SOP1 ||
631 format == Format::SOP2 ||
632 format == Format::SOPC ||
633 format == Format::SOPK ||
634 format == Format::SOPP;
635 }
636
637 constexpr bool isVMEM() const noexcept
638 {
639 return format == Format::MTBUF ||
640 format == Format::MUBUF ||
641 format == Format::MIMG;
642 }
643
644 constexpr bool isDPP() const noexcept
645 {
646 return (uint16_t) format & (uint16_t) Format::DPP;
647 }
648
649 constexpr bool isVOP3() const noexcept
650 {
651 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
652 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
653 format == Format::VOP3P;
654 }
655
656 constexpr bool isSDWA() const noexcept
657 {
658 return (uint16_t) format & (uint16_t) Format::SDWA;
659 }
660
661 constexpr bool isFlatOrGlobal() const noexcept
662 {
663 return format == Format::FLAT || format == Format::GLOBAL;
664 }
665
666 constexpr bool usesModifiers() const noexcept;
667
668 constexpr bool reads_exec() const noexcept
669 {
670 for (const Operand& op : operands) {
671 if (op.isFixed() && op.physReg() == exec)
672 return true;
673 }
674 return false;
675 }
676 };
677
678 struct SOPK_instruction : public Instruction {
679 uint16_t imm;
680 };
681
682 struct SOPP_instruction : public Instruction {
683 uint32_t imm;
684 int block;
685 };
686
687 struct SOPC_instruction : public Instruction {
688 };
689
690 struct SOP1_instruction : public Instruction {
691 };
692
693 struct SOP2_instruction : public Instruction {
694 };
695
696 /**
697 * Scalar Memory Format:
698 * For s_(buffer_)load_dword*:
699 * Operand(0): SBASE - SGPR-pair which provides base address
700 * Operand(1): Offset - immediate (un)signed offset or SGPR
701 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
702 * Operand(n-1): SOffset - SGPR offset (Vega only)
703 *
704 * Having no operands is also valid for instructions such as s_dcache_inv.
705 *
706 */
707 struct SMEM_instruction : public Instruction {
708 bool glc; /* VI+: globally coherent */
709 bool dlc; /* NAVI: device level coherent */
710 bool nv; /* VEGA only: Non-volatile */
711 bool can_reorder;
712 bool disable_wqm;
713 barrier_interaction barrier;
714 };
715
716 struct VOP1_instruction : public Instruction {
717 };
718
719 struct VOP2_instruction : public Instruction {
720 };
721
722 struct VOPC_instruction : public Instruction {
723 };
724
725 struct VOP3A_instruction : public Instruction {
726 bool abs[3];
727 bool opsel[4];
728 bool clamp;
729 unsigned omod;
730 bool neg[3];
731 };
732
733 /**
734 * Data Parallel Primitives Format:
735 * This format can be used for VOP1, VOP2 or VOPC instructions.
736 * The swizzle applies to the src0 operand.
737 *
738 */
739 struct DPP_instruction : public Instruction {
740 uint16_t dpp_ctrl;
741 uint8_t row_mask;
742 uint8_t bank_mask;
743 bool abs[2];
744 bool neg[2];
745 bool bound_ctrl;
746 };
747
748 struct Interp_instruction : public Instruction {
749 unsigned attribute;
750 unsigned component;
751 };
752
753 /**
754 * Local and Global Data Sharing instructions
755 * Operand(0): ADDR - VGPR which supplies the address.
756 * Operand(1): DATA0 - First data VGPR.
757 * Operand(2): DATA1 - Second data VGPR.
758 * Operand(n-1): M0 - LDS size.
759 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
760 *
761 */
762 struct DS_instruction : public Instruction {
763 int16_t offset0;
764 int8_t offset1;
765 bool gds;
766 };
767
768 /**
769 * Vector Memory Untyped-buffer Instructions
770 * Operand(0): VADDR - Address source. Can carry an index and/or offset
771 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
772 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
773 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
774 *
775 */
776 struct MUBUF_instruction : public Instruction {
777 unsigned offset; /* Unsigned byte offset - 12 bit */
778 bool offen; /* Supply an offset from VGPR (VADDR) */
779 bool idxen; /* Supply an index from VGPR (VADDR) */
780 bool glc; /* globally coherent */
781 bool dlc; /* NAVI: device level coherent */
782 bool slc; /* system level coherent */
783 bool tfe; /* texture fail enable */
784 bool lds; /* Return read-data to LDS instead of VGPRs */
785 bool disable_wqm; /* Require an exec mask without helper invocations */
786 bool can_reorder;
787 barrier_interaction barrier;
788 };
789
790 /**
791 * Vector Memory Typed-buffer Instructions
792 * Operand(0): VADDR - Address source. Can carry an index and/or offset
793 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
794 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
795 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
796 *
797 */
798 struct MTBUF_instruction : public Instruction {
799 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
800 uint8_t nfmt : 3; /* Numeric format of data in memory */
801 unsigned offset; /* Unsigned byte offset - 12 bit */
802 bool offen; /* Supply an offset from VGPR (VADDR) */
803 bool idxen; /* Supply an index from VGPR (VADDR) */
804 bool glc; /* globally coherent */
805 bool dlc; /* NAVI: device level coherent */
806 bool slc; /* system level coherent */
807 bool tfe; /* texture fail enable */
808 bool disable_wqm; /* Require an exec mask without helper invocations */
809 bool can_reorder;
810 barrier_interaction barrier;
811 };
812
813 /**
814 * Vector Memory Image Instructions
815 * Operand(0): VADDR - Address source. Can carry an offset or an index.
816 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
817 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
818 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
819 *
820 */
821 struct MIMG_instruction : public Instruction {
822 unsigned dmask; /* Data VGPR enable mask */
823 unsigned dim; /* NAVI: dimensionality */
824 bool unrm; /* Force address to be un-normalized */
825 bool dlc; /* NAVI: device level coherent */
826 bool glc; /* globally coherent */
827 bool slc; /* system level coherent */
828 bool tfe; /* texture fail enable */
829 bool da; /* declare an array */
830 bool lwe; /* LOD warning enable */
831 bool r128; /* NAVI: Texture resource size */
832 bool a16; /* VEGA, NAVI: Address components are 16-bits */
833 bool d16; /* Convert 32-bit data to 16-bit data */
834 bool disable_wqm; /* Require an exec mask without helper invocations */
835 bool can_reorder;
836 barrier_interaction barrier;
837 };
838
839 /**
840 * Flat/Scratch/Global Instructions
841 * Operand(0): ADDR
842 * Operand(1): SADDR
843 * Operand(2) / Definition(0): DATA/VDST
844 *
845 */
846 struct FLAT_instruction : public Instruction {
847 uint16_t offset; /* Vega/Navi only */
848 bool slc; /* system level coherent */
849 bool glc; /* globally coherent */
850 bool dlc; /* NAVI: device level coherent */
851 bool lds;
852 bool nv;
853 bool disable_wqm; /* Require an exec mask without helper invocations */
854 bool can_reorder;
855 barrier_interaction barrier;
856 };
857
858 struct Export_instruction : public Instruction {
859 unsigned enabled_mask;
860 unsigned dest;
861 bool compressed;
862 bool done;
863 bool valid_mask;
864 };
865
866 struct Pseudo_instruction : public Instruction {
867 bool tmp_in_scc;
868 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
869 };
870
871 struct Pseudo_branch_instruction : public Instruction {
872 /* target[0] is the block index of the branch target.
873 * For conditional branches, target[1] contains the fall-through alternative.
874 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
875 */
876 uint32_t target[2];
877 };
878
879 struct Pseudo_barrier_instruction : public Instruction {
880 };
881
882 enum ReduceOp {
883 iadd32, iadd64,
884 imul32, imul64,
885 fadd32, fadd64,
886 fmul32, fmul64,
887 imin32, imin64,
888 imax32, imax64,
889 umin32, umin64,
890 umax32, umax64,
891 fmin32, fmin64,
892 fmax32, fmax64,
893 iand32, iand64,
894 ior32, ior64,
895 ixor32, ixor64,
896 gfx10_wave64_bpermute
897 };
898
899 /**
900 * Subgroup Reduction Instructions. Everything except the data to be
901 * reduced and the result is inserted by setup_reduce_temp().
902 * Operand(0): data to be reduced
903 * Operand(1): reduce temporary
904 * Operand(2): vector temporary
905 * Definition(0): result
906 * Definition(1): scalar temporary
907 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
908 * Definition(3): scc clobber
909 * Definition(4): vcc clobber
910 *
911 */
912 struct Pseudo_reduction_instruction : public Instruction {
913 ReduceOp reduce_op;
914 unsigned cluster_size; // must be 0 for scans
915 };
916
917 struct instr_deleter_functor {
918 void operator()(void* p) {
919 free(p);
920 }
921 };
922
923 template<typename T>
924 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
925
926 template<typename T>
927 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
928 {
929 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
930 char *data = (char*) calloc(1, size);
931 T* inst = (T*) data;
932
933 inst->opcode = opcode;
934 inst->format = format;
935
936 inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
937 inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);
938
939 return inst;
940 }
941
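/* Allocation sketch for create_instruction(): the operand and definition spans live in
 * the same calloc'd block directly behind the instruction, so the caller only fills
 * them in. The opcode, temps and SSA id below are placeholders for illustration. */
inline aco_ptr<Instruction> create_v_add_example(Temp src0, Temp src1, uint32_t dst_id)
{
   aco_ptr<Instruction> add{create_instruction<VOP2_instruction>(
      aco_opcode::v_add_f32, Format::VOP2, 2, 1)};
   add->operands[0] = Operand(src0);
   add->operands[1] = Operand(src1);
   add->definitions[0] = Definition(dst_id, v1); /* id would come from Program::allocateId() */
   return add;
}
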
942 constexpr bool Instruction::usesModifiers() const noexcept
943 {
944 if (isDPP() || isSDWA())
945 return true;
946 if (!isVOP3())
947 return false;
948 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
949 for (unsigned i = 0; i < operands.size(); i++) {
950 if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
951 return true;
952 }
953 return vop3->opsel[3] || vop3->clamp || vop3->omod;
954 }
955
956 constexpr bool is_phi(Instruction* instr)
957 {
958 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
959 }
960
961 static inline bool is_phi(aco_ptr<Instruction>& instr)
962 {
963 return is_phi(instr.get());
964 }
965
966 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
967 {
968 switch (instr->format) {
969 case Format::SMEM:
970 return static_cast<SMEM_instruction*>(instr)->barrier;
971 case Format::MUBUF:
972 return static_cast<MUBUF_instruction*>(instr)->barrier;
973 case Format::MIMG:
974 return static_cast<MIMG_instruction*>(instr)->barrier;
975 case Format::FLAT:
976 case Format::GLOBAL:
977 case Format::SCRATCH:
978 return static_cast<FLAT_instruction*>(instr)->barrier;
979 case Format::DS:
980 return barrier_shared;
981 default:
982 return barrier_none;
983 }
984 }
985
986 enum block_kind {
987 /* uniform indicates that leaving this block,
988 * all active lanes stay active */
989 block_kind_uniform = 1 << 0,
990 block_kind_top_level = 1 << 1,
991 block_kind_loop_preheader = 1 << 2,
992 block_kind_loop_header = 1 << 3,
993 block_kind_loop_exit = 1 << 4,
994 block_kind_continue = 1 << 5,
995 block_kind_break = 1 << 6,
996 block_kind_continue_or_break = 1 << 7,
997 block_kind_discard = 1 << 8,
998 block_kind_branch = 1 << 9,
999 block_kind_merge = 1 << 10,
1000 block_kind_invert = 1 << 11,
1001 block_kind_uses_discard_if = 1 << 12,
1002 block_kind_needs_lowering = 1 << 13,
1003 block_kind_uses_demote = 1 << 14,
1004 };
1005
1006
1007 struct RegisterDemand {
1008 constexpr RegisterDemand() = default;
1009 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1010 : vgpr{v}, sgpr{s} {}
1011 int16_t vgpr = 0;
1012 int16_t sgpr = 0;
1013
1014 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1015 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1016 }
1017
1018 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1019 return vgpr > other.vgpr || sgpr > other.sgpr;
1020 }
1021
1022 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1023 if (t.type() == RegType::sgpr)
1024 return RegisterDemand( vgpr, sgpr + t.size() );
1025 else
1026 return RegisterDemand( vgpr + t.size(), sgpr );
1027 }
1028
1029 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1030 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1031 }
1032
1033 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1034 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1035 }
1036
1037 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1038 vgpr += other.vgpr;
1039 sgpr += other.sgpr;
1040 return *this;
1041 }
1042
1043 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1044 vgpr -= other.vgpr;
1045 sgpr -= other.sgpr;
1046 return *this;
1047 }
1048
1049 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1050 if (t.type() == RegType::sgpr)
1051 sgpr += t.size();
1052 else
1053 vgpr += t.size();
1054 return *this;
1055 }
1056
1057 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1058 if (t.type() == RegType::sgpr)
1059 sgpr -= t.size();
1060 else
1061 vgpr -= t.size();
1062 return *this;
1063 }
1064
1065 constexpr void update(const RegisterDemand other) noexcept {
1066 vgpr = std::max(vgpr, other.vgpr);
1067 sgpr = std::max(sgpr, other.sgpr);
1068 }
1069
1070 };
1071
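/* Sketch of per-instruction demand bookkeeping (the helper name is illustrative):
 * definitions add their size to the matching bank, killed operands subtract it. */
inline bool register_demand_example()
{
   RegisterDemand demand(4, 2);   /* 4 VGPRs and 2 SGPRs currently live */
   demand += Temp(1, v2);         /* a new v2 definition                */
   demand -= Temp(2, s1);         /* last use of an s1 temporary        */
   return demand == RegisterDemand(6, 1) && !demand.exceeds(RegisterDemand(6, 1));
}
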
1072 /* CFG */
1073 struct Block {
1074 float_mode fp_mode;
1075 unsigned index;
1076 unsigned offset = 0;
1077 std::vector<aco_ptr<Instruction>> instructions;
1078 std::vector<unsigned> logical_preds;
1079 std::vector<unsigned> linear_preds;
1080 std::vector<unsigned> logical_succs;
1081 std::vector<unsigned> linear_succs;
1082 RegisterDemand register_demand = RegisterDemand();
1083 uint16_t loop_nest_depth = 0;
1084 uint16_t kind = 0;
1085 int logical_idom = -1;
1086 int linear_idom = -1;
1087 Temp live_out_exec = Temp();
1088
1089 /* this information is needed for predecessors to blocks with phis when
1090 * moving out of ssa */
1091 bool scc_live_out = false;
1092 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1093
1094 Block(unsigned idx) : index(idx) {}
1095 Block() : index(0) {}
1096 };
1097
1098 using Stage = uint16_t;
1099
1100 /* software stages */
1101 static constexpr Stage sw_vs = 1 << 0;
1102 static constexpr Stage sw_gs = 1 << 1;
1103 static constexpr Stage sw_tcs = 1 << 2;
1104 static constexpr Stage sw_tes = 1 << 3;
1105 static constexpr Stage sw_fs = 1 << 4;
1106 static constexpr Stage sw_cs = 1 << 5;
1107 static constexpr Stage sw_mask = 0x3f;
1108
1109 /* hardware stages (at most one can be set; hw_mask is just for convenience when testing several at once) */
1110 static constexpr Stage hw_vs = 1 << 6;
1111 static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1112 static constexpr Stage hw_gs = 1 << 8;
1113 static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1114 static constexpr Stage hw_hs = 1 << 10;
1115 static constexpr Stage hw_fs = 1 << 11;
1116 static constexpr Stage hw_cs = 1 << 12;
1117 static constexpr Stage hw_mask = 0x7f << 6;
1118
1119 /* possible settings of Program::stage */
1120 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1121 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1122 static constexpr Stage compute_cs = sw_cs | hw_cs;
1123 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1124 /* GFX10/NGG */
1125 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1126 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1127 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1128 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1129 /* GFX9 (and GFX10 if NGG isn't used) */
1130 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1131 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1132 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1133 /* pre-GFX9 */
1134 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1135 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1136 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1137 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1138 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1139
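/* Illustrative checks: a merged stage carries exactly one hardware stage bit plus the
 * bits of every software stage it contains, so both kinds of queries work on Program::stage. */
static_assert((vertex_geometry_gs & sw_vs) != 0 && (vertex_geometry_gs & sw_gs) != 0, "");
static_assert((vertex_geometry_gs & hw_mask) == hw_gs, "");
static_assert((vertex_ls & hw_mask) == hw_ls && (vertex_ls & sw_mask) == sw_vs, "");
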
1140 class Program final {
1141 public:
1142 float_mode next_fp_mode;
1143 std::vector<Block> blocks;
1144 RegisterDemand max_reg_demand = RegisterDemand();
1145 uint16_t num_waves = 0;
1146 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1147 ac_shader_config* config;
1148 struct radv_shader_info *info;
1149 enum chip_class chip_class;
1150 enum radeon_family family;
1151 unsigned wave_size;
1152 RegClass lane_mask;
1153 Stage stage; /* Stage */
1154 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1155 bool needs_wqm = false; /* there exists a p_wqm instruction */
1156 bool wb_smem_l1_on_end = false;
1157
1158 std::vector<uint8_t> constant_data;
1159 Temp private_segment_buffer;
1160 Temp scratch_offset;
1161
1162 uint16_t lds_alloc_granule;
1163 uint32_t lds_limit; /* in bytes */
1164 uint16_t vgpr_limit;
1165 uint16_t sgpr_limit;
1166 uint16_t physical_sgprs;
1167 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1168
1169 bool needs_vcc = false;
1170 bool needs_xnack_mask = false;
1171 bool needs_flat_scr = false;
1172
1173 uint32_t allocateId()
1174 {
1175 assert(allocationID <= 16777215);
1176 return allocationID++;
1177 }
1178
1179 uint32_t peekAllocationId()
1180 {
1181 return allocationID;
1182 }
1183
1184 void setAllocationId(uint32_t id)
1185 {
1186 allocationID = id;
1187 }
1188
1189 Block* create_and_insert_block() {
1190 blocks.emplace_back(blocks.size());
1191 blocks.back().fp_mode = next_fp_mode;
1192 return &blocks.back();
1193 }
1194
1195 Block* insert_block(Block&& block) {
1196 block.index = blocks.size();
1197 block.fp_mode = next_fp_mode;
1198 blocks.emplace_back(std::move(block));
1199 return &blocks.back();
1200 }
1201
1202 private:
1203 uint32_t allocationID = 1;
1204 };
1205
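/* Sketch of how CFG edges are recorded (the helper name is illustrative): predecessors
 * and successors are stored as block indices on both endpoints, separately for the
 * logical and the linear CFG. */
inline void add_linear_edge_example(Program* program, unsigned pred_idx, unsigned succ_idx)
{
   program->blocks[pred_idx].linear_succs.push_back(succ_idx);
   program->blocks[succ_idx].linear_preds.push_back(pred_idx);
}
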
1206 struct live {
1207 /* live temps out per block */
1208 std::vector<std::set<Temp>> live_out;
1209 /* register demand (sgpr/vgpr) per instruction per block */
1210 std::vector<std::vector<RegisterDemand>> register_demand;
1211 };
1212
1213 void select_program(Program *program,
1214 unsigned shader_count,
1215 struct nir_shader *const *shaders,
1216 ac_shader_config* config,
1217 struct radv_shader_args *args);
1218
1219 void lower_wqm(Program* program, live& live_vars,
1220 const struct radv_nir_compiler_options *options);
1221 void lower_bool_phis(Program* program);
1222 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1223 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1224 std::vector<uint16_t> dead_code_analysis(Program *program);
1225 void dominator_tree(Program* program);
1226 void insert_exec_mask(Program *program);
1227 void value_numbering(Program* program);
1228 void optimize(Program* program);
1229 void setup_reduce_temp(Program* program);
1230 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1231 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1232 void ssa_elimination(Program* program);
1233 void lower_to_hw_instr(Program* program);
1234 void schedule_program(Program* program, live& live_vars);
1235 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1236 void insert_wait_states(Program* program);
1237 void insert_NOPs(Program* program);
1238 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1239 void print_asm(Program *program, std::vector<uint32_t>& binary,
1240 unsigned exec_size, std::ostream& out);
1241 void validate(Program* program, FILE *output);
1242 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1243 #ifndef NDEBUG
1244 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1245 #else
1246 #define perfwarn(cond, msg, ...)
1247 #endif
1248
1249 void aco_print_instr(Instruction *instr, FILE *output);
1250 void aco_print_program(Program *program, FILE *output);
1251
1252 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1253 uint16_t get_extra_sgprs(Program *program);
1254
1255 /* get the number of sgprs that must be allocated to address the given number of sgprs */
1256 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1257
1258 /* return number of addressable SGPRs for max_waves */
1259 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1260
1261 typedef struct {
1262 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1263 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1264 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1265 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1266 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1267 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1268 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1269 } Info;
1270
1271 extern const Info instr_info;
1272
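/* Lookup sketch (the helper name is illustrative): per-opcode properties are indexed
 * with the opcode value, e.g. the format table or the per-generation encoding tables. */
inline Format instr_format_example(aco_opcode op)
{
   return instr_info.format[static_cast<int>(op)];
}
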
1273 }
1274
1275 #endif /* ACO_IR_H */
1276