aco: implement VK_KHR_shader_float_controls
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_info;
41
42 namespace aco {
43
44 extern uint64_t debug_flags;
45
/* Bit values tested against the global debug_flags bitmask above.
 * (Names suggest: IR validation, RA validation, perf warnings — confirm
 * against the code that parses the debug environment variable.) */
enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
};
51
52 /**
53 * Representation of the instruction's microcode encoding format
54 * Note: Some Vector ALU Formats can be combined, such that:
55 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
56 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
57 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
58 *
59 * (*) The same is applicable for VOP1 and VOPC instructions.
60 */
/* Encoding formats. Values < (1 << 8) are mutually exclusive base formats;
 * the Vector ALU formats occupy single high bits so they can be OR'd with
 * each other and with DPP/SDWA (see the comment block above). */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B deliberately alias VOP3: they share the encoding bit */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
103
/* Bitmask describing which memory classes a memory instruction interacts
 * with; used to constrain reordering (see get_barrier_interaction()).
 * barrier_count is the number of distinct bits above it. */
enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   barrier_count = 4,
};
112
/* FP rounding modes; the numeric values match the hardware MODE register
 * encoding (see float_mode below): nearest-even, +inf, -inf, toward zero. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
119
/* FP denormal handling; values match the 2-bit MODE register denorm fields
 * (see float_mode below). Only the two extremes are used here. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
126
/* Per-block floating-point state required by the shader (driven by
 * VK_KHR_shader_float_controls). The first byte mirrors the hardware MODE
 * register; the remaining flags restrict which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;      /* rounding mode for 32-bit FP ops */
         fp_round round16_64:2;   /* rounding mode for 16-/64-bit FP ops */
         unsigned denorm32:2;     /* fp_denorm value for 32-bit FP ops */
         unsigned denorm16_64:2;  /* fp_denorm value for 16-/64-bit FP ops */
      };
      uint8_t val = 0;            /* the four fields above as one byte, for cheap comparison */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   /* presumably: if false, optimizations that change rounding are allowed —
    * confirm against the passes that read these flags */
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE byte must match exactly, and this mode
    * must be at least as strict as "other" for every requirement flag. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
159
160 constexpr Format asVOP3(Format format) {
161 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
162 };
163
/* Register file of a temporary. linear_vgpr marks VGPRs with linear (rather
 * than logical) live-ranges — used for WWM and VGPR spills (see RegClass). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
170
/* Register class of a temporary: register file + size in dwords, packed into
 * one byte. Bit layout: bits 0-4 = size (dwords), bit 5 = vgpr, bit 6 =
 * linear live-range. SGPR classes always count as linear (see is_linear()). */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* all sgpr classes compare <= s16; anything above has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
211
212 /* transitional helper expressions */
213 static constexpr RegClass s1{RegClass::s1};
214 static constexpr RegClass s2{RegClass::s2};
215 static constexpr RegClass s3{RegClass::s3};
216 static constexpr RegClass s4{RegClass::s4};
217 static constexpr RegClass s8{RegClass::s8};
218 static constexpr RegClass s16{RegClass::s16};
219 static constexpr RegClass v1{RegClass::v1};
220 static constexpr RegClass v2{RegClass::v2};
221 static constexpr RegClass v3{RegClass::v3};
222 static constexpr RegClass v4{RegClass::v4};
223 static constexpr RegClass v5{RegClass::v5};
224 static constexpr RegClass v6{RegClass::v6};
225 static constexpr RegClass v7{RegClass::v7};
226 static constexpr RegClass v8{RegClass::v8};
227
228 /**
229 * Temp Class
230 * Each temporary virtual register has a
231 * register class (i.e. size and type)
232 * and SSA id.
233 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   /* comparisons are by SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_:24;      /* SSA id; 24 bits keeps Temp small (4 bytes on common ABIs) */
   RegClass reg_class;   /* one byte, see RegClass */
};
254
255 /**
256 * PhysReg
257 * Represents the physical register for each
258 * Operand and Definition.
259 */
/* Thin wrapper around a 16-bit physical register number; implicitly
 * convertible to unsigned so it can be compared/used as an index. */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg(r) {}
   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};
267
/* helper expressions for special registers
 * (numbers follow the hardware scalar-operand encoding) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};   /* same number as exec: low half on wave64 */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};
276
277 /**
278 * Operand Class
279 * Initially, each Operand refers to either
280 * a temporary virtual register
281 * or to a constant value
282 * Temporary registers get mapped to physical register during RA
283 * Constant values are inlined into the instruction sequence.
284 */
class Operand final
{
public:
   /* Default: an undefined operand, fixed to PhysReg 128 (the hardware
    * encoding of inline constant 0, used throughout as the "no register"
    * placeholder). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}

   /* Operand referencing a temporary; id 0 means undefined. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant operand. The fixed PhysReg mirrors the hardware inline
    * constant encoding: 128+v for integers 0..64, 193..208 for -1..-16
    * (the unsigned wrap-around of 192-v yields exactly that range),
    * 240..248 for the supported float constants, and 255 for a literal that
    * must be emitted as an extra dword. */
   explicit Operand(uint32_t v) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3e22f983) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else /* Literal Constant */
         setFixed(PhysReg{255});
   };
   /* 64-bit constant operand; only values with an inline encoding are
    * representable (the double-precision bit patterns below). Anything else
    * asserts: 64-bit literals cannot be encoded. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + (uint32_t) v});
      else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - (uint32_t) v});
      else if (v == 0x3FE0000000000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xBFE0000000000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3FF0000000000000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xBFF0000000000000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x4000000000000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xC000000000000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x4010000000000000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xC010000000000000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand of a given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand fixed to a specific physical register (no temporary). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* Undefined operands still carry a RegClass (see the RegClass ctor). */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   /* Size in dwords: constants are 1 or 2 dwords, otherwise the temp's size. */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* NOTE(review): PhysReg holds a uint16_t, so `reg != unsigned(-1)` is
    * always true and isFixed_ is always set here — confirm this is intended
    * or whether a sentinel check was meant. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* A literal is a constant without an inline encoding (PhysReg 255 above);
    * it costs an extra instruction dword. */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Kill flags (set by liveness analysis): clearing kill also clears
    * firstKill; setting firstKill also sets kill. */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);  /* also aliases the constant value */
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
501
502 /**
503 * Definition Class
504 * Definitions are the results of Instructions
505 * and refer to temporary virtual registers
506 * which are later mapped to physical registers
507 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   /* Definition of a fresh temporary with SSA id `index`. */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* Definition fixed to a physical register, with no temporary (id 0). */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   /* Definition of temporary `tmpId`, already fixed to `reg`. */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 is reserved for "no temporary" (see the PhysReg ctor above) */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* A hint is a preferred (not mandatory) register for RA; note it shares
    * reg_ with setFixed(), so a later setFixed() overwrites the hint. */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   /* Kill flag (set by liveness analysis). */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
606
607 class Block;
608
/* Base of all instructions. The operand/definition spans point into trailing
 * storage allocated together with the instruction (see create_instruction()).
 * The is*() predicates test the Format bit-encoding, so e.g. a VOP2
 * instruction promoted to VOP3A still reports isVALU() and isVOP3(). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;  /* scratch bits whose meaning depends on the current pass — confirm per pass */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   /* SALU formats are plain enum values (< 1<<8), so exact compares suffice */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory only; FLAT/GLOBAL are separate (isFlatOrGlobal) */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   /* defined after VOP3A_instruction below */
   constexpr bool usesModifiers() const noexcept;

   /* only considers operands explicitly fixed to exec, not implicit reads */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
676
/* SOPK: scalar op with a 16-bit inline immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
};

/* SOPP: scalar program control (branches, waitcnt, sendmsg, ...). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;  /* branch target block index; presumably -1 when not a branch — confirm with emitter */
};

/* SOPC/SOP1/SOP2 carry no extra fields beyond the base Instruction. */
struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
694
695 /**
696 * Scalar Memory Format:
697 * For s_(buffer_)load_dword*:
698 * Operand(0): SBASE - SGPR-pair which provides base address
699 * Operand(1): Offset - immediate (un)signed offset or SGPR
700 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
701 * Operand(n-1): SOffset - SGPR offset (Vega only)
702 *
703 * Having no operands is also valid for instructions such as s_dcache_inv.
704 *
705 */
struct SMEM_instruction : public Instruction {
   bool glc; /* VI+: globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool nv; /* VEGA only: Non-volatile */
   bool can_reorder; /* whether this access may be reordered w.r.t. others — confirm with scheduler */
   bool disable_wqm; /* require an exec mask without helper invocations */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
714
/* VOP1/VOP2/VOPC carry no extra fields; modifiers live in the VOP3A/DPP/SDWA
 * variants of these encodings. */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};
723
/* VOP3A: VALU encoding with source modifiers (per-source abs/neg), opsel,
 * clamp and output modifier. opsel has 4 entries — presumably 3 sources plus
 * the destination (see usesModifiers()); confirm against the assembler. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];
   bool opsel[4];
   bool clamp;
   unsigned omod;
   bool neg[3];
};
731
732 /**
733 * Data Parallel Primitives Format:
734 * This format can be used for VOP1, VOP2 or VOPC instructions.
735 * The swizzle applies to the src0 operand.
736 *
737 */
struct DPP_instruction : public Instruction {
   uint16_t dpp_ctrl;  /* hardware DPP control word (swizzle selection) */
   uint8_t row_mask;   /* which rows participate */
   uint8_t bank_mask;  /* which banks participate */
   bool abs[2];        /* src0/src1 modifiers */
   bool neg[2];
   bool bound_ctrl;    /* behavior for lanes reading from inactive lanes */
};
746
/* VINTRP: parameter interpolation — selects the attribute and component. */
struct Interp_instruction : public Instruction {
   unsigned attribute;
   unsigned component;
};
751
752 /**
753 * Local and Global Data Sharing instructions
754 * Operand(0): ADDR - VGPR which supplies the address.
755 * Operand(1): DATA0 - First data VGPR.
756 * Operand(2): DATA1 - Second data VGPR.
757 * Operand(n-1): M0 - LDS size.
758 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
759 *
760 */
struct DS_instruction : public Instruction {
   int16_t offset0;
   /* NOTE(review): the hardware offset1 field is an unsigned 8-bit byte
    * offset; int8_t would sign-extend values >= 128 — confirm intended. */
   int8_t offset1;
   bool gds; /* operate on GDS instead of LDS */
};
766
767 /**
768 * Vector Memory Untyped-buffer Instructions
769 * Operand(0): VADDR - Address source. Can carry an index and/or offset
770 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
771 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
772 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
773 *
774 */
struct MUBUF_instruction : public Instruction {
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool lds; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* whether this access may be reordered w.r.t. others — confirm with scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
788
789 /**
790 * Vector Memory Typed-buffer Instructions
791 * Operand(0): VADDR - Address source. Can carry an index and/or offset
792 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
793 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
794 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
795 *
796 */
struct MTBUF_instruction : public Instruction {
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* whether this access may be reordered w.r.t. others — confirm with scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
811
812 /**
813 * Vector Memory Image Instructions
814 * Operand(0): VADDR - Address source. Can carry an offset or an index.
815 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
816 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
817 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
818 *
819 */
struct MIMG_instruction : public Instruction {
   unsigned dmask; /* Data VGPR enable mask */
   unsigned dim; /* NAVI: dimensionality */
   bool unrm; /* Force address to be un-normalized */
   bool dlc; /* NAVI: device level coherent */
   bool glc; /* globally coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool da; /* declare an array */
   /* NOTE(review): this comment duplicates unrm's; lwe is the hardware
    * "LOD warning enable" bit — verify against the ISA docs. */
   bool lwe; /* Force data to be un-normalized */
   bool r128; /* NAVI: Texture resource size */
   bool a16; /* VEGA, NAVI: Address components are 16-bits */
   bool d16; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* whether this access may be reordered w.r.t. others — confirm with scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
837
838 /**
839 * Flat/Scratch/Global Instructions
840 * Operand(0): ADDR
841 * Operand(1): SADDR
842 * Operand(2) / Definition(0): DATA/VDST
843 *
844 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega only */
   bool slc; /* system level coherent */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool lds; /* return read-data to LDS — confirm against ISA */
   bool nv;  /* non-volatile — confirm against ISA */
};
853
struct Export_instruction : public Instruction {
   unsigned enabled_mask; /* which of the 4 components are written */
   unsigned dest;         /* export target (MRT/position/parameter) */
   bool compressed;       /* two 16-bit values per component pair */
   bool done;             /* last export of the program for this target type */
   bool valid_mask;       /* export the exec mask — confirm against ISA */
};
861
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;       /* lowering must preserve/restore scc around this instruction */
   PhysReg scratch_sgpr;  /* might not be valid if it's not needed */
};
866
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
874
/* Barrier pseudo instruction; the opcode alone identifies the barrier kind. */
struct Pseudo_barrier_instruction : public Instruction {
};
877
/* Operations for Pseudo_reduction_instruction; each reduction exists in a
 * 32-bit and a 64-bit variant. gfx10_wave64_bpermute is not a reduction —
 * it reuses this machinery on GFX10 wave64 (confirm with its lowering). */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute
};
894
895 /**
896 * Subgroup Reduction Instructions, everything except for the data to be
897 * reduced and the result as inserted by setup_reduce_temp().
898 * Operand(0): data to be reduced
899 * Operand(1): reduce temporary
900 * Operand(2): vector temporary
901 * Definition(0): result
902 * Definition(1): scalar temporary
903 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
904 * Definition(3): scc clobber
905 * Definition(4): vcc clobber
906 *
907 */
/* See the operand/definition layout documented in the comment above. */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
912
/* Deleter for instructions: they come from create_instruction()'s calloc()
 * (one allocation holding the instruction plus its operand/definition
 * arrays), so they must be released with free(), never delete. */
struct instr_deleter_functor {
   void operator()(void* ptr) {
      free(ptr);
   }
};

/* Owning pointer to an instruction, using the free()-based deleter above. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
921
/* Allocates a zero-initialized instruction of derived type T with trailing
 * storage for its operands and definitions in a single calloc() block:
 *   [T][Operand x num_operands][Definition x num_definitions]
 * The spans point into that trailing storage, so the whole instruction is
 * released with one free() (see instr_deleter_functor). Presumably relies on
 * Operand/Definition being trivially destructible and on the alignment of T
 * being sufficient for Operand — confirm if new instruction types are added. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
   inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);

   return inst;
}
937
/* Whether this instruction carries any VALU modifier state (abs/neg/opsel/
 * clamp/omod, or a DPP/SDWA encoding). Only meaningful for VALU formats. */
constexpr bool Instruction::usesModifiers() const noexcept
{
   /* DPP and SDWA encodings always count as modified */
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      /* NOTE(review): abs/neg have 3 entries, opsel has 4 — this reads out
       * of bounds if a VOP3 instruction ever has > 3 operands; presumably
       * that never happens — confirm. */
      if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
         return true;
   }
   /* opsel[3] presumably covers the destination */
   return vop3->opsel[3] || vop3->clamp || vop3->omod;
}
951
/* True for both logical (p_phi) and linear (p_linear_phi) phis. */
constexpr bool is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

/* Convenience overload for owning pointers. */
static inline bool is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}
961
/* Which memory classes the instruction interacts with — presumably consumed
 * by passes that limit memory-access reordering (confirm callers). Formats
 * with an explicit barrier field return it; FLAT/GLOBAL conservatively count
 * as buffer accesses, and DS (including GDS) as shared memory. */
constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
      return barrier_buffer;
   case Format::DS:
      return barrier_shared;
   default:
      return barrier_none;
   }
}
980
/* Bitmask OR'd into Block::kind describing the block's role in the CFG. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};
1000
1001
1002 struct RegisterDemand {
1003 constexpr RegisterDemand() = default;
1004 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1005 : vgpr{v}, sgpr{s} {}
1006 int16_t vgpr = 0;
1007 int16_t sgpr = 0;
1008
1009 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1010 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1011 }
1012
1013 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1014 return vgpr > other.vgpr || sgpr > other.sgpr;
1015 }
1016
1017 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1018 if (t.type() == RegType::sgpr)
1019 return RegisterDemand( vgpr, sgpr + t.size() );
1020 else
1021 return RegisterDemand( vgpr + t.size(), sgpr );
1022 }
1023
1024 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1025 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1026 }
1027
1028 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1029 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1030 }
1031
1032 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1033 vgpr += other.vgpr;
1034 sgpr += other.sgpr;
1035 return *this;
1036 }
1037
1038 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1039 vgpr -= other.vgpr;
1040 sgpr -= other.sgpr;
1041 return *this;
1042 }
1043
1044 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1045 if (t.type() == RegType::sgpr)
1046 sgpr += t.size();
1047 else
1048 vgpr += t.size();
1049 return *this;
1050 }
1051
1052 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1053 if (t.type() == RegType::sgpr)
1054 sgpr -= t.size();
1055 else
1056 vgpr -= t.size();
1057 return *this;
1058 }
1059
1060 constexpr void update(const RegisterDemand other) noexcept {
1061 vgpr = std::max(vgpr, other.vgpr);
1062 sgpr = std::max(sgpr, other.sgpr);
1063 }
1064
1065 };
1066
1067 /* CFG */
/* A basic block of the control-flow graph. ACO tracks two CFGs: the logical
 * one (lanes/divergence) and the linear one (actual instruction order), each
 * with its own predecessor/successor lists and immediate dominator. */
struct Block {
   float_mode fp_mode;  /* FP mode in effect for instructions in this block */
   unsigned index;
   unsigned offset = 0; /* presumably the binary offset once assembled — confirm with emitter */
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();  /* peak demand within the block */
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;            /* bitmask of block_kind */
   int logical_idom = -1;        /* immediate dominator index, -1 if none */
   int linear_idom = -1;
   Temp live_out_exec = Temp();  /* exec mask temporary live at the block end */

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1092
/* A Stage is the OR of one or more software (API-level) stage bits and
 * exactly one hardware stage bit, describing how shaders are merged onto
 * hardware stages per generation. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;  /* all six sw bits */

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;  /* all seven hw bits */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1134
1135 class Program final {
1136 public:
1137 float_mode next_fp_mode;
1138 std::vector<Block> blocks;
1139 RegisterDemand max_reg_demand = RegisterDemand();
1140 uint16_t num_waves = 0;
1141 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1142 ac_shader_config* config;
1143 struct radv_shader_info *info;
1144 enum chip_class chip_class;
1145 enum radeon_family family;
1146 unsigned wave_size;
1147 Stage stage; /* Stage */
1148 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1149 bool needs_wqm = false; /* there exists a p_wqm instruction */
1150 bool wb_smem_l1_on_end = false;
1151
1152 std::vector<uint8_t> constant_data;
1153 Temp private_segment_buffer;
1154 Temp scratch_offset;
1155
1156 uint16_t lds_alloc_granule;
1157 uint32_t lds_limit; /* in bytes */
1158 uint16_t vgpr_limit;
1159 uint16_t sgpr_limit;
1160 uint16_t physical_sgprs;
1161 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1162
1163 bool needs_vcc = false;
1164 bool needs_xnack_mask = false;
1165 bool needs_flat_scr = false;
1166
1167 uint32_t allocateId()
1168 {
1169 assert(allocationID <= 16777215);
1170 return allocationID++;
1171 }
1172
1173 uint32_t peekAllocationId()
1174 {
1175 return allocationID;
1176 }
1177
1178 void setAllocationId(uint32_t id)
1179 {
1180 allocationID = id;
1181 }
1182
1183 Block* create_and_insert_block() {
1184 blocks.emplace_back(blocks.size());
1185 blocks.back().fp_mode = next_fp_mode;
1186 return &blocks.back();
1187 }
1188
1189 Block* insert_block(Block&& block) {
1190 block.index = blocks.size();
1191 block.fp_mode = next_fp_mode;
1192 blocks.emplace_back(std::move(block));
1193 return &blocks.back();
1194 }
1195
1196 private:
1197 uint32_t allocationID = 1;
1198 };
1199
/* Result of liveness analysis (produced by live_var_analysis), indexed by
 * Block::index. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1206
1207 void select_program(Program *program,
1208 unsigned shader_count,
1209 struct nir_shader *const *shaders,
1210 ac_shader_config* config,
1211 struct radv_shader_info *info,
1212 struct radv_nir_compiler_options *options);
1213
1214 void lower_wqm(Program* program, live& live_vars,
1215 const struct radv_nir_compiler_options *options);
1216 void lower_bool_phis(Program* program);
1217 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1218 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1219 std::vector<uint16_t> dead_code_analysis(Program *program);
1220 void dominator_tree(Program* program);
1221 void insert_exec_mask(Program *program);
1222 void value_numbering(Program* program);
1223 void optimize(Program* program);
1224 void setup_reduce_temp(Program* program);
1225 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1226 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1227 void ssa_elimination(Program* program);
1228 void lower_to_hw_instr(Program* program);
1229 void schedule_program(Program* program, live& live_vars);
1230 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1231 void insert_wait_states(Program* program);
1232 void insert_NOPs(Program* program);
1233 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1234 void print_asm(Program *program, std::vector<uint32_t>& binary,
1235 unsigned exec_size, std::ostream& out);
1236 void validate(Program* program, FILE *output);
1237 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1238 #ifndef NDEBUG
1239 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1240 #else
1241 #define perfwarn(program, cond, msg, ...)
1242 #endif
1243
1244 void aco_print_instr(Instruction *instr, FILE *output);
1245 void aco_print_program(Program *program, FILE *output);
1246
1247 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1248 uint16_t get_extra_sgprs(Program *program);
1249
1250 /* get number of sgprs allocated required to address a number of sgprs */
1251 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1252
1253 /* return number of addressable SGPRs for max_waves */
1254 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1255
/* Static per-opcode information tables, indexed by aco_opcode. Filled in by
 * generated code elsewhere; accessed through the single instance instr_info. */
typedef struct {
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)]; /* hardware encoding of each opcode on GFX9 — presumably a sentinel marks "unsupported"; confirm where the table is generated */
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)]; /* hardware encoding of each opcode on GFX10 */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers; /* set bit = opcode accepts VALU input modifiers (abs/neg) */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers; /* set bit = opcode accepts VALU output modifiers (omod/clamp) */
   const char *name[static_cast<int>(aco_opcode::num_opcodes)]; /* printable mnemonic per opcode */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; /* encoding format per opcode */
} Info;

extern const Info instr_info; /* the single global instance of the tables above */
1266
1267 }
1268
1269 #endif /* ACO_IR_H */
1270