aco: Implement 64-bit constant propagation.
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
/* Debug option word. Only declared here; the definition lives in another
 * translation unit. */
extern uint64_t debug_flags;

/* Individual debug option bits (each enumerator is a distinct single bit).
 * NOTE(review): presumably these are tested against debug_flags -- confirm
 * at the use sites. */
enum {
   DEBUG_VALIDATE    = 1 << 0,
   DEBUG_VALIDATE_RA = 1 << 1,
   DEBUG_PERFWARN    = 1 << 2,
};
52
/**
 * Microcode encoding format of an instruction.
 *
 * Values below 1 << 8 are plain enumeration values (one format each).
 * Values from 1 << 8 upwards are single bits, so that some Vector ALU
 * formats can be OR'd together:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO           = 0,

   /* Scalar ALU & Control Formats */
   SOP1             = 1,
   SOP2             = 2,
   SOPK             = 3,
   SOPP             = 4,
   SOPC             = 5,

   /* Scalar Memory Format */
   SMEM             = 6,

   /* LDS/GDS Format (note: the value 7 is not used) */
   DS               = 8,

   /* Vector Memory Buffer Formats */
   MTBUF            = 9,
   MUBUF            = 10,

   /* Vector Memory Image Format */
   MIMG             = 11,

   /* Export Format */
   EXP              = 12,

   /* Flat Formats */
   FLAT             = 13,
   GLOBAL           = 14,
   SCRATCH          = 15,

   /* Further pseudo formats */
   PSEUDO_BRANCH    = 16,
   PSEUDO_BARRIER   = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats (bit flags) */
   VOP1             = 0x0100,
   VOP2             = 0x0200,
   VOPC             = 0x0400,
   VOP3             = 0x0800,
   VOP3A            = VOP3, /* VOP3, VOP3A and VOP3B share the same bit */
   VOP3B            = VOP3,
   VOP3P            = 0x1000,

   /* Vector Parameter Interpolation Format */
   VINTRP           = 0x2000,

   /* Encoding modifiers that are combined with VOP1/VOP2/VOPC */
   DPP              = 0x4000,
   SDWA             = 0x8000,
};
104
/* Bit flags naming the kinds of memory access a barrier interacts with.
 * The flags can be OR'd; barrier_none means "no interaction". */
enum barrier_interaction : uint8_t {
   barrier_none   = 0,
   barrier_buffer = 1 << 0,
   barrier_image  = 1 << 1,
   barrier_atomic = 1 << 2,
   barrier_shared = 1 << 3,
   /* number of distinct flag bits defined above (not itself a flag) */
   barrier_count  = 4,
};
113
/* Floating-point rounding mode values (2 bits each when stored in float_mode).
 * NOTE(review): the suffixes presumably stand for nearest-even, +infinity,
 * -infinity and toward-zero -- confirm against the hardware documentation. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
120
/* Denormal handling mode values (2 bits each when stored in float_mode). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals,
    * regardless of this setting. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep  = 0x3,
};
127
128 struct float_mode {
129 /* matches encoding of the MODE register */
130 union {
131 struct {
132 fp_round round32:2;
133 fp_round round16_64:2;
134 unsigned denorm32:2;
135 unsigned denorm16_64:2;
136 };
137 uint8_t val = 0;
138 };
139 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
140 bool preserve_signed_zero_inf_nan32:1;
141 bool preserve_signed_zero_inf_nan16_64:1;
142 /* if false, optimizations which may remove denormal flushing can be done */
143 bool must_flush_denorms32:1;
144 bool must_flush_denorms16_64:1;
145 bool care_about_round32:1;
146 bool care_about_round16_64:1;
147
148 /* Returns true if instructions using the mode "other" can safely use the
149 * current one instead. */
150 bool canReplace(float_mode other) const noexcept {
151 return val == other.val &&
152 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
153 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
154 (must_flush_denorms32 || !other.must_flush_denorms32) &&
155 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
156 (care_about_round32 || !other.care_about_round32) &&
157 (care_about_round16_64 || !other.care_about_round16_64);
158 }
159 };
160
161 constexpr Format asVOP3(Format format) {
162 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
163 };
164
/* Kind of register file a register class belongs to. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
171
172 struct RegClass {
173
174 enum RC : uint8_t {
175 s1 = 1,
176 s2 = 2,
177 s3 = 3,
178 s4 = 4,
179 s6 = 6,
180 s8 = 8,
181 s16 = 16,
182 v1 = s1 | (1 << 5),
183 v2 = s2 | (1 << 5),
184 v3 = s3 | (1 << 5),
185 v4 = s4 | (1 << 5),
186 v5 = 5 | (1 << 5),
187 v6 = 6 | (1 << 5),
188 v7 = 7 | (1 << 5),
189 v8 = 8 | (1 << 5),
190 /* these are used for WWM and spills to vgpr */
191 v1_linear = v1 | (1 << 6),
192 v2_linear = v2 | (1 << 6),
193 };
194
195 RegClass() = default;
196 constexpr RegClass(RC rc)
197 : rc(rc) {}
198 constexpr RegClass(RegType type, unsigned size)
199 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
200
201 constexpr operator RC() const { return rc; }
202 explicit operator bool() = delete;
203
204 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
205 constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
206 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
207 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
208
209 private:
210 RC rc;
211 };
212
213 /* transitional helper expressions */
214 static constexpr RegClass s1{RegClass::s1};
215 static constexpr RegClass s2{RegClass::s2};
216 static constexpr RegClass s3{RegClass::s3};
217 static constexpr RegClass s4{RegClass::s4};
218 static constexpr RegClass s8{RegClass::s8};
219 static constexpr RegClass s16{RegClass::s16};
220 static constexpr RegClass v1{RegClass::v1};
221 static constexpr RegClass v2{RegClass::v2};
222 static constexpr RegClass v3{RegClass::v3};
223 static constexpr RegClass v4{RegClass::v4};
224 static constexpr RegClass v5{RegClass::v5};
225 static constexpr RegClass v6{RegClass::v6};
226 static constexpr RegClass v7{RegClass::v7};
227 static constexpr RegClass v8{RegClass::v8};
228
229 /**
230 * Temp Class
231 * Each temporary virtual register has a
232 * register class (i.e. size and type)
233 * and SSA id.
234 */
235 struct Temp {
236 Temp() = default;
237 constexpr Temp(uint32_t id, RegClass cls) noexcept
238 : id_(id), reg_class(cls) {}
239
240 constexpr uint32_t id() const noexcept { return id_; }
241 constexpr RegClass regClass() const noexcept { return reg_class; }
242
243 constexpr unsigned size() const noexcept { return reg_class.size(); }
244 constexpr RegType type() const noexcept { return reg_class.type(); }
245 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
246
247 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
248 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
249 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
250
251 private:
252 uint32_t id_:24;
253 RegClass reg_class;
254 };
255
/**
 * PhysReg
 * Represents the physical register for each Operand and Definition.
 * Thin wrapper around a 16-bit register number: construction from an
 * integer is explicit, conversion back to unsigned is implicit.
 */
struct PhysReg {
   constexpr PhysReg() = default;

   /* the number is stored in 16 bits; larger values are truncated */
   explicit constexpr PhysReg(unsigned r)
      : reg(static_cast<uint16_t>(r)) {}

   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};
268
269 /* helper expressions for special registers */
270 static constexpr PhysReg m0{124};
271 static constexpr PhysReg vcc{106};
272 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
273 static constexpr PhysReg exec{126};
274 static constexpr PhysReg exec_lo{126};
275 static constexpr PhysReg exec_hi{127};
276 static constexpr PhysReg scc{253};
277
278 /**
279 * Operand Class
280 * Initially, each Operand refers to either
281 * a temporary virtual register
282 * or to a constant value
283 * Temporary registers get mapped to physical register during RA
284 * Constant values are inlined into the instruction sequence.
285 */
286 class Operand final
287 {
288 public:
289 constexpr Operand()
290 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
291 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
292
293 explicit Operand(Temp r) noexcept
294 {
295 data_.temp = r;
296 if (r.id()) {
297 isTemp_ = true;
298 } else {
299 isUndef_ = true;
300 setFixed(PhysReg{128});
301 }
302 };
303 explicit Operand(uint32_t v, bool is64bit = false) noexcept
304 {
305 data_.i = v;
306 isConstant_ = true;
307 is64BitConst_ = is64bit;
308 if (v <= 64)
309 setFixed(PhysReg{128 + v});
310 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
311 setFixed(PhysReg{192 - v});
312 else if (v == 0x3f000000) /* 0.5 */
313 setFixed(PhysReg{240});
314 else if (v == 0xbf000000) /* -0.5 */
315 setFixed(PhysReg{241});
316 else if (v == 0x3f800000) /* 1.0 */
317 setFixed(PhysReg{242});
318 else if (v == 0xbf800000) /* -1.0 */
319 setFixed(PhysReg{243});
320 else if (v == 0x40000000) /* 2.0 */
321 setFixed(PhysReg{244});
322 else if (v == 0xc0000000) /* -2.0 */
323 setFixed(PhysReg{245});
324 else if (v == 0x40800000) /* 4.0 */
325 setFixed(PhysReg{246});
326 else if (v == 0xc0800000) /* -4.0 */
327 setFixed(PhysReg{247});
328 else { /* Literal Constant */
329 assert(!is64bit && "attempt to create a 64-bit literal constant");
330 setFixed(PhysReg{255});
331 }
332 };
333 explicit Operand(uint64_t v) noexcept
334 {
335 isConstant_ = true;
336 is64BitConst_ = true;
337 if (v <= 64) {
338 data_.i = (uint32_t) v;
339 setFixed(PhysReg{128 + (uint32_t) v});
340 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
341 data_.i = (uint32_t) v;
342 setFixed(PhysReg{192 - (uint32_t) v});
343 } else if (v == 0x3FE0000000000000) { /* 0.5 */
344 data_.i = 0x3f000000;
345 setFixed(PhysReg{240});
346 } else if (v == 0xBFE0000000000000) { /* -0.5 */
347 data_.i = 0xbf000000;
348 setFixed(PhysReg{241});
349 } else if (v == 0x3FF0000000000000) { /* 1.0 */
350 data_.i = 0x3f800000;
351 setFixed(PhysReg{242});
352 } else if (v == 0xBFF0000000000000) { /* -1.0 */
353 data_.i = 0xbf800000;
354 setFixed(PhysReg{243});
355 } else if (v == 0x4000000000000000) { /* 2.0 */
356 data_.i = 0x40000000;
357 setFixed(PhysReg{244});
358 } else if (v == 0xC000000000000000) { /* -2.0 */
359 data_.i = 0xc0000000;
360 setFixed(PhysReg{245});
361 } else if (v == 0x4010000000000000) { /* 4.0 */
362 data_.i = 0x40800000;
363 setFixed(PhysReg{246});
364 } else if (v == 0xC010000000000000) { /* -4.0 */
365 data_.i = 0xc0800000;
366 setFixed(PhysReg{247});
367 } else { /* Literal Constant: we don't know if it is a long or double.*/
368 isConstant_ = 0;
369 assert(false && "attempt to create a 64-bit literal constant");
370 }
371 };
372 explicit Operand(RegClass type) noexcept
373 {
374 isUndef_ = true;
375 data_.temp = Temp(0, type);
376 setFixed(PhysReg{128});
377 };
378 explicit Operand(PhysReg reg, RegClass type) noexcept
379 {
380 data_.temp = Temp(0, type);
381 setFixed(reg);
382 }
383
384 constexpr bool isTemp() const noexcept
385 {
386 return isTemp_;
387 }
388
389 constexpr void setTemp(Temp t) noexcept {
390 assert(!isConstant_);
391 isTemp_ = true;
392 data_.temp = t;
393 }
394
395 constexpr Temp getTemp() const noexcept
396 {
397 return data_.temp;
398 }
399
400 constexpr uint32_t tempId() const noexcept
401 {
402 return data_.temp.id();
403 }
404
405 constexpr bool hasRegClass() const noexcept
406 {
407 return isTemp() || isUndefined();
408 }
409
410 constexpr RegClass regClass() const noexcept
411 {
412 return data_.temp.regClass();
413 }
414
415 constexpr unsigned size() const noexcept
416 {
417 if (isConstant())
418 return is64BitConst_ ? 2 : 1;
419 else
420 return data_.temp.size();
421 }
422
423 constexpr bool isFixed() const noexcept
424 {
425 return isFixed_;
426 }
427
428 constexpr PhysReg physReg() const noexcept
429 {
430 return reg_;
431 }
432
433 constexpr void setFixed(PhysReg reg) noexcept
434 {
435 isFixed_ = reg != unsigned(-1);
436 reg_ = reg;
437 }
438
439 constexpr bool isConstant() const noexcept
440 {
441 return isConstant_;
442 }
443
444 constexpr bool isLiteral() const noexcept
445 {
446 return isConstant() && reg_ == 255;
447 }
448
449 constexpr bool isUndefined() const noexcept
450 {
451 return isUndef_;
452 }
453
454 constexpr uint32_t constantValue() const noexcept
455 {
456 return data_.i;
457 }
458
459 constexpr bool constantEquals(uint32_t cmp) const noexcept
460 {
461 return isConstant() && constantValue() == cmp;
462 }
463
464 constexpr void setKill(bool flag) noexcept
465 {
466 isKill_ = flag;
467 if (!flag)
468 setFirstKill(false);
469 }
470
471 constexpr bool isKill() const noexcept
472 {
473 return isKill_ || isFirstKill();
474 }
475
476 constexpr void setFirstKill(bool flag) noexcept
477 {
478 isFirstKill_ = flag;
479 if (flag)
480 setKill(flag);
481 }
482
483 /* When there are multiple operands killing the same temporary,
484 * isFirstKill() is only returns true for the first one. */
485 constexpr bool isFirstKill() const noexcept
486 {
487 return isFirstKill_;
488 }
489
490 private:
491 union {
492 uint32_t i;
493 float f;
494 Temp temp = Temp(0, s1);
495 } data_;
496 PhysReg reg_;
497 union {
498 struct {
499 uint8_t isTemp_:1;
500 uint8_t isFixed_:1;
501 uint8_t isConstant_:1;
502 uint8_t isKill_:1;
503 uint8_t isUndef_:1;
504 uint8_t isFirstKill_:1;
505 uint8_t is64BitConst_:1;
506 };
507 /* can't initialize bit-fields in c++11, so work around using a union */
508 uint8_t control_ = 0;
509 };
510 };
511
512 /**
513 * Definition Class
514 * Definitions are the results of Instructions
515 * and refer to temporary virtual registers
516 * which are later mapped to physical registers
517 */
518 class Definition final
519 {
520 public:
521 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
522 Definition(uint32_t index, RegClass type) noexcept
523 : temp(index, type) {}
524 explicit Definition(Temp tmp) noexcept
525 : temp(tmp) {}
526 Definition(PhysReg reg, RegClass type) noexcept
527 : temp(Temp(0, type))
528 {
529 setFixed(reg);
530 }
531 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
532 : temp(Temp(tmpId, type))
533 {
534 setFixed(reg);
535 }
536
537 constexpr bool isTemp() const noexcept
538 {
539 return tempId() > 0;
540 }
541
542 constexpr Temp getTemp() const noexcept
543 {
544 return temp;
545 }
546
547 constexpr uint32_t tempId() const noexcept
548 {
549 return temp.id();
550 }
551
552 constexpr void setTemp(Temp t) noexcept {
553 temp = t;
554 }
555
556 constexpr RegClass regClass() const noexcept
557 {
558 return temp.regClass();
559 }
560
561 constexpr unsigned size() const noexcept
562 {
563 return temp.size();
564 }
565
566 constexpr bool isFixed() const noexcept
567 {
568 return isFixed_;
569 }
570
571 constexpr PhysReg physReg() const noexcept
572 {
573 return reg_;
574 }
575
576 constexpr void setFixed(PhysReg reg) noexcept
577 {
578 isFixed_ = 1;
579 reg_ = reg;
580 }
581
582 constexpr void setHint(PhysReg reg) noexcept
583 {
584 hasHint_ = 1;
585 reg_ = reg;
586 }
587
588 constexpr bool hasHint() const noexcept
589 {
590 return hasHint_;
591 }
592
593 constexpr void setKill(bool flag) noexcept
594 {
595 isKill_ = flag;
596 }
597
598 constexpr bool isKill() const noexcept
599 {
600 return isKill_;
601 }
602
603 private:
604 Temp temp = Temp(0, s1);
605 PhysReg reg_;
606 union {
607 struct {
608 uint8_t isFixed_:1;
609 uint8_t hasHint_:1;
610 uint8_t isKill_:1;
611 };
612 /* can't initialize bit-fields in c++11, so work around using a union */
613 uint8_t control_ = 0;
614 };
615 };
616
/* Forward declaration; uses the same class-key as the definition of Block
 * further down in this file (which is a struct) to avoid mismatched tags. */
struct Block;
618
619 struct Instruction {
620 aco_opcode opcode;
621 Format format;
622 uint32_t pass_flags;
623
624 aco::span<Operand> operands;
625 aco::span<Definition> definitions;
626
627 constexpr bool isVALU() const noexcept
628 {
629 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
630 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
631 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
632 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
633 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
634 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
635 }
636
637 constexpr bool isSALU() const noexcept
638 {
639 return format == Format::SOP1 ||
640 format == Format::SOP2 ||
641 format == Format::SOPC ||
642 format == Format::SOPK ||
643 format == Format::SOPP;
644 }
645
646 constexpr bool isVMEM() const noexcept
647 {
648 return format == Format::MTBUF ||
649 format == Format::MUBUF ||
650 format == Format::MIMG;
651 }
652
653 constexpr bool isDPP() const noexcept
654 {
655 return (uint16_t) format & (uint16_t) Format::DPP;
656 }
657
658 constexpr bool isVOP3() const noexcept
659 {
660 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
661 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
662 format == Format::VOP3P;
663 }
664
665 constexpr bool isSDWA() const noexcept
666 {
667 return (uint16_t) format & (uint16_t) Format::SDWA;
668 }
669
670 constexpr bool isFlatOrGlobal() const noexcept
671 {
672 return format == Format::FLAT || format == Format::GLOBAL;
673 }
674
675 constexpr bool usesModifiers() const noexcept;
676
677 constexpr bool reads_exec() const noexcept
678 {
679 for (const Operand& op : operands) {
680 if (op.isFixed() && op.physReg() == exec)
681 return true;
682 }
683 return false;
684 }
685 };
686
687 struct SOPK_instruction : public Instruction {
688 uint16_t imm;
689 };
690
691 struct SOPP_instruction : public Instruction {
692 uint32_t imm;
693 int block;
694 };
695
696 struct SOPC_instruction : public Instruction {
697 };
698
699 struct SOP1_instruction : public Instruction {
700 };
701
702 struct SOP2_instruction : public Instruction {
703 };
704
705 /**
706 * Scalar Memory Format:
707 * For s_(buffer_)load_dword*:
708 * Operand(0): SBASE - SGPR-pair which provides base address
709 * Operand(1): Offset - immediate (un)signed offset or SGPR
710 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
711 * Operand(n-1): SOffset - SGPR offset (Vega only)
712 *
713 * Having no operands is also valid for instructions such as s_dcache_inv.
714 *
715 */
716 struct SMEM_instruction : public Instruction {
717 bool glc : 1; /* VI+: globally coherent */
718 bool dlc : 1; /* NAVI: device level coherent */
719 bool nv : 1; /* VEGA only: Non-volatile */
720 bool can_reorder : 1;
721 bool disable_wqm : 1;
722 barrier_interaction barrier;
723 };
724
725 struct VOP1_instruction : public Instruction {
726 };
727
728 struct VOP2_instruction : public Instruction {
729 };
730
731 struct VOPC_instruction : public Instruction {
732 };
733
734 struct VOP3A_instruction : public Instruction {
735 bool abs[3];
736 bool neg[3];
737 uint8_t opsel : 4;
738 uint8_t omod : 2;
739 bool clamp : 1;
740 };
741
742 /**
743 * Data Parallel Primitives Format:
744 * This format can be used for VOP1, VOP2 or VOPC instructions.
745 * The swizzle applies to the src0 operand.
746 *
747 */
748 struct DPP_instruction : public Instruction {
749 bool abs[2];
750 bool neg[2];
751 uint16_t dpp_ctrl;
752 uint8_t row_mask : 4;
753 uint8_t bank_mask : 4;
754 bool bound_ctrl : 1;
755 };
756
757 struct Interp_instruction : public Instruction {
758 uint8_t attribute;
759 uint8_t component;
760 };
761
762 /**
763 * Local and Global Data Sharing instructions
764 * Operand(0): ADDR - VGPR which supplies the address.
765 * Operand(1): DATA0 - First data VGPR.
766 * Operand(2): DATA1 - Second data VGPR.
767 * Operand(n-1): M0 - LDS size.
768 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
769 *
770 */
771 struct DS_instruction : public Instruction {
772 int16_t offset0;
773 int8_t offset1;
774 bool gds;
775 };
776
777 /**
778 * Vector Memory Untyped-buffer Instructions
779 * Operand(0): VADDR - Address source. Can carry an index and/or offset
780 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
781 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
782 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
783 *
784 */
785 struct MUBUF_instruction : public Instruction {
786 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
787 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
788 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
789 bool glc : 1; /* globally coherent */
790 bool dlc : 1; /* NAVI: device level coherent */
791 bool slc : 1; /* system level coherent */
792 bool tfe : 1; /* texture fail enable */
793 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
794 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
795 bool can_reorder : 1;
796 barrier_interaction barrier;
797 };
798
799 /**
800 * Vector Memory Typed-buffer Instructions
801 * Operand(0): VADDR - Address source. Can carry an index and/or offset
802 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
803 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
804 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
805 *
806 */
807 struct MTBUF_instruction : public Instruction {
808 uint16_t offset; /* Unsigned byte offset - 12 bit */
809 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
810 uint8_t nfmt : 3; /* Numeric format of data in memory */
811 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
812 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
813 bool glc : 1; /* globally coherent */
814 bool dlc : 1; /* NAVI: device level coherent */
815 bool slc : 1; /* system level coherent */
816 bool tfe : 1; /* texture fail enable */
817 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
818 bool can_reorder : 1;
819 barrier_interaction barrier;
820 };
821
822 /**
823 * Vector Memory Image Instructions
824 * Operand(0): VADDR - Address source. Can carry an offset or an index.
825 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
826 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
827 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
828 *
829 */
830 struct MIMG_instruction : public Instruction {
831 uint8_t dmask; /* Data VGPR enable mask */
832 uint8_t dim : 3; /* NAVI: dimensionality */
833 bool unrm : 1; /* Force address to be un-normalized */
834 bool dlc : 1; /* NAVI: device level coherent */
835 bool glc : 1; /* globally coherent */
836 bool slc : 1; /* system level coherent */
837 bool tfe : 1; /* texture fail enable */
838 bool da : 1; /* declare an array */
839 bool lwe : 1; /* Force data to be un-normalized */
840 bool r128 : 1; /* NAVI: Texture resource size */
841 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
842 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
843 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
844 bool can_reorder : 1;
845 barrier_interaction barrier;
846 };
847
848 /**
849 * Flat/Scratch/Global Instructions
850 * Operand(0): ADDR
851 * Operand(1): SADDR
852 * Operand(2) / Definition(0): DATA/VDST
853 *
854 */
855 struct FLAT_instruction : public Instruction {
856 uint16_t offset; /* Vega/Navi only */
857 bool slc : 1; /* system level coherent */
858 bool glc : 1; /* globally coherent */
859 bool dlc : 1; /* NAVI: device level coherent */
860 bool lds : 1;
861 bool nv : 1;
862 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
863 bool can_reorder : 1;
864 barrier_interaction barrier;
865 };
866
867 struct Export_instruction : public Instruction {
868 uint8_t enabled_mask;
869 uint8_t dest;
870 bool compressed : 1;
871 bool done : 1;
872 bool valid_mask : 1;
873 };
874
875 struct Pseudo_instruction : public Instruction {
876 bool tmp_in_scc;
877 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
878 };
879
880 struct Pseudo_branch_instruction : public Instruction {
881 /* target[0] is the block index of the branch target.
882 * For conditional branches, target[1] contains the fall-through alternative.
883 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
884 */
885 uint32_t target[2];
886 };
887
888 struct Pseudo_barrier_instruction : public Instruction {
889 };
890
/* Operations of a Pseudo_reduction_instruction. Each operation comes as a
 * 32-bit/64-bit pair with consecutive values, the 32-bit variant first. */
enum ReduceOp {
   iadd32,
   iadd64,
   imul32,
   imul64,
   fadd32,
   fadd64,
   fmul32,
   fmul64,
   imin32,
   imin64,
   imax32,
   imax64,
   umin32,
   umin64,
   umax32,
   umax64,
   fmin32,
   fmin64,
   fmax32,
   fmax64,
   iand32,
   iand64,
   ior32,
   ior64,
   ixor32,
   ixor64,
   /* not part of a 32/64 pair */
   gfx10_wave64_bpermute
};
907
908 /**
909 * Subgroup Reduction Instructions, everything except for the data to be
910 * reduced and the result as inserted by setup_reduce_temp().
911 * Operand(0): data to be reduced
912 * Operand(1): reduce temporary
913 * Operand(2): vector temporary
914 * Definition(0): result
915 * Definition(1): scalar temporary
916 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
917 * Definition(3): scc clobber
918 * Definition(4): vcc clobber
919 *
920 */
921 struct Pseudo_reduction_instruction : public Instruction {
922 ReduceOp reduce_op;
923 unsigned cluster_size; // must be 0 for scans
924 };
925
/* Deleter that releases memory with free(); needed because
 * create_instruction() allocates instructions with calloc(). */
struct instr_deleter_functor {
   void operator()(void* mem) {
      free(mem);
   }
};
931
932 template<typename T>
933 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
934
935 template<typename T>
936 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
937 {
938 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
939 char *data = (char*) calloc(1, size);
940 T* inst = (T*) data;
941
942 inst->opcode = opcode;
943 inst->format = format;
944
945 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
946 inst->operands = aco::span<Operand>(operands_offset, num_operands);
947 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
948 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
949
950 return inst;
951 }
952
953 constexpr bool Instruction::usesModifiers() const noexcept
954 {
955 if (isDPP() || isSDWA())
956 return true;
957 if (!isVOP3())
958 return false;
959 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
960 for (unsigned i = 0; i < operands.size(); i++) {
961 if (vop3->abs[i] || vop3->neg[i])
962 return true;
963 }
964 return vop3->opsel || vop3->clamp || vop3->omod;
965 }
966
967 constexpr bool is_phi(Instruction* instr)
968 {
969 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
970 }
971
972 static inline bool is_phi(aco_ptr<Instruction>& instr)
973 {
974 return is_phi(instr.get());
975 }
976
977 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
978 {
979 switch (instr->format) {
980 case Format::SMEM:
981 return static_cast<SMEM_instruction*>(instr)->barrier;
982 case Format::MUBUF:
983 return static_cast<MUBUF_instruction*>(instr)->barrier;
984 case Format::MIMG:
985 return static_cast<MIMG_instruction*>(instr)->barrier;
986 case Format::FLAT:
987 case Format::GLOBAL:
988 case Format::SCRATCH:
989 return static_cast<FLAT_instruction*>(instr)->barrier;
990 case Format::DS:
991 return barrier_shared;
992 default:
993 return barrier_none;
994 }
995 }
996
997 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
998
/* Bit flags describing the role(s) of a block in the control flow graph.
 * Several flags can be combined in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform           = 0x0001,
   block_kind_top_level         = 0x0002,
   block_kind_loop_preheader    = 0x0004,
   block_kind_loop_header       = 0x0008,
   block_kind_loop_exit         = 0x0010,
   block_kind_continue          = 0x0020,
   block_kind_break             = 0x0040,
   block_kind_continue_or_break = 0x0080,
   block_kind_discard           = 0x0100,
   block_kind_branch            = 0x0200,
   block_kind_merge             = 0x0400,
   block_kind_invert            = 0x0800,
   block_kind_uses_discard_if   = 0x1000,
   block_kind_needs_lowering    = 0x2000,
   block_kind_uses_demote       = 0x4000,
};
1018
1019
1020 struct RegisterDemand {
1021 constexpr RegisterDemand() = default;
1022 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1023 : vgpr{v}, sgpr{s} {}
1024 int16_t vgpr = 0;
1025 int16_t sgpr = 0;
1026
1027 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1028 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1029 }
1030
1031 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1032 return vgpr > other.vgpr || sgpr > other.sgpr;
1033 }
1034
1035 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1036 if (t.type() == RegType::sgpr)
1037 return RegisterDemand( vgpr, sgpr + t.size() );
1038 else
1039 return RegisterDemand( vgpr + t.size(), sgpr );
1040 }
1041
1042 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1043 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1044 }
1045
1046 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1047 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1048 }
1049
1050 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1051 vgpr += other.vgpr;
1052 sgpr += other.sgpr;
1053 return *this;
1054 }
1055
1056 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1057 vgpr -= other.vgpr;
1058 sgpr -= other.sgpr;
1059 return *this;
1060 }
1061
1062 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1063 if (t.type() == RegType::sgpr)
1064 sgpr += t.size();
1065 else
1066 vgpr += t.size();
1067 return *this;
1068 }
1069
1070 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1071 if (t.type() == RegType::sgpr)
1072 sgpr -= t.size();
1073 else
1074 vgpr -= t.size();
1075 return *this;
1076 }
1077
1078 constexpr void update(const RegisterDemand other) noexcept {
1079 vgpr = std::max(vgpr, other.vgpr);
1080 sgpr = std::max(sgpr, other.sgpr);
1081 }
1082
1083 };
1084
1085 /* CFG */
1086 struct Block {
1087 float_mode fp_mode;
1088 unsigned index;
1089 unsigned offset = 0;
1090 std::vector<aco_ptr<Instruction>> instructions;
1091 std::vector<unsigned> logical_preds;
1092 std::vector<unsigned> linear_preds;
1093 std::vector<unsigned> logical_succs;
1094 std::vector<unsigned> linear_succs;
1095 RegisterDemand register_demand = RegisterDemand();
1096 uint16_t loop_nest_depth = 0;
1097 uint16_t kind = 0;
1098 int logical_idom = -1;
1099 int linear_idom = -1;
1100 Temp live_out_exec = Temp();
1101
1102 /* this information is needed for predecessors to blocks with phis when
1103 * moving out of ssa */
1104 bool scc_live_out = false;
1105 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1106
1107 Block(unsigned idx) : index(idx) {}
1108 Block() : index(0) {}
1109 };
1110
/* A Stage is a bit set: the low six bits name the software (API) shader stage(s)
 * contained in the program, the bits above them name the single hardware stage
 * the program runs as. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
/* A shader that runs before a separate geometry shader uses the ES hardware stage,
 * exactly like vertex_es above; it must not claim the GS hardware stage itself. */
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1152
1153 class Program final {
1154 public:
1155 float_mode next_fp_mode;
1156 std::vector<Block> blocks;
1157 RegisterDemand max_reg_demand = RegisterDemand();
1158 uint16_t num_waves = 0;
1159 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1160 ac_shader_config* config;
1161 struct radv_shader_info *info;
1162 enum chip_class chip_class;
1163 enum radeon_family family;
1164 unsigned wave_size;
1165 RegClass lane_mask;
1166 Stage stage; /* Stage */
1167 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1168 bool needs_wqm = false; /* there exists a p_wqm instruction */
1169 bool wb_smem_l1_on_end = false;
1170
1171 std::vector<uint8_t> constant_data;
1172 Temp private_segment_buffer;
1173 Temp scratch_offset;
1174
1175 uint16_t min_waves = 0;
1176 uint16_t lds_alloc_granule;
1177 uint32_t lds_limit; /* in bytes */
1178 uint16_t vgpr_limit;
1179 uint16_t sgpr_limit;
1180 uint16_t physical_sgprs;
1181 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1182 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1183
1184 bool needs_vcc = false;
1185 bool needs_xnack_mask = false;
1186 bool needs_flat_scr = false;
1187
1188 uint32_t allocateId()
1189 {
1190 assert(allocationID <= 16777215);
1191 return allocationID++;
1192 }
1193
1194 uint32_t peekAllocationId()
1195 {
1196 return allocationID;
1197 }
1198
1199 void setAllocationId(uint32_t id)
1200 {
1201 allocationID = id;
1202 }
1203
1204 Block* create_and_insert_block() {
1205 blocks.emplace_back(blocks.size());
1206 blocks.back().fp_mode = next_fp_mode;
1207 return &blocks.back();
1208 }
1209
1210 Block* insert_block(Block&& block) {
1211 block.index = blocks.size();
1212 block.fp_mode = next_fp_mode;
1213 blocks.emplace_back(std::move(block));
1214 return &blocks.back();
1215 }
1216
1217 private:
1218 uint32_t allocationID = 1;
1219 };
1220
1221 struct live {
1222 /* live temps out per block */
1223 std::vector<std::set<Temp>> live_out;
1224 /* register demand (sgpr/vgpr) per instruction per block */
1225 std::vector<std::vector<RegisterDemand>> register_demand;
1226 };
1227
1228 void select_program(Program *program,
1229 unsigned shader_count,
1230 struct nir_shader *const *shaders,
1231 ac_shader_config* config,
1232 struct radv_shader_args *args);
1233
1234 void lower_wqm(Program* program, live& live_vars,
1235 const struct radv_nir_compiler_options *options);
1236 void lower_bool_phis(Program* program);
1237 void calc_min_waves(Program* program);
1238 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1239 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1240 std::vector<uint16_t> dead_code_analysis(Program *program);
1241 void dominator_tree(Program* program);
1242 void insert_exec_mask(Program *program);
1243 void value_numbering(Program* program);
1244 void optimize(Program* program);
1245 void setup_reduce_temp(Program* program);
1246 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1247 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1248 void ssa_elimination(Program* program);
1249 void lower_to_hw_instr(Program* program);
1250 void schedule_program(Program* program, live& live_vars);
1251 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1252 void insert_wait_states(Program* program);
1253 void insert_NOPs(Program* program);
1254 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1255 void print_asm(Program *program, std::vector<uint32_t>& binary,
1256 unsigned exec_size, std::ostream& out);
1257 void validate(Program* program, FILE *output);
1258 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1259 #ifndef NDEBUG
1260 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1261 #else
1262 #define perfwarn(program, cond, msg, ...) do {} while(0)
1263 #endif
1264
1265 void aco_print_instr(Instruction *instr, FILE *output);
1266 void aco_print_program(Program *program, FILE *output);
1267
1268 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1269 uint16_t get_extra_sgprs(Program *program);
1270
1271 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1272 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1273 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1274
1275 /* return number of addressable sgprs/vgprs for max_waves */
1276 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1277 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1278
1279 typedef struct {
1280 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1281 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1282 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1283 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1284 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1285 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1286 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1287 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1288 } Info;
1289
1290 extern const Info instr_info;
1291
1292 }
1293
1294 #endif /* ACO_IR_H */
1295