aco: don't enable store_global for helper invocations
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
/* Bit flags for the global debug_flags bitmask declared above. */
enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2, /* validate register assignment */
   DEBUG_PERFWARN = 0x4,
};
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
62 enum class Format : std::uint16_t {
63 /* Pseudo Instruction Format */
64 PSEUDO = 0,
65 /* Scalar ALU & Control Formats */
66 SOP1 = 1,
67 SOP2 = 2,
68 SOPK = 3,
69 SOPP = 4,
70 SOPC = 5,
71 /* Scalar Memory Format */
72 SMEM = 6,
73 /* LDS/GDS Format */
74 DS = 8,
75 /* Vector Memory Buffer Formats */
76 MTBUF = 9,
77 MUBUF = 10,
78 /* Vector Memory Image Format */
79 MIMG = 11,
80 /* Export Format */
81 EXP = 12,
82 /* Flat Formats */
83 FLAT = 13,
84 GLOBAL = 14,
85 SCRATCH = 15,
86
87 PSEUDO_BRANCH = 16,
88 PSEUDO_BARRIER = 17,
89 PSEUDO_REDUCTION = 18,
90
91 /* Vector ALU Formats */
92 VOP1 = 1 << 8,
93 VOP2 = 1 << 9,
94 VOPC = 1 << 10,
95 VOP3 = 1 << 11,
96 VOP3A = 1 << 11,
97 VOP3B = 1 << 11,
98 VOP3P = 1 << 12,
99 /* Vector Parameter Interpolation Format */
100 VINTRP = 1 << 13,
101 DPP = 1 << 14,
102 SDWA = 1 << 15,
103 };
104
/* Bitmask describing which kinds of memory an instruction interacts with;
 * used by instruction scheduling (see get_barrier_interaction()). */
enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   barrier_count = 4, /* number of flag bits above, not a flag itself */
};
113
/* Rounding modes, matching the encoding of the MODE register's round bits. */
enum fp_round {
   fp_round_ne = 0, /* nearest even */
   fp_round_pi = 1, /* toward +infinity */
   fp_round_ni = 2, /* toward -infinity */
   fp_round_tz = 3, /* toward zero */
};
120
/* Denormal handling, matching the encoding of the MODE register's denorm bits. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
127
/* Floating-point state: the hardware MODE register bits plus constraints that
 * restrict which FP optimizations the compiler may perform. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0; /* all 8 MODE bits at once, for fast comparison */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the hardware MODE bits must match exactly and this
    * mode must be at least as strict in every software constraint. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
160
161 constexpr Format asVOP3(Format format) {
162 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
163 };
164
/* Register file a temporary lives in. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr, /* used for WWM and spills to vgpr (see RegClass) */
};
171
/* Register class: type and size (in dwords) of a temporary.
 * Encoding: bits [4:0] = size in dwords, bit 5 = vgpr, bit 6 = linear vgpr.
 * Scalar classes (rc <= s16) are always considered linear. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete; /* avoid accidental truthiness tests */

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
212
213 /* transitional helper expressions */
214 static constexpr RegClass s1{RegClass::s1};
215 static constexpr RegClass s2{RegClass::s2};
216 static constexpr RegClass s3{RegClass::s3};
217 static constexpr RegClass s4{RegClass::s4};
218 static constexpr RegClass s8{RegClass::s8};
219 static constexpr RegClass s16{RegClass::s16};
220 static constexpr RegClass v1{RegClass::v1};
221 static constexpr RegClass v2{RegClass::v2};
222 static constexpr RegClass v3{RegClass::v3};
223 static constexpr RegClass v4{RegClass::v4};
224 static constexpr RegClass v5{RegClass::v5};
225 static constexpr RegClass v6{RegClass::v6};
226 static constexpr RegClass v7{RegClass::v7};
227 static constexpr RegClass v8{RegClass::v8};
228
229 /**
230 * Temp Class
231 * Each temporary virtual register has a
232 * register class (i.e. size and type)
233 * and SSA id.
234 */
235 struct Temp {
236 Temp() = default;
237 constexpr Temp(uint32_t id, RegClass cls) noexcept
238 : id_(id), reg_class(cls) {}
239
240 constexpr uint32_t id() const noexcept { return id_; }
241 constexpr RegClass regClass() const noexcept { return reg_class; }
242
243 constexpr unsigned size() const noexcept { return reg_class.size(); }
244 constexpr RegType type() const noexcept { return reg_class.type(); }
245 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
246
247 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
248 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
249 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
250
251 private:
252 uint32_t id_:24;
253 RegClass reg_class;
254 };
255
256 /**
257 * PhysReg
258 * Represents the physical register for each
259 * Operand and Definition.
260 */
261 struct PhysReg {
262 constexpr PhysReg() = default;
263 explicit constexpr PhysReg(unsigned r) : reg(r) {}
264 constexpr operator unsigned() const { return reg; }
265
266 uint16_t reg = 0;
267 };
268
/* helper expressions for special registers; the numbers are the scalar
 * operand encodings of these architectural registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};
277
278 /**
279 * Operand Class
280 * Initially, each Operand refers to either
281 * a temporary virtual register
282 * or to a constant value
283 * Temporary registers get mapped to physical register during RA
284 * Constant values are inlined into the instruction sequence.
285 */
286 class Operand final
287 {
288 public:
289 constexpr Operand()
290 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
291 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
292
293 explicit Operand(Temp r) noexcept
294 {
295 data_.temp = r;
296 if (r.id()) {
297 isTemp_ = true;
298 } else {
299 isUndef_ = true;
300 setFixed(PhysReg{128});
301 }
302 };
303 explicit Operand(uint32_t v) noexcept
304 {
305 data_.i = v;
306 isConstant_ = true;
307 if (v <= 64)
308 setFixed(PhysReg{128 + v});
309 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
310 setFixed(PhysReg{192 - v});
311 else if (v == 0x3f000000) /* 0.5 */
312 setFixed(PhysReg{240});
313 else if (v == 0xbf000000) /* -0.5 */
314 setFixed(PhysReg{241});
315 else if (v == 0x3f800000) /* 1.0 */
316 setFixed(PhysReg{242});
317 else if (v == 0xbf800000) /* -1.0 */
318 setFixed(PhysReg{243});
319 else if (v == 0x40000000) /* 2.0 */
320 setFixed(PhysReg{244});
321 else if (v == 0xc0000000) /* -2.0 */
322 setFixed(PhysReg{245});
323 else if (v == 0x40800000) /* 4.0 */
324 setFixed(PhysReg{246});
325 else if (v == 0xc0800000) /* -4.0 */
326 setFixed(PhysReg{247});
327 else if (v == 0x3e22f983) /* 1/(2*PI) */
328 setFixed(PhysReg{248});
329 else /* Literal Constant */
330 setFixed(PhysReg{255});
331 };
332 explicit Operand(uint64_t v) noexcept
333 {
334 isConstant_ = true;
335 is64BitConst_ = true;
336 if (v <= 64)
337 setFixed(PhysReg{128 + (uint32_t) v});
338 else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
339 setFixed(PhysReg{192 - (uint32_t) v});
340 else if (v == 0x3FE0000000000000) /* 0.5 */
341 setFixed(PhysReg{240});
342 else if (v == 0xBFE0000000000000) /* -0.5 */
343 setFixed(PhysReg{241});
344 else if (v == 0x3FF0000000000000) /* 1.0 */
345 setFixed(PhysReg{242});
346 else if (v == 0xBFF0000000000000) /* -1.0 */
347 setFixed(PhysReg{243});
348 else if (v == 0x4000000000000000) /* 2.0 */
349 setFixed(PhysReg{244});
350 else if (v == 0xC000000000000000) /* -2.0 */
351 setFixed(PhysReg{245});
352 else if (v == 0x4010000000000000) /* 4.0 */
353 setFixed(PhysReg{246});
354 else if (v == 0xC010000000000000) /* -4.0 */
355 setFixed(PhysReg{247});
356 else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
357 setFixed(PhysReg{248});
358 else { /* Literal Constant: we don't know if it is a long or double.*/
359 isConstant_ = 0;
360 assert(false && "attempt to create a 64-bit literal constant");
361 }
362 };
363 explicit Operand(RegClass type) noexcept
364 {
365 isUndef_ = true;
366 data_.temp = Temp(0, type);
367 setFixed(PhysReg{128});
368 };
369 explicit Operand(PhysReg reg, RegClass type) noexcept
370 {
371 data_.temp = Temp(0, type);
372 setFixed(reg);
373 }
374
375 constexpr bool isTemp() const noexcept
376 {
377 return isTemp_;
378 }
379
380 constexpr void setTemp(Temp t) noexcept {
381 assert(!isConstant_);
382 isTemp_ = true;
383 data_.temp = t;
384 }
385
386 constexpr Temp getTemp() const noexcept
387 {
388 return data_.temp;
389 }
390
391 constexpr uint32_t tempId() const noexcept
392 {
393 return data_.temp.id();
394 }
395
396 constexpr bool hasRegClass() const noexcept
397 {
398 return isTemp() || isUndefined();
399 }
400
401 constexpr RegClass regClass() const noexcept
402 {
403 return data_.temp.regClass();
404 }
405
406 constexpr unsigned size() const noexcept
407 {
408 if (isConstant())
409 return is64BitConst_ ? 2 : 1;
410 else
411 return data_.temp.size();
412 }
413
414 constexpr bool isFixed() const noexcept
415 {
416 return isFixed_;
417 }
418
419 constexpr PhysReg physReg() const noexcept
420 {
421 return reg_;
422 }
423
424 constexpr void setFixed(PhysReg reg) noexcept
425 {
426 isFixed_ = reg != unsigned(-1);
427 reg_ = reg;
428 }
429
430 constexpr bool isConstant() const noexcept
431 {
432 return isConstant_;
433 }
434
435 constexpr bool isLiteral() const noexcept
436 {
437 return isConstant() && reg_ == 255;
438 }
439
440 constexpr bool isUndefined() const noexcept
441 {
442 return isUndef_;
443 }
444
445 constexpr uint32_t constantValue() const noexcept
446 {
447 return data_.i;
448 }
449
450 constexpr bool constantEquals(uint32_t cmp) const noexcept
451 {
452 return isConstant() && constantValue() == cmp;
453 }
454
455 constexpr void setKill(bool flag) noexcept
456 {
457 isKill_ = flag;
458 if (!flag)
459 setFirstKill(false);
460 }
461
462 constexpr bool isKill() const noexcept
463 {
464 return isKill_ || isFirstKill();
465 }
466
467 constexpr void setFirstKill(bool flag) noexcept
468 {
469 isFirstKill_ = flag;
470 if (flag)
471 setKill(flag);
472 }
473
474 /* When there are multiple operands killing the same temporary,
475 * isFirstKill() is only returns true for the first one. */
476 constexpr bool isFirstKill() const noexcept
477 {
478 return isFirstKill_;
479 }
480
481 private:
482 union {
483 uint32_t i;
484 float f;
485 Temp temp = Temp(0, s1);
486 } data_;
487 PhysReg reg_;
488 union {
489 struct {
490 uint8_t isTemp_:1;
491 uint8_t isFixed_:1;
492 uint8_t isConstant_:1;
493 uint8_t isKill_:1;
494 uint8_t isUndef_:1;
495 uint8_t isFirstKill_:1;
496 uint8_t is64BitConst_:1;
497 };
498 /* can't initialize bit-fields in c++11, so work around using a union */
499 uint8_t control_ = 0;
500 };
501 };
502
503 /**
504 * Definition Class
505 * Definitions are the results of Instructions
506 * and refer to temporary virtual registers
507 * which are later mapped to physical registers
508 */
509 class Definition final
510 {
511 public:
512 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
513 Definition(uint32_t index, RegClass type) noexcept
514 : temp(index, type) {}
515 explicit Definition(Temp tmp) noexcept
516 : temp(tmp) {}
517 Definition(PhysReg reg, RegClass type) noexcept
518 : temp(Temp(0, type))
519 {
520 setFixed(reg);
521 }
522 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
523 : temp(Temp(tmpId, type))
524 {
525 setFixed(reg);
526 }
527
528 constexpr bool isTemp() const noexcept
529 {
530 return tempId() > 0;
531 }
532
533 constexpr Temp getTemp() const noexcept
534 {
535 return temp;
536 }
537
538 constexpr uint32_t tempId() const noexcept
539 {
540 return temp.id();
541 }
542
543 constexpr void setTemp(Temp t) noexcept {
544 temp = t;
545 }
546
547 constexpr RegClass regClass() const noexcept
548 {
549 return temp.regClass();
550 }
551
552 constexpr unsigned size() const noexcept
553 {
554 return temp.size();
555 }
556
557 constexpr bool isFixed() const noexcept
558 {
559 return isFixed_;
560 }
561
562 constexpr PhysReg physReg() const noexcept
563 {
564 return reg_;
565 }
566
567 constexpr void setFixed(PhysReg reg) noexcept
568 {
569 isFixed_ = 1;
570 reg_ = reg;
571 }
572
573 constexpr void setHint(PhysReg reg) noexcept
574 {
575 hasHint_ = 1;
576 reg_ = reg;
577 }
578
579 constexpr bool hasHint() const noexcept
580 {
581 return hasHint_;
582 }
583
584 constexpr void setKill(bool flag) noexcept
585 {
586 isKill_ = flag;
587 }
588
589 constexpr bool isKill() const noexcept
590 {
591 return isKill_;
592 }
593
594 private:
595 Temp temp = Temp(0, s1);
596 PhysReg reg_;
597 union {
598 struct {
599 uint8_t isFixed_:1;
600 uint8_t hasHint_:1;
601 uint8_t isKill_:1;
602 };
603 /* can't initialize bit-fields in c++11, so work around using a union */
604 uint8_t control_ = 0;
605 };
606 };
607
608 class Block;
609
/* Base class of all IR instructions. Instances are created with
 * create_instruction(), which allocates the operand/definition arrays as
 * trailing storage of the same allocation. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch space for per-pass bookkeeping */

   /* Both spans point into the trailing storage allocated by
    * create_instruction(); they are not independently owned. */
   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* True for any VALU encoding. Plain (non-flag) formats are all below
    * (1 << 8) and therefore never match these bit tests. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* Buffer/image memory; FLAT/GLOBAL/SCRATCH are deliberately excluded. */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* Only detects operands explicitly fixed to the exec register; implicit
    * exec reads are not reported. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
677
/* SOPK: scalar ALU with a 16-bit immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
};
681
/* SOPP: scalar program control (branches, waitcnt, etc.). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* NOTE(review): presumably the branch-target block index — confirm against users */
};
686
/* SOPC/SOP1/SOP2: scalar formats carrying no fields beyond the base class. */
struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
695
696 /**
697 * Scalar Memory Format:
698 * For s_(buffer_)load_dword*:
699 * Operand(0): SBASE - SGPR-pair which provides base address
700 * Operand(1): Offset - immediate (un)signed offset or SGPR
701 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
702 * Operand(n-1): SOffset - SGPR offset (Vega only)
703 *
704 * Having no operands is also valid for instructions such as s_dcache_inv.
705 *
706 */
707 struct SMEM_instruction : public Instruction {
708 bool glc; /* VI+: globally coherent */
709 bool dlc; /* NAVI: device level coherent */
710 bool nv; /* VEGA only: Non-volatile */
711 bool can_reorder;
712 bool disable_wqm;
713 barrier_interaction barrier;
714 };
715
/* VOP1/VOP2/VOPC: vector ALU formats carrying no fields beyond the base class. */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};
724
/* VOP3A: VALU encoding with input modifiers, clamp and output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3]; /* per-source absolute value */
   bool opsel[4]; /* 16-bit operand select; [3] applies to the destination */
   bool clamp;
   unsigned omod; /* output modifier: 0=none, otherwise *2/*4/*0.5 encodings */
   bool neg[3]; /* per-source negation */
};
732
733 /**
734 * Data Parallel Primitives Format:
735 * This format can be used for VOP1, VOP2 or VOPC instructions.
736 * The swizzle applies to the src0 operand.
737 *
738 */
739 struct DPP_instruction : public Instruction {
740 uint16_t dpp_ctrl;
741 uint8_t row_mask;
742 uint8_t bank_mask;
743 bool abs[2];
744 bool neg[2];
745 bool bound_ctrl;
746 };
747
/* VINTRP: parameter interpolation; selects attribute and component. */
struct Interp_instruction : public Instruction {
   unsigned attribute;
   unsigned component;
};
752
753 /**
754 * Local and Global Data Sharing instructions
755 * Operand(0): ADDR - VGPR which supplies the address.
756 * Operand(1): DATA0 - First data VGPR.
757 * Operand(2): DATA1 - Second data VGPR.
758 * Operand(n-1): M0 - LDS size.
759 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
760 *
761 */
762 struct DS_instruction : public Instruction {
763 int16_t offset0;
764 int8_t offset1;
765 bool gds;
766 };
767
768 /**
769 * Vector Memory Untyped-buffer Instructions
770 * Operand(0): VADDR - Address source. Can carry an index and/or offset
771 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
772 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
773 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
774 *
775 */
776 struct MUBUF_instruction : public Instruction {
777 unsigned offset; /* Unsigned byte offset - 12 bit */
778 bool offen; /* Supply an offset from VGPR (VADDR) */
779 bool idxen; /* Supply an index from VGPR (VADDR) */
780 bool glc; /* globally coherent */
781 bool dlc; /* NAVI: device level coherent */
782 bool slc; /* system level coherent */
783 bool tfe; /* texture fail enable */
784 bool lds; /* Return read-data to LDS instead of VGPRs */
785 bool disable_wqm; /* Require an exec mask without helper invocations */
786 bool can_reorder;
787 barrier_interaction barrier;
788 };
789
790 /**
791 * Vector Memory Typed-buffer Instructions
792 * Operand(0): VADDR - Address source. Can carry an index and/or offset
793 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
794 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
795 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
796 *
797 */
798 struct MTBUF_instruction : public Instruction {
799 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
800 uint8_t nfmt : 3; /* Numeric format of data in memory */
801 unsigned offset; /* Unsigned byte offset - 12 bit */
802 bool offen; /* Supply an offset from VGPR (VADDR) */
803 bool idxen; /* Supply an index from VGPR (VADDR) */
804 bool glc; /* globally coherent */
805 bool dlc; /* NAVI: device level coherent */
806 bool slc; /* system level coherent */
807 bool tfe; /* texture fail enable */
808 bool disable_wqm; /* Require an exec mask without helper invocations */
809 bool can_reorder;
810 barrier_interaction barrier;
811 };
812
813 /**
814 * Vector Memory Image Instructions
815 * Operand(0): VADDR - Address source. Can carry an offset or an index.
816 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
817 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
818 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
819 *
820 */
821 struct MIMG_instruction : public Instruction {
822 unsigned dmask; /* Data VGPR enable mask */
823 unsigned dim; /* NAVI: dimensionality */
824 bool unrm; /* Force address to be un-normalized */
825 bool dlc; /* NAVI: device level coherent */
826 bool glc; /* globally coherent */
827 bool slc; /* system level coherent */
828 bool tfe; /* texture fail enable */
829 bool da; /* declare an array */
830 bool lwe; /* Force data to be un-normalized */
831 bool r128; /* NAVI: Texture resource size */
832 bool a16; /* VEGA, NAVI: Address components are 16-bits */
833 bool d16; /* Convert 32-bit data to 16-bit data */
834 bool disable_wqm; /* Require an exec mask without helper invocations */
835 bool can_reorder;
836 barrier_interaction barrier;
837 };
838
839 /**
840 * Flat/Scratch/Global Instructions
841 * Operand(0): ADDR
842 * Operand(1): SADDR
843 * Operand(2) / Definition(0): DATA/VDST
844 *
845 */
846 struct FLAT_instruction : public Instruction {
847 uint16_t offset; /* Vega/Navi only */
848 bool slc; /* system level coherent */
849 bool glc; /* globally coherent */
850 bool dlc; /* NAVI: device level coherent */
851 bool lds;
852 bool nv;
853 bool disable_wqm;
854 };
855
/* EXP: export to color/position/parameter targets. */
struct Export_instruction : public Instruction {
   unsigned enabled_mask; /* which components of the operands are enabled */
   unsigned dest; /* export target index */
   bool compressed; /* data is 16-bit packed */
   bool done; /* last export of the shader */
   bool valid_mask;
};
863
/* Generic pseudo instruction (parallelcopies, create_vector, etc.),
 * lowered before assembly. */
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc; /* lowering must preserve scc across this instruction */
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
868
/* Branch pseudo instruction, lowered to real branches late. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
876
/* Barrier pseudo instruction; carries no fields beyond the base class. */
struct Pseudo_barrier_instruction : public Instruction {
};
879
/* Reduction operations, mostly in 32/64-bit pairs (integer add/mul/min/max,
 * float add/mul/min/max, bitwise and/or/xor). */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* special case: GFX10 wave64 lane permute helper */
};
896
897 /**
898 * Subgroup Reduction Instructions, everything except for the data to be
899 * reduced and the result as inserted by setup_reduce_temp().
900 * Operand(0): data to be reduced
901 * Operand(1): reduce temporary
902 * Operand(2): vector temporary
903 * Definition(0): result
904 * Definition(1): scalar temporary
905 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
906 * Definition(3): scc clobber
907 * Definition(4): vcc clobber
908 *
909 */
910 struct Pseudo_reduction_instruction : public Instruction {
911 ReduceOp reduce_op;
912 unsigned cluster_size; // must be 0 for scans
913 };
914
/* Deleter matching the calloc() allocation done by create_instruction(). */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* Owning instruction pointer; releases with free(), not delete. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
923
/* Allocates and zero-initializes an instruction of type T with trailing
 * storage for its operand and definition arrays (one allocation total).
 * The caller owns the result and must release it with free(), typically by
 * wrapping it in an aco_ptr.
 * NOTE(review): assumes Operand/Definition need no extra alignment padding
 * at `data + sizeof(T)`, and that calloc() does not fail — confirm. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size); /* zero-init: all flags start cleared */
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* point the spans into the trailing storage */
   inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
   inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);

   return inst;
}
939
/* Returns true if the instruction uses VALU modifiers (abs/neg/opsel/clamp/
 * omod) or an encoding (DPP/SDWA) that implies modifier support. */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   /* all VOP3 variants are inspected through the VOP3A field layout */
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
         return true;
   }
   /* opsel[3] is the destination opsel bit */
   return vop3->opsel[3] || vop3->clamp || vop3->omod;
}
953
954 constexpr bool is_phi(Instruction* instr)
955 {
956 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
957 }
958
959 static inline bool is_phi(aco_ptr<Instruction>& instr)
960 {
961 return is_phi(instr.get());
962 }
963
964 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
965 {
966 switch (instr->format) {
967 case Format::SMEM:
968 return static_cast<SMEM_instruction*>(instr)->barrier;
969 case Format::MUBUF:
970 return static_cast<MUBUF_instruction*>(instr)->barrier;
971 case Format::MIMG:
972 return static_cast<MIMG_instruction*>(instr)->barrier;
973 case Format::FLAT:
974 case Format::GLOBAL:
975 return barrier_buffer;
976 case Format::DS:
977 return barrier_shared;
978 default:
979 return barrier_none;
980 }
981 }
982
/* Flags stored in Block::kind describing a block's role in the CFG. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};
1002
1003
1004 struct RegisterDemand {
1005 constexpr RegisterDemand() = default;
1006 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1007 : vgpr{v}, sgpr{s} {}
1008 int16_t vgpr = 0;
1009 int16_t sgpr = 0;
1010
1011 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1012 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1013 }
1014
1015 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1016 return vgpr > other.vgpr || sgpr > other.sgpr;
1017 }
1018
1019 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1020 if (t.type() == RegType::sgpr)
1021 return RegisterDemand( vgpr, sgpr + t.size() );
1022 else
1023 return RegisterDemand( vgpr + t.size(), sgpr );
1024 }
1025
1026 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1027 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1028 }
1029
1030 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1031 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1032 }
1033
1034 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1035 vgpr += other.vgpr;
1036 sgpr += other.sgpr;
1037 return *this;
1038 }
1039
1040 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1041 vgpr -= other.vgpr;
1042 sgpr -= other.sgpr;
1043 return *this;
1044 }
1045
1046 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1047 if (t.type() == RegType::sgpr)
1048 sgpr += t.size();
1049 else
1050 vgpr += t.size();
1051 return *this;
1052 }
1053
1054 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1055 if (t.type() == RegType::sgpr)
1056 sgpr -= t.size();
1057 else
1058 vgpr -= t.size();
1059 return *this;
1060 }
1061
1062 constexpr void update(const RegisterDemand other) noexcept {
1063 vgpr = std::max(vgpr, other.vgpr);
1064 sgpr = std::max(sgpr, other.sgpr);
1065 }
1066
1067 };
1068
/* CFG: a basic block with both logical and linear edges.
 * NOTE(review): logical appears to be the divergence-aware CFG and linear the
 * as-executed whole-wave CFG — confirm against the CFG construction code. */
struct Block {
   float_mode fp_mode; /* FP mode in effect for this block */
   unsigned index; /* position in Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   int logical_idom = -1; /* immediate dominator block index, -1 if unset */
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1094
/* A shader stage is the combination of the software stages (the API shader
 * types contained in the program; several may be set when stages are merged)
 * and the hardware stage the program runs on. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
/* Fixed: TES running as the export shader before geometry uses hw_es (like
 * vertex_es above); hw_gs here was a typo contradicting the comment. */
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1136
/* Top-level container for one compiled shader: the CFG, target/limit
 * information, and the SSA id allocator used when creating temporaries. */
class Program final {
public:
   float_mode next_fp_mode;        /* fp_mode assigned to blocks created after this point */
   std::vector<Block> blocks;      /* basic blocks, indexed by Block::index */
   RegisterDemand max_reg_demand = RegisterDemand();  /* peak demand over the whole program */
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;             /* threads per wave (e.g. 32 or 64) — set by the driver */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false; /* write back SMEM L1 cache at the end of the shader */

   std::vector<uint8_t> constant_data;   /* raw constant data appended after the code */
   Temp private_segment_buffer;          /* resource descriptor temp for scratch access */
   Temp scratch_offset;

   /* target limits/granularities — filled in from the chip configuration */
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */

   /* special registers the program ends up needing */
   bool needs_vcc = false;
   bool needs_xnack_mask = false;
   bool needs_flat_scr = false;

   /* Hand out a fresh SSA id. Ids are limited to 24 bits
    * (presumably because they are packed into a bitfield elsewhere — confirm). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Next id that allocateId() would return, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Reset the id counter, e.g. after renumbering temporaries. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Append a new empty block (index = current block count) and return it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Append an existing block, re-assigning its index and fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;   /* next SSA id; 0 is never handed out */
};
1201
/* Result of liveness analysis, both vectors indexed by Block::index. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1208
/* Instruction selection: build the ACO IR from the NIR shader(s). */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);

/* Lower whole-quad-mode / exact pseudo-instructions. */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
/* Lower boolean phis (divergent booleans live in lane masks). */
void lower_bool_phis(Program* program);
/* Recompute num_waves/max_waves after register demand changed. */
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
/* Liveness analysis; fills the 'live' result used by later passes. */
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
/* Returns per-temp use counts; zero means dead. */
std::vector<uint16_t> dead_code_analysis(Program *program);
/* Compute logical/linear immediate dominators (Block::*_idom). */
void dominator_tree(Program* program);
/* Insert exec-mask handling for divergent control flow and discards. */
void insert_exec_mask(Program *program);
/* Global value numbering (CSE). */
void value_numbering(Program* program);
/* Main optimizer (combining, propagation, etc.). */
void optimize(Program* program);
/* Allocate temporaries used by reduction operations. */
void setup_reduce_temp(Program* program);
/* Convert out of SSA towards conventional form for register allocation. */
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
/* Assign physical registers to all temporaries. */
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
/* Remove phis after register allocation (insert parallelcopies). */
void ssa_elimination(Program* program);
/* Lower remaining pseudo-instructions to real hardware instructions. */
void lower_to_hw_instr(Program* program);
/* Instruction scheduler (operates on liveness info). */
void schedule_program(Program* program, live& live_vars);
/* Spill registers to scratch/LDS when demand exceeds the limits. */
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
/* Insert s_waitcnt instructions. */
void insert_wait_states(Program* program);
/* Insert NOPs to satisfy hardware hazards. */
void insert_NOPs(Program* program);
/* Encode the program to machine code; returns the code size. */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/* Disassemble 'binary' into 'out'. */
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
/* IR validation (enabled via DEBUG_VALIDATE / DEBUG_VALIDATE_RA). */
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
/* NOTE(review): the release macro takes a leading 'program' argument that the
 * debug declaration above does not — the two arities differ; verify call sites. */
#define perfwarn(program, cond, msg, ...)
#endif

void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of allocated sgprs required to address a number of sgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);

/* return number of addressable SGPRs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1256
/* Per-opcode metadata tables, indexed by aco_opcode. Generated elsewhere
 * and exposed through the single global 'instr_info' instance. */
typedef struct {
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];   /* hw opcode on GFX9, presumably -1 if unsupported — confirm against generator */
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];  /* hw opcode on GFX10 */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];            /* human-readable mnemonic */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];    /* encoding format */
} Info;

extern const Info instr_info;
1267
1268 }
1269
1270 #endif /* ACO_IR_H */
1271