aco: Use common argument handling
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
/* Global bitmask of the DEBUG_* options below; the definition (and how it
 * gets populated) lives outside this header. */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
};
52
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 *
 * The scalar/memory/pseudo formats are plain enumerators in the low byte,
 * while each VALU format is a single bit in the high byte so they can be
 * OR'd together (see asVOP3() and Instruction::isVALU()).
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format (7 is unassigned) */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   /* VOP3, VOP3A and VOP3B intentionally share one bit; they are
    * distinguished by opcode, not by format */
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
104
/* Bitmask describing which class(es) of memory an instruction accesses or
 * orders against; barrier_count is the number of distinct bits above it. */
enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   barrier_count = 4,
};
113
/* Float rounding modes; values match the 2-bit round fields of the hardware
 * MODE register. */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero (truncate) */
};
120
/* Denormal handling; values match the 2-bit denorm fields of the MODE
 * register (each bit enables input/output denormals respectively). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
127
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;     /* rounding mode for 32-bit floats */
         fp_round round16_64:2;  /* rounding mode for 16-/64-bit floats */
         unsigned denorm32:2;    /* fp_denorm bits for 32-bit floats */
         unsigned denorm16_64:2; /* fp_denorm bits for 16-/64-bit floats */
      };
      uint8_t val = 0;           /* the four fields viewed as a single byte */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE byte must match exactly and every
    * strictness flag set in "other" must also be set here. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
160
161 constexpr Format asVOP3(Format format) {
162 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
163 };
164
/* The register file a value lives in. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar GPR */
   vgpr,        /* vector GPR */
   linear_vgpr, /* vgpr with linear liveness (see RegClass::v*_linear) */
};
171
struct RegClass {

   /* Encoding: bits [4:0] hold the size in dwords, bit 5 marks a vgpr
    * class and bit 6 marks a linear vgpr class. */
   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* sgpr classes are exactly the values <= s16 (no bit 5/6 set) */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   /* size in dwords */
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   /* sgprs always have linear liveness; vgprs only when bit 6 is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
212
213 /* transitional helper expressions */
214 static constexpr RegClass s1{RegClass::s1};
215 static constexpr RegClass s2{RegClass::s2};
216 static constexpr RegClass s3{RegClass::s3};
217 static constexpr RegClass s4{RegClass::s4};
218 static constexpr RegClass s8{RegClass::s8};
219 static constexpr RegClass s16{RegClass::s16};
220 static constexpr RegClass v1{RegClass::v1};
221 static constexpr RegClass v2{RegClass::v2};
222 static constexpr RegClass v3{RegClass::v3};
223 static constexpr RegClass v4{RegClass::v4};
224 static constexpr RegClass v5{RegClass::v5};
225 static constexpr RegClass v6{RegClass::v6};
226 static constexpr RegClass v7{RegClass::v7};
227 static constexpr RegClass v8{RegClass::v8};
228
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   /* comparisons look at the SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_:24;    /* SSA id, packed to 24 bits so Temp stays compact */
   RegClass reg_class;
};
255
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * The implicit conversion to unsigned allows direct comparison with
 * register indices.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg(r) {}
   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};
268
/* helper expressions for special registers (values are the hardware SGPR
 * encodings) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};       /* vcc_lo */
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};      /* alias of exec_lo */
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};
277
278 /**
279 * Operand Class
280 * Initially, each Operand refers to either
281 * a temporary virtual register
282 * or to a constant value
283 * Temporary registers get mapped to physical register during RA
284 * Constant values are inlined into the instruction sequence.
285 */
286 class Operand final
287 {
288 public:
289 constexpr Operand()
290 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
291 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
292
293 explicit Operand(Temp r) noexcept
294 {
295 data_.temp = r;
296 if (r.id()) {
297 isTemp_ = true;
298 } else {
299 isUndef_ = true;
300 setFixed(PhysReg{128});
301 }
302 };
303 explicit Operand(uint32_t v) noexcept
304 {
305 data_.i = v;
306 isConstant_ = true;
307 if (v <= 64)
308 setFixed(PhysReg{128 + v});
309 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
310 setFixed(PhysReg{192 - v});
311 else if (v == 0x3f000000) /* 0.5 */
312 setFixed(PhysReg{240});
313 else if (v == 0xbf000000) /* -0.5 */
314 setFixed(PhysReg{241});
315 else if (v == 0x3f800000) /* 1.0 */
316 setFixed(PhysReg{242});
317 else if (v == 0xbf800000) /* -1.0 */
318 setFixed(PhysReg{243});
319 else if (v == 0x40000000) /* 2.0 */
320 setFixed(PhysReg{244});
321 else if (v == 0xc0000000) /* -2.0 */
322 setFixed(PhysReg{245});
323 else if (v == 0x40800000) /* 4.0 */
324 setFixed(PhysReg{246});
325 else if (v == 0xc0800000) /* -4.0 */
326 setFixed(PhysReg{247});
327 else if (v == 0x3e22f983) /* 1/(2*PI) */
328 setFixed(PhysReg{248});
329 else /* Literal Constant */
330 setFixed(PhysReg{255});
331 };
332 explicit Operand(uint64_t v) noexcept
333 {
334 isConstant_ = true;
335 is64BitConst_ = true;
336 if (v <= 64)
337 setFixed(PhysReg{128 + (uint32_t) v});
338 else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
339 setFixed(PhysReg{192 - (uint32_t) v});
340 else if (v == 0x3FE0000000000000) /* 0.5 */
341 setFixed(PhysReg{240});
342 else if (v == 0xBFE0000000000000) /* -0.5 */
343 setFixed(PhysReg{241});
344 else if (v == 0x3FF0000000000000) /* 1.0 */
345 setFixed(PhysReg{242});
346 else if (v == 0xBFF0000000000000) /* -1.0 */
347 setFixed(PhysReg{243});
348 else if (v == 0x4000000000000000) /* 2.0 */
349 setFixed(PhysReg{244});
350 else if (v == 0xC000000000000000) /* -2.0 */
351 setFixed(PhysReg{245});
352 else if (v == 0x4010000000000000) /* 4.0 */
353 setFixed(PhysReg{246});
354 else if (v == 0xC010000000000000) /* -4.0 */
355 setFixed(PhysReg{247});
356 else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
357 setFixed(PhysReg{248});
358 else { /* Literal Constant: we don't know if it is a long or double.*/
359 isConstant_ = 0;
360 assert(false && "attempt to create a 64-bit literal constant");
361 }
362 };
363 explicit Operand(RegClass type) noexcept
364 {
365 isUndef_ = true;
366 data_.temp = Temp(0, type);
367 setFixed(PhysReg{128});
368 };
369 explicit Operand(PhysReg reg, RegClass type) noexcept
370 {
371 data_.temp = Temp(0, type);
372 setFixed(reg);
373 }
374
375 constexpr bool isTemp() const noexcept
376 {
377 return isTemp_;
378 }
379
380 constexpr void setTemp(Temp t) noexcept {
381 assert(!isConstant_);
382 isTemp_ = true;
383 data_.temp = t;
384 }
385
386 constexpr Temp getTemp() const noexcept
387 {
388 return data_.temp;
389 }
390
391 constexpr uint32_t tempId() const noexcept
392 {
393 return data_.temp.id();
394 }
395
396 constexpr bool hasRegClass() const noexcept
397 {
398 return isTemp() || isUndefined();
399 }
400
401 constexpr RegClass regClass() const noexcept
402 {
403 return data_.temp.regClass();
404 }
405
406 constexpr unsigned size() const noexcept
407 {
408 if (isConstant())
409 return is64BitConst_ ? 2 : 1;
410 else
411 return data_.temp.size();
412 }
413
414 constexpr bool isFixed() const noexcept
415 {
416 return isFixed_;
417 }
418
419 constexpr PhysReg physReg() const noexcept
420 {
421 return reg_;
422 }
423
424 constexpr void setFixed(PhysReg reg) noexcept
425 {
426 isFixed_ = reg != unsigned(-1);
427 reg_ = reg;
428 }
429
430 constexpr bool isConstant() const noexcept
431 {
432 return isConstant_;
433 }
434
435 constexpr bool isLiteral() const noexcept
436 {
437 return isConstant() && reg_ == 255;
438 }
439
440 constexpr bool isUndefined() const noexcept
441 {
442 return isUndef_;
443 }
444
445 constexpr uint32_t constantValue() const noexcept
446 {
447 return data_.i;
448 }
449
450 constexpr bool constantEquals(uint32_t cmp) const noexcept
451 {
452 return isConstant() && constantValue() == cmp;
453 }
454
455 constexpr void setKill(bool flag) noexcept
456 {
457 isKill_ = flag;
458 if (!flag)
459 setFirstKill(false);
460 }
461
462 constexpr bool isKill() const noexcept
463 {
464 return isKill_ || isFirstKill();
465 }
466
467 constexpr void setFirstKill(bool flag) noexcept
468 {
469 isFirstKill_ = flag;
470 if (flag)
471 setKill(flag);
472 }
473
474 /* When there are multiple operands killing the same temporary,
475 * isFirstKill() is only returns true for the first one. */
476 constexpr bool isFirstKill() const noexcept
477 {
478 return isFirstKill_;
479 }
480
481 private:
482 union {
483 uint32_t i;
484 float f;
485 Temp temp = Temp(0, s1);
486 } data_;
487 PhysReg reg_;
488 union {
489 struct {
490 uint8_t isTemp_:1;
491 uint8_t isFixed_:1;
492 uint8_t isConstant_:1;
493 uint8_t isKill_:1;
494 uint8_t isUndef_:1;
495 uint8_t isFirstKill_:1;
496 uint8_t is64BitConst_:1;
497 };
498 /* can't initialize bit-fields in c++11, so work around using a union */
499 uint8_t control_ = 0;
500 };
501 };
502
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* fixed to a physical register without a temporary (id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 is reserved for "no temporary" */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* NOTE: the hint shares reg_ with the fixed register, so a later
    * setFixed() overwrites the hint (and vice versa) */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
607
class Block;

/* Base of all instructions: the opcode, the microcode encoding format and
 * spans of operands (sources) and definitions (results). The spans point
 * into trailing storage of the same allocation (see create_instruction()). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch bits, meaning is per-pass */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* VALU formats are single bits which may be OR'd together, so these
    * predicates test bits rather than comparing for equality. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory instructions (FLAT/GLOBAL/SCRATCH not included) */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
677
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit immediate encoded in the instruction word */
};

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* branch-target block index; presumably negative when the
               * instruction is not a branch -- TODO confirm at use sites */
};

struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
695
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   bool glc; /* VI+: globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool nv; /* VEGA only: Non-volatile */
   bool can_reorder;  /* false forbids reordering against other memory ops */
   bool disable_wqm;  /* requires an exec mask without helper invocations */
   barrier_interaction barrier; /* memory classes this access orders against */
};
715
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};

struct VOP3A_instruction : public Instruction {
   bool abs[3];   /* take the absolute value of source i */
   bool opsel[4]; /* operand-select bits: sources 0..2 plus the destination */
   bool clamp;    /* clamp the result to [0, 1] */
   unsigned omod; /* output modifier: 0 = none, 1 = *2, 2 = *4, 3 = /2 */
   bool neg[3];   /* negate source i */
};
732
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   uint16_t dpp_ctrl;  /* hardware DPP_CTRL swizzle pattern */
   uint8_t row_mask;   /* 4-bit row write-enable mask */
   uint8_t bank_mask;  /* 4-bit bank write-enable mask */
   bool abs[2];        /* absolute value of source i */
   bool neg[2];        /* negate source i */
   bool bound_ctrl;    /* controls the result for invalid/inactive lanes */
};

struct Interp_instruction : public Instruction {
   unsigned attribute; /* attribute index to interpolate */
   unsigned component; /* component (x/y/z/w) within the attribute */
};
752
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0; /* NOTE(review): the HW offset fields are unsigned
                     * (16-bit / 8-bit); signed types here could mis-encode
                     * large offsets -- confirm against the assembler */
   int8_t offset1;
   bool gds;        /* operate on GDS instead of LDS */
};
767
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool lds; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* false forbids reordering against other memory ops */
   barrier_interaction barrier; /* memory classes this access orders against */
};
789
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* false forbids reordering against other memory ops */
   barrier_interaction barrier; /* memory classes this access orders against */
};
812
/**
 * Vector Memory Image Instructions
 * Operand(0): VADDR - Address source. Can carry an offset or an index.
 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
 *
 */
struct MIMG_instruction : public Instruction {
   unsigned dmask; /* Data VGPR enable mask */
   unsigned dim; /* NAVI: dimensionality */
   bool unrm; /* Force address to be un-normalized */
   bool dlc; /* NAVI: device level coherent */
   bool glc; /* globally coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool da; /* declare an array */
   bool lwe; /* LOD warning enable (previous comment was a copy-paste of unrm) */
   bool r128; /* NAVI: Texture resource size */
   bool a16; /* VEGA, NAVI: Address components are 16-bits */
   bool d16; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* false forbids reordering against other memory ops */
   barrier_interaction barrier; /* memory classes this access orders against */
};
838
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega only */
   bool slc; /* system level coherent */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool lds; /* return read-data to LDS instead of VGPRs */
   bool nv;  /* non-volatile */
};
854
struct Export_instruction : public Instruction {
   unsigned enabled_mask; /* which of the four components are written */
   unsigned dest;         /* export target (MRT, position, parameter, ...) */
   bool compressed;       /* data is 16-bit, packed two components per VGPR */
   bool done;             /* this is the shader's final export of its kind */
   bool valid_mask;
};

struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;      /* a temporary lives in scc across this instruction */
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
867
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};

struct Pseudo_barrier_instruction : public Instruction {
};
878
/* Reduction operators for Pseudo_reduction_instruction; each operator comes
 * in a 32-bit and a 64-bit flavor. */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction: GFX10 wave64 bpermute lowering */
};
895
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
913
/* Instructions are allocated with calloc() (see create_instruction()), so
 * they are released with free(). Note that no destructor runs: instruction
 * structs must not rely on destructors for cleanup. */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* owning pointer to a (calloc-allocated) instruction */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
922
/* Allocates an instruction of type T with its operand and definition arrays
 * placed in the same allocation, directly behind the struct. The memory is
 * zero-initialized by calloc, which doubles as field initialization.
 * NOTE(review): the allocation is not checked for NULL. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* operands start right after T; definitions right after the operands */
   inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
   inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);

   return inst;
}
938
/* Returns true if the instruction uses any VALU modifier: DPP, SDWA, or any
 * VOP3 abs/neg/opsel/clamp/omod bit. */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
         return true;
   }
   /* opsel[3] is the destination's opsel bit, not covered by the loop */
   return vop3->opsel[3] || vop3->clamp || vop3->omod;
}
952
953 constexpr bool is_phi(Instruction* instr)
954 {
955 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
956 }
957
958 static inline bool is_phi(aco_ptr<Instruction>& instr)
959 {
960 return is_phi(instr.get());
961 }
962
963 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
964 {
965 switch (instr->format) {
966 case Format::SMEM:
967 return static_cast<SMEM_instruction*>(instr)->barrier;
968 case Format::MUBUF:
969 return static_cast<MUBUF_instruction*>(instr)->barrier;
970 case Format::MIMG:
971 return static_cast<MIMG_instruction*>(instr)->barrier;
972 case Format::FLAT:
973 case Format::GLOBAL:
974 return barrier_buffer;
975 case Format::DS:
976 return barrier_shared;
977 default:
978 return barrier_none;
979 }
980 }
981
/* Bitmask values for Block::kind, describing a block's role in the CFG. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};
1001
1002
1003 struct RegisterDemand {
1004 constexpr RegisterDemand() = default;
1005 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1006 : vgpr{v}, sgpr{s} {}
1007 int16_t vgpr = 0;
1008 int16_t sgpr = 0;
1009
1010 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1011 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1012 }
1013
1014 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1015 return vgpr > other.vgpr || sgpr > other.sgpr;
1016 }
1017
1018 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1019 if (t.type() == RegType::sgpr)
1020 return RegisterDemand( vgpr, sgpr + t.size() );
1021 else
1022 return RegisterDemand( vgpr + t.size(), sgpr );
1023 }
1024
1025 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1026 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1027 }
1028
1029 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1030 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1031 }
1032
1033 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1034 vgpr += other.vgpr;
1035 sgpr += other.sgpr;
1036 return *this;
1037 }
1038
1039 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1040 vgpr -= other.vgpr;
1041 sgpr -= other.sgpr;
1042 return *this;
1043 }
1044
1045 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1046 if (t.type() == RegType::sgpr)
1047 sgpr += t.size();
1048 else
1049 vgpr += t.size();
1050 return *this;
1051 }
1052
1053 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1054 if (t.type() == RegType::sgpr)
1055 sgpr -= t.size();
1056 else
1057 vgpr -= t.size();
1058 return *this;
1059 }
1060
1061 constexpr void update(const RegisterDemand other) noexcept {
1062 vgpr = std::max(vgpr, other.vgpr);
1063 sgpr = std::max(sgpr, other.sgpr);
1064 }
1065
1066 };
1067
/* CFG */
struct Block {
   float_mode fp_mode; /* float behavior assumed by this block's code */
   unsigned index;     /* index of this block within Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* "logical" edges presumably follow the source program's control flow and
    * "linear" edges the lowered, exec-masked control flow -- inferred from
    * naming; confirm against the CFG construction passes */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;     /* mask of block_kind bits */
   int logical_idom = -1; /* immediate dominator block index, -1 if unset */
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1093
/* A Stage value is a bitmask combining one or more software (API) stages
 * with the single hardware stage they run on; see the named combinations
 * below. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1135
1136 class Program final {
1137 public:
1138 float_mode next_fp_mode;
1139 std::vector<Block> blocks;
1140 RegisterDemand max_reg_demand = RegisterDemand();
1141 uint16_t num_waves = 0;
1142 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1143 ac_shader_config* config;
1144 struct radv_shader_info *info;
1145 enum chip_class chip_class;
1146 enum radeon_family family;
1147 unsigned wave_size;
1148 Stage stage; /* Stage */
1149 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1150 bool needs_wqm = false; /* there exists a p_wqm instruction */
1151 bool wb_smem_l1_on_end = false;
1152
1153 std::vector<uint8_t> constant_data;
1154 Temp private_segment_buffer;
1155 Temp scratch_offset;
1156
1157 uint16_t lds_alloc_granule;
1158 uint32_t lds_limit; /* in bytes */
1159 uint16_t vgpr_limit;
1160 uint16_t sgpr_limit;
1161 uint16_t physical_sgprs;
1162 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1163
1164 bool needs_vcc = false;
1165 bool needs_xnack_mask = false;
1166 bool needs_flat_scr = false;
1167
1168 uint32_t allocateId()
1169 {
1170 assert(allocationID <= 16777215);
1171 return allocationID++;
1172 }
1173
1174 uint32_t peekAllocationId()
1175 {
1176 return allocationID;
1177 }
1178
1179 void setAllocationId(uint32_t id)
1180 {
1181 allocationID = id;
1182 }
1183
1184 Block* create_and_insert_block() {
1185 blocks.emplace_back(blocks.size());
1186 blocks.back().fp_mode = next_fp_mode;
1187 return &blocks.back();
1188 }
1189
1190 Block* insert_block(Block&& block) {
1191 block.index = blocks.size();
1192 block.fp_mode = next_fp_mode;
1193 blocks.emplace_back(std::move(block));
1194 return &blocks.back();
1195 }
1196
1197 private:
1198 uint32_t allocationID = 1;
1199 };
1200
/* Result of live_var_analysis(); outer vectors are indexed by Block::index. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1207
/* Instruction selection: builds the ACO IR for the given NIR shaders into
 * `program`. */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);

/* Backend passes; each transforms or analyzes a whole Program. Passes taking
 * a `live` consume/update the results of live_var_analysis(). */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
/* Encodes the program into `code`; presumably returns the emitted size — TODO
 * confirm the unit (dwords vs bytes). */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
/* IR consistency checks (see the DEBUG_VALIDATE* flags above) */
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
/* NOTE(review): this no-op macro takes a leading `program` argument that the
 * debug declaration above does not — a call site can only match one of the
 * two; verify and unify the signatures. */
#define perfwarn(program, cond, msg, ...)
#endif

void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs allocated required to address a number of sgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);

/* return number of addressable SGPRs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1255
/* Per-opcode property tables, each indexed by aco_opcode — presumably
 * generated from the opcode definitions at build time; TODO confirm. */
typedef struct {
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];  /* opcode number on GFX9; presumably a sentinel when unsupported — TODO confirm */
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)]; /* opcode number on GFX10 */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];           /* mnemonic, e.g. for printing */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];   /* microcode encoding format */
} Info;

/* the single global instance of the tables above (defined in a .cpp) */
extern const Info instr_info;
1266
1267 }
1268
1269 #endif /* ACO_IR_H */
1270