3f38e6aadaebf99b878b704df1ec10501bef3acf
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
/* Debug option bits; every enumerator occupies its own bit. */
enum {
   DEBUG_VALIDATE = 1 << 0,
   DEBUG_VALIDATE_RA = 1 << 1,
   DEBUG_PERFWARN = 1 << 2,
};
52
/**
 * Microcode encoding format of an instruction.
 *
 * The non-vector formats are plain numbers stored in the low byte, while
 * the Vector ALU formats (and DPP/SDWA) are single bits, so that some of
 * them can be combined:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0x00,
   /* Scalar ALU & Control Formats */
   SOP1 = 0x01,
   SOP2 = 0x02,
   SOPK = 0x03,
   SOPP = 0x04,
   SOPC = 0x05,
   /* Scalar Memory Format */
   SMEM = 0x06,
   /* LDS/GDS Format (note: the value 7 is not assigned) */
   DS = 0x08,
   /* Vector Memory Buffer Formats */
   MTBUF = 0x09,
   MUBUF = 0x0a,
   /* Vector Memory Image Format */
   MIMG = 0x0b,
   /* Export Format */
   EXP = 0x0c,
   /* Flat Formats */
   FLAT = 0x0d,
   GLOBAL = 0x0e,
   SCRATCH = 0x0f,

   PSEUDO_BRANCH = 0x10,
   PSEUDO_BARRIER = 0x11,
   PSEUDO_REDUCTION = 0x12,

   /* Vector ALU Formats (one bit each) */
   VOP1 = 0x0100,
   VOP2 = 0x0200,
   VOPC = 0x0400,
   /* VOP3, VOP3A and VOP3B intentionally share the same bit */
   VOP3 = 0x0800,
   VOP3A = 0x0800,
   VOP3B = 0x0800,
   VOP3P = 0x1000,
   /* Vector Parameter Interpolation Format */
   VINTRP = 0x2000,
   DPP = 0x4000,
   SDWA = 0x8000,
};
104
/* Bit mask of barrier interaction kinds; barrier_count is the number of
 * distinct bits defined below. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 1 << 0,
   barrier_image = 1 << 1,
   barrier_atomic = 1 << 2,
   barrier_shared = 1 << 3,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 1 << 4,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 1 << 5,
   barrier_count = 6,
};
118
/* Floating point rounding mode encodings (2 bits).
 * NOTE(review): the suffixes presumably abbreviate nearest-even, +infinity,
 * -infinity and toward-zero - confirm against the hardware documentation. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi,
   fp_round_ni,
   fp_round_tz,
};
125
/* Denormal handling encodings (2 bits). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0,
   fp_denorm_keep = 3,
};
132
133 struct float_mode {
134 /* matches encoding of the MODE register */
135 union {
136 struct {
137 fp_round round32:2;
138 fp_round round16_64:2;
139 unsigned denorm32:2;
140 unsigned denorm16_64:2;
141 };
142 uint8_t val = 0;
143 };
144 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
145 bool preserve_signed_zero_inf_nan32:1;
146 bool preserve_signed_zero_inf_nan16_64:1;
147 /* if false, optimizations which may remove denormal flushing can be done */
148 bool must_flush_denorms32:1;
149 bool must_flush_denorms16_64:1;
150 bool care_about_round32:1;
151 bool care_about_round16_64:1;
152
153 /* Returns true if instructions using the mode "other" can safely use the
154 * current one instead. */
155 bool canReplace(float_mode other) const noexcept {
156 return val == other.val &&
157 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
158 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
159 (must_flush_denorms32 || !other.must_flush_denorms32) &&
160 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
161 (care_about_round32 || !other.care_about_round32) &&
162 (care_about_round16_64 || !other.care_about_round16_64);
163 }
164 };
165
166 constexpr Format asVOP3(Format format) {
167 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
168 };
169
/* Kind of register file a value lives in. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

/**
 * Register class: the size (low 5 bits) together with the register type
 * (bit 5 marks a VGPR class, bit 6 marks a linear VGPR class).
 */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   /* Builds the class from a type and a size.
    * RegType::vgpr sets the VGPR bit and RegType::linear_vgpr sets the VGPR
    * and the linear bit, so that e.g. (linear_vgpr, 1) yields v1_linear.
    * Previously linear_vgpr silently produced an SGPR class.
    * Every other type yields an SGPR class. */
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr        ? (1 << 5) :
                  type == RegType::linear_vgpr ? ((1 << 5) | (1 << 6)) : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* Only distinguishes sgpr and vgpr; linear VGPR classes report vgpr. */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   /* SGPR classes always count as linear. */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
217
218 /* transitional helper expressions */
219 static constexpr RegClass s1{RegClass::s1};
220 static constexpr RegClass s2{RegClass::s2};
221 static constexpr RegClass s3{RegClass::s3};
222 static constexpr RegClass s4{RegClass::s4};
223 static constexpr RegClass s8{RegClass::s8};
224 static constexpr RegClass s16{RegClass::s16};
225 static constexpr RegClass v1{RegClass::v1};
226 static constexpr RegClass v2{RegClass::v2};
227 static constexpr RegClass v3{RegClass::v3};
228 static constexpr RegClass v4{RegClass::v4};
229 static constexpr RegClass v5{RegClass::v5};
230 static constexpr RegClass v6{RegClass::v6};
231 static constexpr RegClass v7{RegClass::v7};
232 static constexpr RegClass v8{RegClass::v8};
233
234 /**
235 * Temp Class
236 * Each temporary virtual register has a
237 * register class (i.e. size and type)
238 * and SSA id.
239 */
240 struct Temp {
241 Temp() = default;
242 constexpr Temp(uint32_t id, RegClass cls) noexcept
243 : id_(id), reg_class(cls) {}
244
245 constexpr uint32_t id() const noexcept { return id_; }
246 constexpr RegClass regClass() const noexcept { return reg_class; }
247
248 constexpr unsigned size() const noexcept { return reg_class.size(); }
249 constexpr RegType type() const noexcept { return reg_class.type(); }
250 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
251
252 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
253 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
254 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
255
256 private:
257 uint32_t id_:24;
258 RegClass reg_class;
259 };
260
/**
 * PhysReg
 * Thin wrapper around the 16-bit index of the physical register
 * used by an Operand or Definition. Construction from a number is
 * explicit, conversion back to unsigned is implicit.
 */
struct PhysReg {
   uint16_t reg = 0;

   constexpr PhysReg() = default;
   /* values that do not fit into 16 bits are truncated */
   explicit constexpr PhysReg(unsigned r) : reg(static_cast<uint16_t>(r)) {}
   constexpr operator unsigned() const { return reg; }
};
273
274 /* helper expressions for special registers */
275 static constexpr PhysReg m0{124};
276 static constexpr PhysReg vcc{106};
277 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
278 static constexpr PhysReg exec{126};
279 static constexpr PhysReg exec_lo{126};
280 static constexpr PhysReg exec_hi{127};
281 static constexpr PhysReg scc{253};
282
283 /**
284 * Operand Class
285 * Initially, each Operand refers to either
286 * a temporary virtual register
287 * or to a constant value
288 * Temporary registers get mapped to physical register during RA
289 * Constant values are inlined into the instruction sequence.
290 */
291 class Operand final
292 {
293 public:
294 constexpr Operand()
295 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
296 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
297
298 explicit Operand(Temp r) noexcept
299 {
300 data_.temp = r;
301 if (r.id()) {
302 isTemp_ = true;
303 } else {
304 isUndef_ = true;
305 setFixed(PhysReg{128});
306 }
307 };
308 explicit Operand(uint32_t v, bool is64bit = false) noexcept
309 {
310 data_.i = v;
311 isConstant_ = true;
312 is64BitConst_ = is64bit;
313 if (v <= 64)
314 setFixed(PhysReg{128 + v});
315 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
316 setFixed(PhysReg{192 - v});
317 else if (v == 0x3f000000) /* 0.5 */
318 setFixed(PhysReg{240});
319 else if (v == 0xbf000000) /* -0.5 */
320 setFixed(PhysReg{241});
321 else if (v == 0x3f800000) /* 1.0 */
322 setFixed(PhysReg{242});
323 else if (v == 0xbf800000) /* -1.0 */
324 setFixed(PhysReg{243});
325 else if (v == 0x40000000) /* 2.0 */
326 setFixed(PhysReg{244});
327 else if (v == 0xc0000000) /* -2.0 */
328 setFixed(PhysReg{245});
329 else if (v == 0x40800000) /* 4.0 */
330 setFixed(PhysReg{246});
331 else if (v == 0xc0800000) /* -4.0 */
332 setFixed(PhysReg{247});
333 else { /* Literal Constant */
334 assert(!is64bit && "attempt to create a 64-bit literal constant");
335 setFixed(PhysReg{255});
336 }
337 };
338 explicit Operand(uint64_t v) noexcept
339 {
340 isConstant_ = true;
341 is64BitConst_ = true;
342 if (v <= 64) {
343 data_.i = (uint32_t) v;
344 setFixed(PhysReg{128 + (uint32_t) v});
345 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
346 data_.i = (uint32_t) v;
347 setFixed(PhysReg{192 - (uint32_t) v});
348 } else if (v == 0x3FE0000000000000) { /* 0.5 */
349 data_.i = 0x3f000000;
350 setFixed(PhysReg{240});
351 } else if (v == 0xBFE0000000000000) { /* -0.5 */
352 data_.i = 0xbf000000;
353 setFixed(PhysReg{241});
354 } else if (v == 0x3FF0000000000000) { /* 1.0 */
355 data_.i = 0x3f800000;
356 setFixed(PhysReg{242});
357 } else if (v == 0xBFF0000000000000) { /* -1.0 */
358 data_.i = 0xbf800000;
359 setFixed(PhysReg{243});
360 } else if (v == 0x4000000000000000) { /* 2.0 */
361 data_.i = 0x40000000;
362 setFixed(PhysReg{244});
363 } else if (v == 0xC000000000000000) { /* -2.0 */
364 data_.i = 0xc0000000;
365 setFixed(PhysReg{245});
366 } else if (v == 0x4010000000000000) { /* 4.0 */
367 data_.i = 0x40800000;
368 setFixed(PhysReg{246});
369 } else if (v == 0xC010000000000000) { /* -4.0 */
370 data_.i = 0xc0800000;
371 setFixed(PhysReg{247});
372 } else { /* Literal Constant: we don't know if it is a long or double.*/
373 isConstant_ = 0;
374 assert(false && "attempt to create a 64-bit literal constant");
375 }
376 };
377 explicit Operand(RegClass type) noexcept
378 {
379 isUndef_ = true;
380 data_.temp = Temp(0, type);
381 setFixed(PhysReg{128});
382 };
383 explicit Operand(PhysReg reg, RegClass type) noexcept
384 {
385 data_.temp = Temp(0, type);
386 setFixed(reg);
387 }
388
389 constexpr bool isTemp() const noexcept
390 {
391 return isTemp_;
392 }
393
394 constexpr void setTemp(Temp t) noexcept {
395 assert(!isConstant_);
396 isTemp_ = true;
397 data_.temp = t;
398 }
399
400 constexpr Temp getTemp() const noexcept
401 {
402 return data_.temp;
403 }
404
405 constexpr uint32_t tempId() const noexcept
406 {
407 return data_.temp.id();
408 }
409
410 constexpr bool hasRegClass() const noexcept
411 {
412 return isTemp() || isUndefined();
413 }
414
415 constexpr RegClass regClass() const noexcept
416 {
417 return data_.temp.regClass();
418 }
419
420 constexpr unsigned size() const noexcept
421 {
422 if (isConstant())
423 return is64BitConst_ ? 2 : 1;
424 else
425 return data_.temp.size();
426 }
427
428 constexpr bool isFixed() const noexcept
429 {
430 return isFixed_;
431 }
432
433 constexpr PhysReg physReg() const noexcept
434 {
435 return reg_;
436 }
437
438 constexpr void setFixed(PhysReg reg) noexcept
439 {
440 isFixed_ = reg != unsigned(-1);
441 reg_ = reg;
442 }
443
444 constexpr bool isConstant() const noexcept
445 {
446 return isConstant_;
447 }
448
449 constexpr bool isLiteral() const noexcept
450 {
451 return isConstant() && reg_ == 255;
452 }
453
454 constexpr bool isUndefined() const noexcept
455 {
456 return isUndef_;
457 }
458
459 constexpr uint32_t constantValue() const noexcept
460 {
461 return data_.i;
462 }
463
464 constexpr bool constantEquals(uint32_t cmp) const noexcept
465 {
466 return isConstant() && constantValue() == cmp;
467 }
468
469 constexpr void setKill(bool flag) noexcept
470 {
471 isKill_ = flag;
472 if (!flag)
473 setFirstKill(false);
474 }
475
476 constexpr bool isKill() const noexcept
477 {
478 return isKill_ || isFirstKill();
479 }
480
481 constexpr void setFirstKill(bool flag) noexcept
482 {
483 isFirstKill_ = flag;
484 if (flag)
485 setKill(flag);
486 }
487
488 /* When there are multiple operands killing the same temporary,
489 * isFirstKill() is only returns true for the first one. */
490 constexpr bool isFirstKill() const noexcept
491 {
492 return isFirstKill_;
493 }
494
495 private:
496 union {
497 uint32_t i;
498 float f;
499 Temp temp = Temp(0, s1);
500 } data_;
501 PhysReg reg_;
502 union {
503 struct {
504 uint8_t isTemp_:1;
505 uint8_t isFixed_:1;
506 uint8_t isConstant_:1;
507 uint8_t isKill_:1;
508 uint8_t isUndef_:1;
509 uint8_t isFirstKill_:1;
510 uint8_t is64BitConst_:1;
511 };
512 /* can't initialize bit-fields in c++11, so work around using a union */
513 uint8_t control_ = 0;
514 };
515 };
516
517 /**
518 * Definition Class
519 * Definitions are the results of Instructions
520 * and refer to temporary virtual registers
521 * which are later mapped to physical registers
522 */
523 class Definition final
524 {
525 public:
526 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
527 Definition(uint32_t index, RegClass type) noexcept
528 : temp(index, type) {}
529 explicit Definition(Temp tmp) noexcept
530 : temp(tmp) {}
531 Definition(PhysReg reg, RegClass type) noexcept
532 : temp(Temp(0, type))
533 {
534 setFixed(reg);
535 }
536 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
537 : temp(Temp(tmpId, type))
538 {
539 setFixed(reg);
540 }
541
542 constexpr bool isTemp() const noexcept
543 {
544 return tempId() > 0;
545 }
546
547 constexpr Temp getTemp() const noexcept
548 {
549 return temp;
550 }
551
552 constexpr uint32_t tempId() const noexcept
553 {
554 return temp.id();
555 }
556
557 constexpr void setTemp(Temp t) noexcept {
558 temp = t;
559 }
560
561 constexpr RegClass regClass() const noexcept
562 {
563 return temp.regClass();
564 }
565
566 constexpr unsigned size() const noexcept
567 {
568 return temp.size();
569 }
570
571 constexpr bool isFixed() const noexcept
572 {
573 return isFixed_;
574 }
575
576 constexpr PhysReg physReg() const noexcept
577 {
578 return reg_;
579 }
580
581 constexpr void setFixed(PhysReg reg) noexcept
582 {
583 isFixed_ = 1;
584 reg_ = reg;
585 }
586
587 constexpr void setHint(PhysReg reg) noexcept
588 {
589 hasHint_ = 1;
590 reg_ = reg;
591 }
592
593 constexpr bool hasHint() const noexcept
594 {
595 return hasHint_;
596 }
597
598 constexpr void setKill(bool flag) noexcept
599 {
600 isKill_ = flag;
601 }
602
603 constexpr bool isKill() const noexcept
604 {
605 return isKill_;
606 }
607
608 private:
609 Temp temp = Temp(0, s1);
610 PhysReg reg_;
611 union {
612 struct {
613 uint8_t isFixed_:1;
614 uint8_t hasHint_:1;
615 uint8_t isKill_:1;
616 };
617 /* can't initialize bit-fields in c++11, so work around using a union */
618 uint8_t control_ = 0;
619 };
620 };
621
622 class Block;
623
624 struct Instruction {
625 aco_opcode opcode;
626 Format format;
627 uint32_t pass_flags;
628
629 aco::span<Operand> operands;
630 aco::span<Definition> definitions;
631
632 constexpr bool isVALU() const noexcept
633 {
634 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
635 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
636 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
637 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
638 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
639 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
640 }
641
642 constexpr bool isSALU() const noexcept
643 {
644 return format == Format::SOP1 ||
645 format == Format::SOP2 ||
646 format == Format::SOPC ||
647 format == Format::SOPK ||
648 format == Format::SOPP;
649 }
650
651 constexpr bool isVMEM() const noexcept
652 {
653 return format == Format::MTBUF ||
654 format == Format::MUBUF ||
655 format == Format::MIMG;
656 }
657
658 constexpr bool isDPP() const noexcept
659 {
660 return (uint16_t) format & (uint16_t) Format::DPP;
661 }
662
663 constexpr bool isVOP3() const noexcept
664 {
665 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
666 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
667 format == Format::VOP3P;
668 }
669
670 constexpr bool isSDWA() const noexcept
671 {
672 return (uint16_t) format & (uint16_t) Format::SDWA;
673 }
674
675 constexpr bool isFlatOrGlobal() const noexcept
676 {
677 return format == Format::FLAT || format == Format::GLOBAL;
678 }
679
680 constexpr bool usesModifiers() const noexcept;
681
682 constexpr bool reads_exec() const noexcept
683 {
684 for (const Operand& op : operands) {
685 if (op.isFixed() && op.physReg() == exec)
686 return true;
687 }
688 return false;
689 }
690 };
691
692 struct SOPK_instruction : public Instruction {
693 uint16_t imm;
694 };
695
696 struct SOPP_instruction : public Instruction {
697 uint32_t imm;
698 int block;
699 };
700
701 struct SOPC_instruction : public Instruction {
702 };
703
704 struct SOP1_instruction : public Instruction {
705 };
706
707 struct SOP2_instruction : public Instruction {
708 };
709
710 /**
711 * Scalar Memory Format:
712 * For s_(buffer_)load_dword*:
713 * Operand(0): SBASE - SGPR-pair which provides base address
714 * Operand(1): Offset - immediate (un)signed offset or SGPR
715 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
716 * Operand(n-1): SOffset - SGPR offset (Vega only)
717 *
718 * Having no operands is also valid for instructions such as s_dcache_inv.
719 *
720 */
721 struct SMEM_instruction : public Instruction {
722 bool glc : 1; /* VI+: globally coherent */
723 bool dlc : 1; /* NAVI: device level coherent */
724 bool nv : 1; /* VEGA only: Non-volatile */
725 bool can_reorder : 1;
726 bool disable_wqm : 1;
727 barrier_interaction barrier;
728 };
729
730 struct VOP1_instruction : public Instruction {
731 };
732
733 struct VOP2_instruction : public Instruction {
734 };
735
736 struct VOPC_instruction : public Instruction {
737 };
738
739 struct VOP3A_instruction : public Instruction {
740 bool abs[3];
741 bool neg[3];
742 uint8_t opsel : 4;
743 uint8_t omod : 2;
744 bool clamp : 1;
745 };
746
747 /**
748 * Data Parallel Primitives Format:
749 * This format can be used for VOP1, VOP2 or VOPC instructions.
750 * The swizzle applies to the src0 operand.
751 *
752 */
753 struct DPP_instruction : public Instruction {
754 bool abs[2];
755 bool neg[2];
756 uint16_t dpp_ctrl;
757 uint8_t row_mask : 4;
758 uint8_t bank_mask : 4;
759 bool bound_ctrl : 1;
760 };
761
762 struct Interp_instruction : public Instruction {
763 uint8_t attribute;
764 uint8_t component;
765 };
766
767 /**
768 * Local and Global Data Sharing instructions
769 * Operand(0): ADDR - VGPR which supplies the address.
770 * Operand(1): DATA0 - First data VGPR.
771 * Operand(2): DATA1 - Second data VGPR.
772 * Operand(n-1): M0 - LDS size.
773 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
774 *
775 */
776 struct DS_instruction : public Instruction {
777 int16_t offset0;
778 int8_t offset1;
779 bool gds;
780 };
781
782 /**
783 * Vector Memory Untyped-buffer Instructions
784 * Operand(0): VADDR - Address source. Can carry an index and/or offset
785 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
786 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
787 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
788 *
789 */
790 struct MUBUF_instruction : public Instruction {
791 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
792 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
793 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
794 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
795 bool glc : 1; /* globally coherent */
796 bool dlc : 1; /* NAVI: device level coherent */
797 bool slc : 1; /* system level coherent */
798 bool tfe : 1; /* texture fail enable */
799 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
800 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
801 bool can_reorder : 1;
802 barrier_interaction barrier;
803 };
804
805 /**
806 * Vector Memory Typed-buffer Instructions
807 * Operand(0): VADDR - Address source. Can carry an index and/or offset
808 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
809 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
810 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
811 *
812 */
813 struct MTBUF_instruction : public Instruction {
814 uint16_t offset; /* Unsigned byte offset - 12 bit */
815 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
816 uint8_t nfmt : 3; /* Numeric format of data in memory */
817 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
818 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
819 bool glc : 1; /* globally coherent */
820 bool dlc : 1; /* NAVI: device level coherent */
821 bool slc : 1; /* system level coherent */
822 bool tfe : 1; /* texture fail enable */
823 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
824 bool can_reorder : 1;
825 barrier_interaction barrier;
826 };
827
828 /**
829 * Vector Memory Image Instructions
830 * Operand(0): VADDR - Address source. Can carry an offset or an index.
831 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
832 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
833 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
834 *
835 */
836 struct MIMG_instruction : public Instruction {
837 uint8_t dmask; /* Data VGPR enable mask */
838 uint8_t dim : 3; /* NAVI: dimensionality */
839 bool unrm : 1; /* Force address to be un-normalized */
840 bool dlc : 1; /* NAVI: device level coherent */
841 bool glc : 1; /* globally coherent */
842 bool slc : 1; /* system level coherent */
843 bool tfe : 1; /* texture fail enable */
844 bool da : 1; /* declare an array */
845 bool lwe : 1; /* Force data to be un-normalized */
846 bool r128 : 1; /* NAVI: Texture resource size */
847 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
848 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
849 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
850 bool can_reorder : 1;
851 barrier_interaction barrier;
852 };
853
854 /**
855 * Flat/Scratch/Global Instructions
856 * Operand(0): ADDR
857 * Operand(1): SADDR
858 * Operand(2) / Definition(0): DATA/VDST
859 *
860 */
861 struct FLAT_instruction : public Instruction {
862 uint16_t offset; /* Vega/Navi only */
863 bool slc : 1; /* system level coherent */
864 bool glc : 1; /* globally coherent */
865 bool dlc : 1; /* NAVI: device level coherent */
866 bool lds : 1;
867 bool nv : 1;
868 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
869 bool can_reorder : 1;
870 barrier_interaction barrier;
871 };
872
873 struct Export_instruction : public Instruction {
874 uint8_t enabled_mask;
875 uint8_t dest;
876 bool compressed : 1;
877 bool done : 1;
878 bool valid_mask : 1;
879 };
880
881 struct Pseudo_instruction : public Instruction {
882 bool tmp_in_scc;
883 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
884 };
885
886 struct Pseudo_branch_instruction : public Instruction {
887 /* target[0] is the block index of the branch target.
888 * For conditional branches, target[1] contains the fall-through alternative.
889 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
890 */
891 uint32_t target[2];
892 };
893
894 struct Pseudo_barrier_instruction : public Instruction {
895 };
896
/* Operations of the reduction pseudo instructions. Each arithmetic/logic
 * operation comes as a 32-bit/64-bit pair, in this order, starting at 0. */
enum ReduceOp {
   iadd32,
   iadd64,
   imul32,
   imul64,
   fadd32,
   fadd64,
   fmul32,
   fmul64,
   imin32,
   imin64,
   imax32,
   imax64,
   umin32,
   umin64,
   umax32,
   umax64,
   fmin32,
   fmin64,
   fmax32,
   fmax64,
   iand32,
   iand64,
   ior32,
   ior64,
   ixor32,
   ixor64,
   gfx10_wave64_bpermute
};
913
914 /**
915 * Subgroup Reduction Instructions, everything except for the data to be
916 * reduced and the result as inserted by setup_reduce_temp().
917 * Operand(0): data to be reduced
918 * Operand(1): reduce temporary
919 * Operand(2): vector temporary
920 * Definition(0): result
921 * Definition(1): scalar temporary
922 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
923 * Definition(3): scc clobber
924 * Definition(4): vcc clobber
925 *
926 */
927 struct Pseudo_reduction_instruction : public Instruction {
928 ReduceOp reduce_op;
929 unsigned cluster_size; // must be 0 for scans
930 };
931
/* Deleter which releases the pointed-to memory with free(). */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};

/* Owning pointer whose memory is released through instr_deleter_functor. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
940
941 template<typename T>
942 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
943 {
944 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
945 char *data = (char*) calloc(1, size);
946 T* inst = (T*) data;
947
948 inst->opcode = opcode;
949 inst->format = format;
950
951 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
952 inst->operands = aco::span<Operand>(operands_offset, num_operands);
953 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
954 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
955
956 return inst;
957 }
958
959 constexpr bool Instruction::usesModifiers() const noexcept
960 {
961 if (isDPP() || isSDWA())
962 return true;
963 if (!isVOP3())
964 return false;
965 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
966 for (unsigned i = 0; i < operands.size(); i++) {
967 if (vop3->abs[i] || vop3->neg[i])
968 return true;
969 }
970 return vop3->opsel || vop3->clamp || vop3->omod;
971 }
972
973 constexpr bool is_phi(Instruction* instr)
974 {
975 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
976 }
977
978 static inline bool is_phi(aco_ptr<Instruction>& instr)
979 {
980 return is_phi(instr.get());
981 }
982
983 barrier_interaction get_barrier_interaction(Instruction* instr);
984
985 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
986
/* Bit flags describing a block; every enumerator occupies its own bit. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 0x0001,
   block_kind_top_level = 0x0002,
   block_kind_loop_preheader = 0x0004,
   block_kind_loop_header = 0x0008,
   block_kind_loop_exit = 0x0010,
   block_kind_continue = 0x0020,
   block_kind_break = 0x0040,
   block_kind_continue_or_break = 0x0080,
   block_kind_discard = 0x0100,
   block_kind_branch = 0x0200,
   block_kind_merge = 0x0400,
   block_kind_invert = 0x0800,
   block_kind_uses_discard_if = 0x1000,
   block_kind_needs_lowering = 0x2000,
   block_kind_uses_demote = 0x4000,
   block_kind_export_end = 0x8000,
};
1007
1008
1009 struct RegisterDemand {
1010 constexpr RegisterDemand() = default;
1011 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1012 : vgpr{v}, sgpr{s} {}
1013 int16_t vgpr = 0;
1014 int16_t sgpr = 0;
1015
1016 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1017 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1018 }
1019
1020 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1021 return vgpr > other.vgpr || sgpr > other.sgpr;
1022 }
1023
1024 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1025 if (t.type() == RegType::sgpr)
1026 return RegisterDemand( vgpr, sgpr + t.size() );
1027 else
1028 return RegisterDemand( vgpr + t.size(), sgpr );
1029 }
1030
1031 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1032 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1033 }
1034
1035 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1036 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1037 }
1038
1039 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1040 vgpr += other.vgpr;
1041 sgpr += other.sgpr;
1042 return *this;
1043 }
1044
1045 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1046 vgpr -= other.vgpr;
1047 sgpr -= other.sgpr;
1048 return *this;
1049 }
1050
1051 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1052 if (t.type() == RegType::sgpr)
1053 sgpr += t.size();
1054 else
1055 vgpr += t.size();
1056 return *this;
1057 }
1058
1059 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1060 if (t.type() == RegType::sgpr)
1061 sgpr -= t.size();
1062 else
1063 vgpr -= t.size();
1064 return *this;
1065 }
1066
1067 constexpr void update(const RegisterDemand other) noexcept {
1068 vgpr = std::max(vgpr, other.vgpr);
1069 sgpr = std::max(sgpr, other.sgpr);
1070 }
1071
1072 };
1073
1074 /* CFG */
1075 struct Block {
1076 float_mode fp_mode;
1077 unsigned index;
1078 unsigned offset = 0;
1079 std::vector<aco_ptr<Instruction>> instructions;
1080 std::vector<unsigned> logical_preds;
1081 std::vector<unsigned> linear_preds;
1082 std::vector<unsigned> logical_succs;
1083 std::vector<unsigned> linear_succs;
1084 RegisterDemand register_demand = RegisterDemand();
1085 uint16_t loop_nest_depth = 0;
1086 uint16_t kind = 0;
1087 int logical_idom = -1;
1088 int linear_idom = -1;
1089 Temp live_out_exec = Temp();
1090
1091 /* this information is needed for predecessors to blocks with phis when
1092 * moving out of ssa */
1093 bool scc_live_out = false;
1094 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1095
1096 Block(unsigned idx) : index(idx) {}
1097 Block() : index(0) {}
1098 };
1099
/* Bit set describing which API (software) stages a shader contains and on
 * which hardware stage it runs: bits 0-6 are software stages, bits 7-13
 * are hardware stages. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9;
static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 11;
static constexpr Stage hw_fs = 1 << 12;
static constexpr Stage hw_cs = 1 << 13;
static constexpr Stage hw_mask = 0x7f << 7;

/* the two bit ranges must never overlap, otherwise masking a Stage with
 * sw_mask/hw_mask would no longer separate the two halves */
static_assert((sw_mask & hw_mask) == 0, "software and hardware stage bits overlap");

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
/* Runs on the ES hardware stage, exactly like vertex_es. This used to be
 * OR'd with hw_gs, which contradicted both the name and the comment. */
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;

static_assert((tess_eval_es & hw_mask) == hw_es, "tess_eval_es must run on the ES hardware stage");
1143
1144 class Program final {
1145 public:
1146 float_mode next_fp_mode;
1147 std::vector<Block> blocks;
1148 RegisterDemand max_reg_demand = RegisterDemand();
1149 uint16_t num_waves = 0;
1150 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1151 ac_shader_config* config;
1152 struct radv_shader_info *info;
1153 enum chip_class chip_class;
1154 enum radeon_family family;
1155 unsigned wave_size;
1156 RegClass lane_mask;
1157 Stage stage; /* Stage */
1158 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1159 bool needs_wqm = false; /* there exists a p_wqm instruction */
1160 bool wb_smem_l1_on_end = false;
1161
1162 std::vector<uint8_t> constant_data;
1163 Temp private_segment_buffer;
1164 Temp scratch_offset;
1165
1166 uint16_t min_waves = 0;
1167 uint16_t lds_alloc_granule;
1168 uint32_t lds_limit; /* in bytes */
1169 uint16_t vgpr_limit;
1170 uint16_t sgpr_limit;
1171 uint16_t physical_sgprs;
1172 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1173 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1174
1175 bool needs_vcc = false;
1176 bool needs_xnack_mask = false;
1177 bool needs_flat_scr = false;
1178
1179 uint32_t allocateId()
1180 {
1181 assert(allocationID <= 16777215);
1182 return allocationID++;
1183 }
1184
1185 uint32_t peekAllocationId()
1186 {
1187 return allocationID;
1188 }
1189
1190 void setAllocationId(uint32_t id)
1191 {
1192 allocationID = id;
1193 }
1194
1195 Block* create_and_insert_block() {
1196 blocks.emplace_back(blocks.size());
1197 blocks.back().fp_mode = next_fp_mode;
1198 return &blocks.back();
1199 }
1200
1201 Block* insert_block(Block&& block) {
1202 block.index = blocks.size();
1203 block.fp_mode = next_fp_mode;
1204 blocks.emplace_back(std::move(block));
1205 return &blocks.back();
1206 }
1207
1208 private:
1209 uint32_t allocationID = 1;
1210 };
1211
1212 struct live {
1213 /* live temps out per block */
1214 std::vector<std::set<Temp>> live_out;
1215 /* register demand (sgpr/vgpr) per instruction per block */
1216 std::vector<std::vector<RegisterDemand>> register_demand;
1217 };
1218
1219 void select_program(Program *program,
1220 unsigned shader_count,
1221 struct nir_shader *const *shaders,
1222 ac_shader_config* config,
1223 struct radv_shader_args *args);
1224 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1225 ac_shader_config* config,
1226 struct radv_shader_args *args);
1227
1228 void lower_wqm(Program* program, live& live_vars,
1229 const struct radv_nir_compiler_options *options);
1230 void lower_bool_phis(Program* program);
1231 void calc_min_waves(Program* program);
1232 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1233 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1234 std::vector<uint16_t> dead_code_analysis(Program *program);
1235 void dominator_tree(Program* program);
1236 void insert_exec_mask(Program *program);
1237 void value_numbering(Program* program);
1238 void optimize(Program* program);
1239 void setup_reduce_temp(Program* program);
1240 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1241 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1242 void ssa_elimination(Program* program);
1243 void lower_to_hw_instr(Program* program);
1244 void schedule_program(Program* program, live& live_vars);
1245 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1246 void insert_wait_states(Program* program);
1247 void insert_NOPs(Program* program);
1248 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1249 void print_asm(Program *program, std::vector<uint32_t>& binary,
1250 unsigned exec_size, std::ostream& out);
1251 void validate(Program* program, FILE *output);
1252 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1253 #ifndef NDEBUG
1254 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1255 #else
1256 #define perfwarn(program, cond, msg, ...) do {} while(0)
1257 #endif
1258
1259 void aco_print_instr(Instruction *instr, FILE *output);
1260 void aco_print_program(Program *program, FILE *output);
1261
1262 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1263 uint16_t get_extra_sgprs(Program *program);
1264
1265 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1266 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1267 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1268
1269 /* return number of addressable sgprs/vgprs for max_waves */
1270 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1271 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1272
1273 typedef struct {
1274 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1275 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1276 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1277 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1278 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1279 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1280 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1281 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1282 } Info;
1283
1284 extern const Info instr_info;
1285
1286 }
1287
1288 #endif /* ACO_IR_H */
1289