aco: Setup correct HW stages when tessellation is used.
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
47 enum {
48 DEBUG_VALIDATE = 0x1,
49 DEBUG_VALIDATE_RA = 0x2,
50 DEBUG_PERFWARN = 0x4,
51 };
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in the VOP3A encoding.
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitives.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
62 enum class Format : std::uint16_t {
63 /* Pseudo Instruction Format */
64 PSEUDO = 0,
65 /* Scalar ALU & Control Formats */
66 SOP1 = 1,
67 SOP2 = 2,
68 SOPK = 3,
69 SOPP = 4,
70 SOPC = 5,
71 /* Scalar Memory Format */
72 SMEM = 6,
73 /* LDS/GDS Format */
74 DS = 8,
75 /* Vector Memory Buffer Formats */
76 MTBUF = 9,
77 MUBUF = 10,
78 /* Vector Memory Image Format */
79 MIMG = 11,
80 /* Export Format */
81 EXP = 12,
82 /* Flat Formats */
83 FLAT = 13,
84 GLOBAL = 14,
85 SCRATCH = 15,
86
87 PSEUDO_BRANCH = 16,
88 PSEUDO_BARRIER = 17,
89 PSEUDO_REDUCTION = 18,
90
91 /* Vector ALU Formats */
92 VOP1 = 1 << 8,
93 VOP2 = 1 << 9,
94 VOPC = 1 << 10,
95 VOP3 = 1 << 11,
96 VOP3A = 1 << 11,
97 VOP3B = 1 << 11,
98 VOP3P = 1 << 12,
99 /* Vector Parameter Interpolation Format */
100 VINTRP = 1 << 13,
101 DPP = 1 << 14,
102 SDWA = 1 << 15,
103 };
104
105 enum barrier_interaction : uint8_t {
106 barrier_none = 0,
107 barrier_buffer = 0x1,
108 barrier_image = 0x2,
109 barrier_atomic = 0x4,
110 barrier_shared = 0x8,
111 /* used for geometry shaders to ensure vertex data writes are finished before
112 * the GS_DONE s_sendmsg. */
113 barrier_gs_data = 0x10,
114 /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
115 barrier_gs_sendmsg = 0x20,
116 barrier_count = 6,
117 };
118
119 enum fp_round {
120 fp_round_ne = 0,
121 fp_round_pi = 1,
122 fp_round_ni = 2,
123 fp_round_tz = 3,
124 };
125
126 enum fp_denorm {
127 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
128 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
129 fp_denorm_flush = 0x0,
130 fp_denorm_keep = 0x3,
131 };
132
133 struct float_mode {
134 /* matches encoding of the MODE register */
135 union {
136 struct {
137 fp_round round32:2;
138 fp_round round16_64:2;
139 unsigned denorm32:2;
140 unsigned denorm16_64:2;
141 };
142 uint8_t val = 0;
143 };
144 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
145 bool preserve_signed_zero_inf_nan32:1;
146 bool preserve_signed_zero_inf_nan16_64:1;
147 /* if false, optimizations which may remove denormal flushing can be done */
148 bool must_flush_denorms32:1;
149 bool must_flush_denorms16_64:1;
150 bool care_about_round32:1;
151 bool care_about_round16_64:1;
152
153 /* Returns true if instructions using the mode "other" can safely use the
154 * current one instead. */
155 bool canReplace(float_mode other) const noexcept {
156 return val == other.val &&
157 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
158 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
159 (must_flush_denorms32 || !other.must_flush_denorms32) &&
160 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
161 (care_about_round32 || !other.care_about_round32) &&
162 (care_about_round16_64 || !other.care_about_round16_64);
163 }
164 };
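/* Illustrative sketch (the helper name is hypothetical, only for demonstration):
 * code compiled for the mode "other" may be placed in a block using the mode
 * "current" exactly when current.canReplace(other) holds. */
inline bool example_fp_modes_compatible(const float_mode &current, const float_mode &other)
{
   return current.canReplace(other);
}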
165
166 constexpr Format asVOP3(Format format) {
167 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
168 };
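/* Example (for illustration only): asVOP3() simply ORs the VOP3 encoding bit
 * into the base format, so the combined value keeps the original format bit. */
static_assert((uint32_t) asVOP3(Format::VOP2) ==
              ((uint32_t) Format::VOP2 | (uint32_t) Format::VOP3),
              "VOP2 | VOP3 represents a VOP2 instruction in the VOP3 encoding");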
169
170 enum class RegType {
171 none = 0,
172 sgpr,
173 vgpr,
174 linear_vgpr,
175 };
176
177 struct RegClass {
178
179 enum RC : uint8_t {
180 s1 = 1,
181 s2 = 2,
182 s3 = 3,
183 s4 = 4,
184 s6 = 6,
185 s8 = 8,
186 s16 = 16,
187 v1 = s1 | (1 << 5),
188 v2 = s2 | (1 << 5),
189 v3 = s3 | (1 << 5),
190 v4 = s4 | (1 << 5),
191 v5 = 5 | (1 << 5),
192 v6 = 6 | (1 << 5),
193 v7 = 7 | (1 << 5),
194 v8 = 8 | (1 << 5),
195 /* these are used for WWM and spills to vgpr */
196 v1_linear = v1 | (1 << 6),
197 v2_linear = v2 | (1 << 6),
198 };
199
200 RegClass() = default;
201 constexpr RegClass(RC rc)
202 : rc(rc) {}
203 constexpr RegClass(RegType type, unsigned size)
204 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
205
206 constexpr operator RC() const { return rc; }
207 explicit operator bool() = delete;
208
209 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
210 constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
211 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
212 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
213
214 private:
215 RC rc;
216 };
217
218 /* transitional helper expressions */
219 static constexpr RegClass s1{RegClass::s1};
220 static constexpr RegClass s2{RegClass::s2};
221 static constexpr RegClass s3{RegClass::s3};
222 static constexpr RegClass s4{RegClass::s4};
223 static constexpr RegClass s8{RegClass::s8};
224 static constexpr RegClass s16{RegClass::s16};
225 static constexpr RegClass v1{RegClass::v1};
226 static constexpr RegClass v2{RegClass::v2};
227 static constexpr RegClass v3{RegClass::v3};
228 static constexpr RegClass v4{RegClass::v4};
229 static constexpr RegClass v5{RegClass::v5};
230 static constexpr RegClass v6{RegClass::v6};
231 static constexpr RegClass v7{RegClass::v7};
232 static constexpr RegClass v8{RegClass::v8};
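/* A few sanity examples (illustrative only): the helper constants encode both
 * the register count and the register file. */
static_assert(v2.size() == 2 && v2.type() == RegType::vgpr, "v2 is a 2-dword VGPR class");
static_assert(s4.size() == 4 && s4.type() == RegType::sgpr, "s4 is a 4-dword SGPR class");
static_assert(v1.as_linear().is_linear(), "as_linear() yields a linear VGPR class");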
233
234 /**
235 * Temp Class
236 * Each temporary virtual register has a
237 * register class (i.e. size and type)
238 * and an SSA id.
239 */
240 struct Temp {
241 Temp() = default;
242 constexpr Temp(uint32_t id, RegClass cls) noexcept
243 : id_(id), reg_class(cls) {}
244
245 constexpr uint32_t id() const noexcept { return id_; }
246 constexpr RegClass regClass() const noexcept { return reg_class; }
247
248 constexpr unsigned size() const noexcept { return reg_class.size(); }
249 constexpr RegType type() const noexcept { return reg_class.type(); }
250 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
251
252 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
253 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
254 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
255
256 private:
257 uint32_t id_:24;
258 RegClass reg_class;
259 };
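/* Example (illustrative only): a Temp pairs an SSA id with a register class,
 * so its size/type queries simply forward to that class. */
static_assert(Temp(1, v2).size() == 2, "a v2 temporary occupies two VGPRs");
static_assert(Temp(1, s2).regClass() == s2, "regClass() returns the class the Temp was created with");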
260
261 /**
262 * PhysReg
263 * Represents the physical register for each
264 * Operand and Definition.
265 */
266 struct PhysReg {
267 constexpr PhysReg() = default;
268 explicit constexpr PhysReg(unsigned r) : reg(r) {}
269 constexpr operator unsigned() const { return reg; }
270
271 uint16_t reg = 0;
272 };
273
274 /* helper expressions for special registers */
275 static constexpr PhysReg m0{124};
276 static constexpr PhysReg vcc{106};
277 static constexpr PhysReg vcc_hi{107};
278 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
279 static constexpr PhysReg exec{126};
280 static constexpr PhysReg exec_lo{126};
281 static constexpr PhysReg exec_hi{127};
282 static constexpr PhysReg vccz{251};
283 static constexpr PhysReg execz{252};
284 static constexpr PhysReg scc{253};
285
286 /**
287 * Operand Class
288 * Initially, each Operand refers to either
289 * a temporary virtual register
290 * or a constant value.
291 * Temporary registers get mapped to physical registers during RA.
292 * Constant values are inlined into the instruction sequence.
293 */
294 class Operand final
295 {
296 public:
297 constexpr Operand()
298 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
299 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
300
301 explicit Operand(Temp r) noexcept
302 {
303 data_.temp = r;
304 if (r.id()) {
305 isTemp_ = true;
306 } else {
307 isUndef_ = true;
308 setFixed(PhysReg{128});
309 }
310 };
311 explicit Operand(uint32_t v, bool is64bit = false) noexcept
312 {
313 data_.i = v;
314 isConstant_ = true;
315 is64BitConst_ = is64bit;
316 if (v <= 64)
317 setFixed(PhysReg{128 + v});
318 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
319 setFixed(PhysReg{192 - v});
320 else if (v == 0x3f000000) /* 0.5 */
321 setFixed(PhysReg{240});
322 else if (v == 0xbf000000) /* -0.5 */
323 setFixed(PhysReg{241});
324 else if (v == 0x3f800000) /* 1.0 */
325 setFixed(PhysReg{242});
326 else if (v == 0xbf800000) /* -1.0 */
327 setFixed(PhysReg{243});
328 else if (v == 0x40000000) /* 2.0 */
329 setFixed(PhysReg{244});
330 else if (v == 0xc0000000) /* -2.0 */
331 setFixed(PhysReg{245});
332 else if (v == 0x40800000) /* 4.0 */
333 setFixed(PhysReg{246});
334 else if (v == 0xc0800000) /* -4.0 */
335 setFixed(PhysReg{247});
336 else { /* Literal Constant */
337 assert(!is64bit && "attempt to create a 64-bit literal constant");
338 setFixed(PhysReg{255});
339 }
340 };
341 explicit Operand(uint64_t v) noexcept
342 {
343 isConstant_ = true;
344 is64BitConst_ = true;
345 if (v <= 64) {
346 data_.i = (uint32_t) v;
347 setFixed(PhysReg{128 + (uint32_t) v});
348 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
349 data_.i = (uint32_t) v;
350 setFixed(PhysReg{192 - (uint32_t) v});
351 } else if (v == 0x3FE0000000000000) { /* 0.5 */
352 data_.i = 0x3f000000;
353 setFixed(PhysReg{240});
354 } else if (v == 0xBFE0000000000000) { /* -0.5 */
355 data_.i = 0xbf000000;
356 setFixed(PhysReg{241});
357 } else if (v == 0x3FF0000000000000) { /* 1.0 */
358 data_.i = 0x3f800000;
359 setFixed(PhysReg{242});
360 } else if (v == 0xBFF0000000000000) { /* -1.0 */
361 data_.i = 0xbf800000;
362 setFixed(PhysReg{243});
363 } else if (v == 0x4000000000000000) { /* 2.0 */
364 data_.i = 0x40000000;
365 setFixed(PhysReg{244});
366 } else if (v == 0xC000000000000000) { /* -2.0 */
367 data_.i = 0xc0000000;
368 setFixed(PhysReg{245});
369 } else if (v == 0x4010000000000000) { /* 4.0 */
370 data_.i = 0x40800000;
371 setFixed(PhysReg{246});
372 } else if (v == 0xC010000000000000) { /* -4.0 */
373 data_.i = 0xc0800000;
374 setFixed(PhysReg{247});
375 } else { /* Literal Constant: we don't know if it is a long or double.*/
376 isConstant_ = 0;
377 assert(false && "attempt to create a 64-bit literal constant");
378 }
379 };
380 explicit Operand(RegClass type) noexcept
381 {
382 isUndef_ = true;
383 data_.temp = Temp(0, type);
384 setFixed(PhysReg{128});
385 };
386 explicit Operand(PhysReg reg, RegClass type) noexcept
387 {
388 data_.temp = Temp(0, type);
389 setFixed(reg);
390 }
391
392 constexpr bool isTemp() const noexcept
393 {
394 return isTemp_;
395 }
396
397 constexpr void setTemp(Temp t) noexcept {
398 assert(!isConstant_);
399 isTemp_ = true;
400 data_.temp = t;
401 }
402
403 constexpr Temp getTemp() const noexcept
404 {
405 return data_.temp;
406 }
407
408 constexpr uint32_t tempId() const noexcept
409 {
410 return data_.temp.id();
411 }
412
413 constexpr bool hasRegClass() const noexcept
414 {
415 return isTemp() || isUndefined();
416 }
417
418 constexpr RegClass regClass() const noexcept
419 {
420 return data_.temp.regClass();
421 }
422
423 constexpr unsigned size() const noexcept
424 {
425 if (isConstant())
426 return is64BitConst_ ? 2 : 1;
427 else
428 return data_.temp.size();
429 }
430
431 constexpr bool isFixed() const noexcept
432 {
433 return isFixed_;
434 }
435
436 constexpr PhysReg physReg() const noexcept
437 {
438 return reg_;
439 }
440
441 constexpr void setFixed(PhysReg reg) noexcept
442 {
443 isFixed_ = reg != unsigned(-1);
444 reg_ = reg;
445 }
446
447 constexpr bool isConstant() const noexcept
448 {
449 return isConstant_;
450 }
451
452 constexpr bool isLiteral() const noexcept
453 {
454 return isConstant() && reg_ == 255;
455 }
456
457 constexpr bool isUndefined() const noexcept
458 {
459 return isUndef_;
460 }
461
462 constexpr uint32_t constantValue() const noexcept
463 {
464 return data_.i;
465 }
466
467 constexpr bool constantEquals(uint32_t cmp) const noexcept
468 {
469 return isConstant() && constantValue() == cmp;
470 }
471
472 constexpr void setKill(bool flag) noexcept
473 {
474 isKill_ = flag;
475 if (!flag)
476 setFirstKill(false);
477 }
478
479 constexpr bool isKill() const noexcept
480 {
481 return isKill_ || isFirstKill();
482 }
483
484 constexpr void setFirstKill(bool flag) noexcept
485 {
486 isFirstKill_ = flag;
487 if (flag)
488 setKill(flag);
489 }
490
491 /* When there are multiple operands killing the same temporary,
492 * isFirstKill() only returns true for the first one. */
493 constexpr bool isFirstKill() const noexcept
494 {
495 return isFirstKill_;
496 }
497
498 private:
499 union {
500 uint32_t i;
501 float f;
502 Temp temp = Temp(0, s1);
503 } data_;
504 PhysReg reg_;
505 union {
506 struct {
507 uint8_t isTemp_:1;
508 uint8_t isFixed_:1;
509 uint8_t isConstant_:1;
510 uint8_t isKill_:1;
511 uint8_t isUndef_:1;
512 uint8_t isFirstKill_:1;
513 uint8_t is64BitConst_:1;
514 };
515 /* can't initialize bit-fields in c++11, so work around using a union */
516 uint8_t control_ = 0;
517 };
518 };
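/* Usage sketch (the function name is hypothetical): small integers and common
 * float values are turned into fixed inline constants, everything else becomes
 * a literal (register index 255). */
inline void example_operand_constants()
{
   Operand small{4u};        /* inline constant: fixed to PhysReg{128 + 4} */
   Operand one{0x3f800000u}; /* 1.0f: fixed to PhysReg{242} */
   Operand big{0x12345678u}; /* no inline encoding: becomes a literal */
   assert(small.physReg() == PhysReg{132});
   assert(one.physReg() == PhysReg{242});
   assert(big.isLiteral());
}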
519
520 /**
521 * Definition Class
522 * Definitions are the results of Instructions
523 * and refer to temporary virtual registers
524 * which are later mapped to physical registers
525 */
526 class Definition final
527 {
528 public:
529 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
530 Definition(uint32_t index, RegClass type) noexcept
531 : temp(index, type) {}
532 explicit Definition(Temp tmp) noexcept
533 : temp(tmp) {}
534 Definition(PhysReg reg, RegClass type) noexcept
535 : temp(Temp(0, type))
536 {
537 setFixed(reg);
538 }
539 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
540 : temp(Temp(tmpId, type))
541 {
542 setFixed(reg);
543 }
544
545 constexpr bool isTemp() const noexcept
546 {
547 return tempId() > 0;
548 }
549
550 constexpr Temp getTemp() const noexcept
551 {
552 return temp;
553 }
554
555 constexpr uint32_t tempId() const noexcept
556 {
557 return temp.id();
558 }
559
560 constexpr void setTemp(Temp t) noexcept {
561 temp = t;
562 }
563
564 constexpr RegClass regClass() const noexcept
565 {
566 return temp.regClass();
567 }
568
569 constexpr unsigned size() const noexcept
570 {
571 return temp.size();
572 }
573
574 constexpr bool isFixed() const noexcept
575 {
576 return isFixed_;
577 }
578
579 constexpr PhysReg physReg() const noexcept
580 {
581 return reg_;
582 }
583
584 constexpr void setFixed(PhysReg reg) noexcept
585 {
586 isFixed_ = 1;
587 reg_ = reg;
588 }
589
590 constexpr void setHint(PhysReg reg) noexcept
591 {
592 hasHint_ = 1;
593 reg_ = reg;
594 }
595
596 constexpr bool hasHint() const noexcept
597 {
598 return hasHint_;
599 }
600
601 constexpr void setKill(bool flag) noexcept
602 {
603 isKill_ = flag;
604 }
605
606 constexpr bool isKill() const noexcept
607 {
608 return isKill_;
609 }
610
611 private:
612 Temp temp = Temp(0, s1);
613 PhysReg reg_;
614 union {
615 struct {
616 uint8_t isFixed_:1;
617 uint8_t hasHint_:1;
618 uint8_t isKill_:1;
619 };
620 /* can't initialize bit-fields in c++11, so work around using a union */
621 uint8_t control_ = 0;
622 };
623 };
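/* Sketch (hypothetical helper name): most definitions wrap a temporary, but
 * clobbers of fixed registers such as scc or vcc use the PhysReg constructor,
 * which leaves the temp id at 0. */
inline Definition example_scc_clobber()
{
   return Definition(scc, s1);
}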
624
625 class Block;
626
627 struct Instruction {
628 aco_opcode opcode;
629 Format format;
630 uint32_t pass_flags;
631
632 aco::span<Operand> operands;
633 aco::span<Definition> definitions;
634
635 constexpr bool isVALU() const noexcept
636 {
637 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
638 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
639 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
640 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
641 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
642 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
643 }
644
645 constexpr bool isSALU() const noexcept
646 {
647 return format == Format::SOP1 ||
648 format == Format::SOP2 ||
649 format == Format::SOPC ||
650 format == Format::SOPK ||
651 format == Format::SOPP;
652 }
653
654 constexpr bool isVMEM() const noexcept
655 {
656 return format == Format::MTBUF ||
657 format == Format::MUBUF ||
658 format == Format::MIMG;
659 }
660
661 constexpr bool isDPP() const noexcept
662 {
663 return (uint16_t) format & (uint16_t) Format::DPP;
664 }
665
666 constexpr bool isVOP3() const noexcept
667 {
668 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
669 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
670 format == Format::VOP3P;
671 }
672
673 constexpr bool isSDWA() const noexcept
674 {
675 return (uint16_t) format & (uint16_t) Format::SDWA;
676 }
677
678 constexpr bool isFlatOrGlobal() const noexcept
679 {
680 return format == Format::FLAT || format == Format::GLOBAL;
681 }
682
683 constexpr bool usesModifiers() const noexcept;
684
685 constexpr bool reads_exec() const noexcept
686 {
687 for (const Operand& op : operands) {
688 if (op.isFixed() && op.physReg() == exec)
689 return true;
690 }
691 return false;
692 }
693 };
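/* Example (hypothetical helper name): the format predicates test encoding bits,
 * so a VOP2 instruction promoted to the VOP3 encoding still reports isVALU()
 * and additionally reports isVOP3(). */
inline bool example_is_plain_promoted_valu(const Instruction& instr)
{
   return instr.isVALU() && instr.isVOP3() && !instr.isDPP() && !instr.isSDWA();
}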
694
695 struct SOPK_instruction : public Instruction {
696 uint16_t imm;
697 };
698
699 struct SOPP_instruction : public Instruction {
700 uint32_t imm;
701 int block;
702 };
703
704 struct SOPC_instruction : public Instruction {
705 };
706
707 struct SOP1_instruction : public Instruction {
708 };
709
710 struct SOP2_instruction : public Instruction {
711 };
712
713 /**
714 * Scalar Memory Format:
715 * For s_(buffer_)load_dword*:
716 * Operand(0): SBASE - SGPR-pair which provides base address
717 * Operand(1): Offset - immediate (un)signed offset or SGPR
718 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
719 * Operand(n-1): SOffset - SGPR offset (Vega only)
720 *
721 * Having no operands is also valid for instructions such as s_dcache_inv.
722 *
723 */
724 struct SMEM_instruction : public Instruction {
725 bool glc : 1; /* VI+: globally coherent */
726 bool dlc : 1; /* NAVI: device level coherent */
727 bool nv : 1; /* VEGA only: Non-volatile */
728 bool can_reorder : 1;
729 bool disable_wqm : 1;
730 barrier_interaction barrier;
731 };
732
733 struct VOP1_instruction : public Instruction {
734 };
735
736 struct VOP2_instruction : public Instruction {
737 };
738
739 struct VOPC_instruction : public Instruction {
740 };
741
742 struct VOP3A_instruction : public Instruction {
743 bool abs[3];
744 bool neg[3];
745 uint8_t opsel : 4;
746 uint8_t omod : 2;
747 bool clamp : 1;
748 };
749
750 /**
751 * Data Parallel Primitives Format:
752 * This format can be used for VOP1, VOP2 or VOPC instructions.
753 * The swizzle applies to the src0 operand.
754 *
755 */
756 struct DPP_instruction : public Instruction {
757 bool abs[2];
758 bool neg[2];
759 uint16_t dpp_ctrl;
760 uint8_t row_mask : 4;
761 uint8_t bank_mask : 4;
762 bool bound_ctrl : 1;
763 };
764
765 struct Interp_instruction : public Instruction {
766 uint8_t attribute;
767 uint8_t component;
768 };
769
770 /**
771 * Local and Global Data Sharing instructions
772 * Operand(0): ADDR - VGPR which supplies the address.
773 * Operand(1): DATA0 - First data VGPR.
774 * Operand(2): DATA1 - Second data VGPR.
775 * Operand(n-1): M0 - LDS size.
776 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
777 *
778 */
779 struct DS_instruction : public Instruction {
780 int16_t offset0;
781 int8_t offset1;
782 bool gds;
783 };
784
785 /**
786 * Vector Memory Untyped-buffer Instructions
787 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
788 * Operand(1): VADDR - Address source. Can carry an index and/or offset
789 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
790 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
791 *
792 */
793 struct MUBUF_instruction : public Instruction {
794 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
795 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
796 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
797 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
798 bool glc : 1; /* globally coherent */
799 bool dlc : 1; /* NAVI: device level coherent */
800 bool slc : 1; /* system level coherent */
801 bool tfe : 1; /* texture fail enable */
802 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
803 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
804 bool can_reorder : 1;
805 barrier_interaction barrier;
806 };
807
808 /**
809 * Vector Memory Typed-buffer Instructions
810 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
811 * Operand(1): VADDR - Address source. Can carry an index and/or offset
812 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
813 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
814 *
815 */
816 struct MTBUF_instruction : public Instruction {
817 uint16_t offset; /* Unsigned byte offset - 12 bit */
818 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
819 uint8_t nfmt : 3; /* Numeric format of data in memory */
820 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
821 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
822 bool glc : 1; /* globally coherent */
823 bool dlc : 1; /* NAVI: device level coherent */
824 bool slc : 1; /* system level coherent */
825 bool tfe : 1; /* texture fail enable */
826 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
827 bool can_reorder : 1;
828 barrier_interaction barrier;
829 };
830
831 /**
832 * Vector Memory Image Instructions
833 * Operand(0): SRSRC - Scalar GPR that specifies the resource constant.
834 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
835 * or VDATA - Vector GPR for write data.
836 * Operand(2): VADDR - Address source. Can carry an offset or an index.
837 * Definition(0): VDATA - Vector GPR for read result.
838 *
839 */
840 struct MIMG_instruction : public Instruction {
841 uint8_t dmask; /* Data VGPR enable mask */
842 uint8_t dim : 3; /* NAVI: dimensionality */
843 bool unrm : 1; /* Force address to be un-normalized */
844 bool dlc : 1; /* NAVI: device level coherent */
845 bool glc : 1; /* globally coherent */
846 bool slc : 1; /* system level coherent */
847 bool tfe : 1; /* texture fail enable */
848 bool da : 1; /* declare an array */
849 bool lwe : 1; /* LOD warning enable */
850 bool r128 : 1; /* NAVI: Texture resource size */
851 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
852 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
853 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
854 bool can_reorder : 1;
855 barrier_interaction barrier;
856 };
857
858 /**
859 * Flat/Scratch/Global Instructions
860 * Operand(0): ADDR
861 * Operand(1): SADDR
862 * Operand(2) / Definition(0): DATA/VDST
863 *
864 */
865 struct FLAT_instruction : public Instruction {
866 uint16_t offset; /* Vega/Navi only */
867 bool slc : 1; /* system level coherent */
868 bool glc : 1; /* globally coherent */
869 bool dlc : 1; /* NAVI: device level coherent */
870 bool lds : 1;
871 bool nv : 1;
872 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
873 bool can_reorder : 1;
874 barrier_interaction barrier;
875 };
876
877 struct Export_instruction : public Instruction {
878 uint8_t enabled_mask;
879 uint8_t dest;
880 bool compressed : 1;
881 bool done : 1;
882 bool valid_mask : 1;
883 };
884
885 struct Pseudo_instruction : public Instruction {
886 bool tmp_in_scc;
887 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
888 };
889
890 struct Pseudo_branch_instruction : public Instruction {
891 /* target[0] is the block index of the branch target.
892 * For conditional branches, target[1] contains the fall-through alternative.
893 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
894 */
895 uint32_t target[2];
896 };
897
898 struct Pseudo_barrier_instruction : public Instruction {
899 };
900
901 enum ReduceOp {
902 iadd32, iadd64,
903 imul32, imul64,
904 fadd32, fadd64,
905 fmul32, fmul64,
906 imin32, imin64,
907 imax32, imax64,
908 umin32, umin64,
909 umax32, umax64,
910 fmin32, fmin64,
911 fmax32, fmax64,
912 iand32, iand64,
913 ior32, ior64,
914 ixor32, ixor64,
915 gfx10_wave64_bpermute
916 };
917
918 /**
919 * Subgroup Reduction Instructions, everything except for the data to be
920 * reduced and the result as inserted by setup_reduce_temp().
921 * Operand(0): data to be reduced
922 * Operand(1): reduce temporary
923 * Operand(2): vector temporary
924 * Definition(0): result
925 * Definition(1): scalar temporary
926 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
927 * Definition(3): scc clobber
928 * Definition(4): vcc clobber
929 *
930 */
931 struct Pseudo_reduction_instruction : public Instruction {
932 ReduceOp reduce_op;
933 unsigned cluster_size; // must be 0 for scans
934 };
935
936 struct instr_deleter_functor {
937 void operator()(void* p) {
938 free(p);
939 }
940 };
941
942 template<typename T>
943 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
944
945 template<typename T>
946 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
947 {
948 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
949 char *data = (char*) calloc(1, size);
950 T* inst = (T*) data;
951
952 inst->opcode = opcode;
953 inst->format = format;
954
955 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
956 inst->operands = aco::span<Operand>(operands_offset, num_operands);
957 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
958 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
959
960 return inst;
961 }
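/* Usage sketch (function name and opcode choice are for illustration only):
 * create_instruction<> allocates the instruction together with its operand and
 * definition arrays; the caller then fills them in. The operand layout follows
 * the SMEM description above. */
inline aco_ptr<Instruction> example_build_smem_load(Temp dst_s4, Temp sbase_s2, uint32_t const_offset)
{
   aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(
      aco_opcode::s_load_dwordx4, Format::SMEM, 2, 1)};
   load->operands[0] = Operand(sbase_s2);     /* SBASE: SGPR-pair base address */
   load->operands[1] = Operand(const_offset); /* immediate offset */
   load->definitions[0] = Definition(dst_s4); /* SDATA: SGPRs receiving the result */
   return load;
}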
962
963 constexpr bool Instruction::usesModifiers() const noexcept
964 {
965 if (isDPP() || isSDWA())
966 return true;
967 if (!isVOP3())
968 return false;
969 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
970 for (unsigned i = 0; i < operands.size(); i++) {
971 if (vop3->abs[i] || vop3->neg[i])
972 return true;
973 }
974 return vop3->opsel || vop3->clamp || vop3->omod;
975 }
976
977 constexpr bool is_phi(Instruction* instr)
978 {
979 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
980 }
981
982 static inline bool is_phi(aco_ptr<Instruction>& instr)
983 {
984 return is_phi(instr.get());
985 }
986
987 barrier_interaction get_barrier_interaction(Instruction* instr);
988
989 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
990
991 enum block_kind {
992 /* uniform indicates that, when leaving this block,
993 * all active lanes stay active */
994 block_kind_uniform = 1 << 0,
995 block_kind_top_level = 1 << 1,
996 block_kind_loop_preheader = 1 << 2,
997 block_kind_loop_header = 1 << 3,
998 block_kind_loop_exit = 1 << 4,
999 block_kind_continue = 1 << 5,
1000 block_kind_break = 1 << 6,
1001 block_kind_continue_or_break = 1 << 7,
1002 block_kind_discard = 1 << 8,
1003 block_kind_branch = 1 << 9,
1004 block_kind_merge = 1 << 10,
1005 block_kind_invert = 1 << 11,
1006 block_kind_uses_discard_if = 1 << 12,
1007 block_kind_needs_lowering = 1 << 13,
1008 block_kind_uses_demote = 1 << 14,
1009 block_kind_export_end = 1 << 15,
1010 };
1011
1012
1013 struct RegisterDemand {
1014 constexpr RegisterDemand() = default;
1015 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1016 : vgpr{v}, sgpr{s} {}
1017 int16_t vgpr = 0;
1018 int16_t sgpr = 0;
1019
1020 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1021 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1022 }
1023
1024 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1025 return vgpr > other.vgpr || sgpr > other.sgpr;
1026 }
1027
1028 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1029 if (t.type() == RegType::sgpr)
1030 return RegisterDemand( vgpr, sgpr + t.size() );
1031 else
1032 return RegisterDemand( vgpr + t.size(), sgpr );
1033 }
1034
1035 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1036 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1037 }
1038
1039 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1040 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1041 }
1042
1043 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1044 vgpr += other.vgpr;
1045 sgpr += other.sgpr;
1046 return *this;
1047 }
1048
1049 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1050 vgpr -= other.vgpr;
1051 sgpr -= other.sgpr;
1052 return *this;
1053 }
1054
1055 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1056 if (t.type() == RegType::sgpr)
1057 sgpr += t.size();
1058 else
1059 vgpr += t.size();
1060 return *this;
1061 }
1062
1063 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1064 if (t.type() == RegType::sgpr)
1065 sgpr -= t.size();
1066 else
1067 vgpr -= t.size();
1068 return *this;
1069 }
1070
1071 constexpr void update(const RegisterDemand other) noexcept {
1072 vgpr = std::max(vgpr, other.vgpr);
1073 sgpr = std::max(sgpr, other.sgpr);
1074 }
1075
1076 };
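/* Examples (illustrative only): RegisterDemand is a plain (vgpr, sgpr) pair
 * with component-wise arithmetic and comparison helpers. */
static_assert(RegisterDemand(2, 3) + RegisterDemand(1, 1) == RegisterDemand(3, 4),
              "operator+ adds the vgpr and sgpr demands component-wise");
static_assert(RegisterDemand(4, 0).exceeds(RegisterDemand(3, 8)),
              "exceeds() is true if either component is larger");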
1077
1078 /* CFG */
1079 struct Block {
1080 float_mode fp_mode;
1081 unsigned index;
1082 unsigned offset = 0;
1083 std::vector<aco_ptr<Instruction>> instructions;
1084 std::vector<unsigned> logical_preds;
1085 std::vector<unsigned> linear_preds;
1086 std::vector<unsigned> logical_succs;
1087 std::vector<unsigned> linear_succs;
1088 RegisterDemand register_demand = RegisterDemand();
1089 uint16_t loop_nest_depth = 0;
1090 uint16_t kind = 0;
1091 int logical_idom = -1;
1092 int linear_idom = -1;
1093 Temp live_out_exec = Temp();
1094
1095 /* this information is needed for predecessors to blocks with phis when
1096 * moving out of ssa */
1097 bool scc_live_out = false;
1098 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out is true */
1099
1100 Block(unsigned idx) : index(idx) {}
1101 Block() : index(0) {}
1102 };
1103
1104 using Stage = uint16_t;
1105
1106 /* software stages */
1107 static constexpr Stage sw_vs = 1 << 0;
1108 static constexpr Stage sw_gs = 1 << 1;
1109 static constexpr Stage sw_tcs = 1 << 2;
1110 static constexpr Stage sw_tes = 1 << 3;
1111 static constexpr Stage sw_fs = 1 << 4;
1112 static constexpr Stage sw_cs = 1 << 5;
1113 static constexpr Stage sw_gs_copy = 1 << 6;
1114 static constexpr Stage sw_mask = 0x7f;
1115
1116 /* hardware stages (can't be OR'd with each other; they only act as a mask for conveniently testing several at once) */
1117 static constexpr Stage hw_vs = 1 << 7;
1118 static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1119 static constexpr Stage hw_gs = 1 << 9;
1120 static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1121 static constexpr Stage hw_hs = 1 << 11;
1122 static constexpr Stage hw_fs = 1 << 12;
1123 static constexpr Stage hw_cs = 1 << 13;
1124 static constexpr Stage hw_mask = 0x7f << 7;
1125
1126 /* possible settings of Program::stage */
1127 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1128 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1129 static constexpr Stage compute_cs = sw_cs | hw_cs;
1130 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1131 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1132 /* GFX10/NGG */
1133 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1134 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1135 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1136 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1137 /* GFX9 (and GFX10 if NGG isn't used) */
1138 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1139 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1140 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1141 /* pre-GFX9 */
1142 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1143 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1144 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1145 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1146 static constexpr Stage geometry_gs = sw_gs | hw_gs;
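/* Examples (illustrative only): a Program::stage value combines the software
 * stages being compiled with exactly one hardware stage. When tessellation is
 * used on GFX9+, for instance, the vertex shader runs on the hardware HS stage. */
static_assert((vertex_tess_control_hs & hw_mask) == hw_hs,
              "merged VS+TCS shaders execute on the hardware HS stage");
static_assert((vertex_tess_control_hs & sw_mask) == (sw_vs | sw_tcs),
              "the merged shader still contains the VS and TCS software stages");
static_assert((tess_eval_vs & hw_mask) == hw_vs,
              "a tessellation evaluation shader without GS runs on the hardware VS stage");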
1147
1148 class Program final {
1149 public:
1150 float_mode next_fp_mode;
1151 std::vector<Block> blocks;
1152 RegisterDemand max_reg_demand = RegisterDemand();
1153 uint16_t num_waves = 0;
1154 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1155 ac_shader_config* config;
1156 struct radv_shader_info *info;
1157 enum chip_class chip_class;
1158 enum radeon_family family;
1159 unsigned wave_size;
1160 RegClass lane_mask;
1161 Stage stage; /* Stage */
1162 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1163 bool needs_wqm = false; /* there exists a p_wqm instruction */
1164 bool wb_smem_l1_on_end = false;
1165
1166 std::vector<uint8_t> constant_data;
1167 Temp private_segment_buffer;
1168 Temp scratch_offset;
1169
1170 uint16_t min_waves = 0;
1171 uint16_t lds_alloc_granule;
1172 uint32_t lds_limit; /* in bytes */
1173 uint16_t vgpr_limit;
1174 uint16_t sgpr_limit;
1175 uint16_t physical_sgprs;
1176 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1177 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1178
1179 bool needs_vcc = false;
1180 bool needs_xnack_mask = false;
1181 bool needs_flat_scr = false;
1182
1183 uint32_t allocateId()
1184 {
1185 assert(allocationID <= 16777215);
1186 return allocationID++;
1187 }
1188
1189 uint32_t peekAllocationId()
1190 {
1191 return allocationID;
1192 }
1193
1194 void setAllocationId(uint32_t id)
1195 {
1196 allocationID = id;
1197 }
1198
1199 Block* create_and_insert_block() {
1200 blocks.emplace_back(blocks.size());
1201 blocks.back().fp_mode = next_fp_mode;
1202 return &blocks.back();
1203 }
1204
1205 Block* insert_block(Block&& block) {
1206 block.index = blocks.size();
1207 block.fp_mode = next_fp_mode;
1208 blocks.emplace_back(std::move(block));
1209 return &blocks.back();
1210 }
1211
1212 private:
1213 uint32_t allocationID = 1;
1214 };
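/* Sketch (hypothetical helper name): newly created blocks inherit the FP mode
 * that is current at creation time, so the selector updates next_fp_mode before
 * starting a block. */
inline Block* example_start_block(Program* program, float_mode mode)
{
   program->next_fp_mode = mode;
   return program->create_and_insert_block();
}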
1215
1216 struct live {
1217 /* live temps out per block */
1218 std::vector<std::set<Temp>> live_out;
1219 /* register demand (sgpr/vgpr) per instruction per block */
1220 std::vector<std::vector<RegisterDemand>> register_demand;
1221 };
1222
1223 void select_program(Program *program,
1224 unsigned shader_count,
1225 struct nir_shader *const *shaders,
1226 ac_shader_config* config,
1227 struct radv_shader_args *args);
1228 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1229 ac_shader_config* config,
1230 struct radv_shader_args *args);
1231
1232 void lower_wqm(Program* program, live& live_vars,
1233 const struct radv_nir_compiler_options *options);
1234 void lower_bool_phis(Program* program);
1235 void calc_min_waves(Program* program);
1236 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1237 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1238 std::vector<uint16_t> dead_code_analysis(Program *program);
1239 void dominator_tree(Program* program);
1240 void insert_exec_mask(Program *program);
1241 void value_numbering(Program* program);
1242 void optimize(Program* program);
1243 void setup_reduce_temp(Program* program);
1244 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1245 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1246 void ssa_elimination(Program* program);
1247 void lower_to_hw_instr(Program* program);
1248 void schedule_program(Program* program, live& live_vars);
1249 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1250 void insert_wait_states(Program* program);
1251 void insert_NOPs(Program* program);
1252 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1253 void print_asm(Program *program, std::vector<uint32_t>& binary,
1254 unsigned exec_size, std::ostream& out);
1255 void validate(Program* program, FILE *output);
1256 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1257 #ifndef NDEBUG
1258 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1259 #else
1260 #define perfwarn(program, cond, msg, ...) do {} while(0)
1261 #endif
1262
1263 void aco_print_instr(Instruction *instr, FILE *output);
1264 void aco_print_program(Program *program, FILE *output);
1265
1266 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1267 uint16_t get_extra_sgprs(Program *program);
1268
1269 /* get the number of sgprs/vgprs that must be allocated to address a given number of sgprs/vgprs */
1270 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1271 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1272
1273 /* return number of addressable sgprs/vgprs for max_waves */
1274 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1275 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1276
1277 typedef struct {
1278 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1279 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1280 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1281 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1282 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1283 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1284 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1285 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1286 } Info;
1287
1288 extern const Info instr_info;
1289
1290 }
1291
1292 #endif /* ACO_IR_H */
1293