aco: small stage corrections
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_info;
41
42 namespace aco {
43
44 extern uint64_t debug_flags;
45
46 enum {
47 DEBUG_VALIDATE = 0x1,
48 DEBUG_VALIDATE_RA = 0x2,
49 DEBUG_PERFWARN = 0x4,
50 };
51
52 /**
53 * Representation of the instruction's microcode encoding format
54 * Note: Some Vector ALU Formats can be combined, such that:
55 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
56 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
57 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
58 *
59 * (*) The same is applicable for VOP1 and VOPC instructions.
60 */
61 enum class Format : std::uint16_t {
62 /* Pseudo Instruction Format */
63 PSEUDO = 0,
64 /* Scalar ALU & Control Formats */
65 SOP1 = 1,
66 SOP2 = 2,
67 SOPK = 3,
68 SOPP = 4,
69 SOPC = 5,
70 /* Scalar Memory Format */
71 SMEM = 6,
72 /* LDS/GDS Format */
73 DS = 8,
74 /* Vector Memory Buffer Formats */
75 MTBUF = 9,
76 MUBUF = 10,
77 /* Vector Memory Image Format */
78 MIMG = 11,
79 /* Export Format */
80 EXP = 12,
81 /* Flat Formats */
82 FLAT = 13,
83 GLOBAL = 14,
84 SCRATCH = 15,
85
86 PSEUDO_BRANCH = 16,
87 PSEUDO_BARRIER = 17,
88 PSEUDO_REDUCTION = 18,
89
90 /* Vector ALU Formats */
91 VOP1 = 1 << 8,
92 VOP2 = 1 << 9,
93 VOPC = 1 << 10,
94 VOP3 = 1 << 11,
95 VOP3A = 1 << 11,
96 VOP3B = 1 << 11,
97 VOP3P = 1 << 12,
98 /* Vector Parameter Interpolation Format */
99 VINTRP = 1 << 13,
100 DPP = 1 << 14,
101 SDWA = 1 << 15,
102 };
103
104 enum barrier_interaction {
105 barrier_none = 0,
106 barrier_buffer = 0x1,
107 barrier_image = 0x2,
108 barrier_atomic = 0x4,
109 barrier_shared = 0x8,
110 barrier_count = 4,
111 };
112
113 constexpr Format asVOP3(Format format) {
114 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
115 };
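/* Usage sketch (illustrative only, based on the Format definitions above):
 *
 *   // a VOP2 instruction promoted to the VOP3 encoding keeps its VOP2 bit,
 *   // so both format tests still succeed afterwards
 *   Format fmt = asVOP3(Format::VOP2);
 *   bool still_vop2 = (uint16_t) fmt & (uint16_t) Format::VOP2;  // true
 *   bool now_vop3   = (uint16_t) fmt & (uint16_t) Format::VOP3;  // true
 */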
116
117 enum class RegType {
118 none = 0,
119 sgpr,
120 vgpr,
121 linear_vgpr,
122 };
123
124 struct RegClass {
125
126 enum RC : uint8_t {
127 s1 = 1,
128 s2 = 2,
129 s3 = 3,
130 s4 = 4,
131 s6 = 6,
132 s8 = 8,
133 s16 = 16,
134 v1 = s1 | (1 << 5),
135 v2 = s2 | (1 << 5),
136 v3 = s3 | (1 << 5),
137 v4 = s4 | (1 << 5),
138 v5 = 5 | (1 << 5),
139 v6 = 6 | (1 << 5),
140 v7 = 7 | (1 << 5),
141 v8 = 8 | (1 << 5),
142 /* these are used for WWM and spills to vgpr */
143 v1_linear = v1 | (1 << 6),
144 v2_linear = v2 | (1 << 6),
145 };
146
147 RegClass() = default;
148 constexpr RegClass(RC rc)
149 : rc(rc) {}
150 constexpr RegClass(RegType type, unsigned size)
151 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
152
153 constexpr operator RC() const { return rc; }
154 explicit operator bool() = delete;
155
156 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
157 constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
158 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
159 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
160
161 private:
162 RC rc;
163 };
164
165 /* transitional helper expressions */
166 static constexpr RegClass s1{RegClass::s1};
167 static constexpr RegClass s2{RegClass::s2};
168 static constexpr RegClass s3{RegClass::s3};
169 static constexpr RegClass s4{RegClass::s4};
170 static constexpr RegClass s8{RegClass::s8};
171 static constexpr RegClass s16{RegClass::s16};
172 static constexpr RegClass v1{RegClass::v1};
173 static constexpr RegClass v2{RegClass::v2};
174 static constexpr RegClass v3{RegClass::v3};
175 static constexpr RegClass v4{RegClass::v4};
176 static constexpr RegClass v5{RegClass::v5};
177 static constexpr RegClass v6{RegClass::v6};
178 static constexpr RegClass v7{RegClass::v7};
179 static constexpr RegClass v8{RegClass::v8};
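/* Usage sketch (illustrative only): the helpers above are shorthands for the
 * RegClass enumerants, e.g. v2 describes a 64-bit value held in two VGPRs.
 *
 *   RegClass rc = RegClass(RegType::vgpr, 2);   // equivalent to v2
 *   // rc.type() == RegType::vgpr, rc.size() == 2
 *   // v1.as_linear() == RegClass::v1_linear    (linear VGPRs for WWM/spills)
 */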
180
181 /**
182 * Temp Class
183 * Each temporary virtual register has a
184 * register class (i.e. size and type)
185 * and SSA id.
186 */
187 struct Temp {
188 Temp() = default;
189 constexpr Temp(uint32_t id, RegClass cls) noexcept
190 : id_(id), reg_class(cls) {}
191
192 constexpr uint32_t id() const noexcept { return id_; }
193 constexpr RegClass regClass() const noexcept { return reg_class; }
194
195 constexpr unsigned size() const noexcept { return reg_class.size(); }
196 constexpr RegType type() const noexcept { return reg_class.type(); }
197 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
198
199 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
200 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
201 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
202
203 private:
204 uint32_t id_:24;
205 RegClass reg_class;
206 };
207
208 /**
209 * PhysReg
210 * Represents the physical register for each
211 * Operand and Definition.
212 */
213 struct PhysReg {
214 constexpr PhysReg() = default;
215 explicit constexpr PhysReg(unsigned r) : reg(r) {}
216 constexpr operator unsigned() const { return reg; }
217
218 uint16_t reg = 0;
219 };
220
221 /* helper expressions for special registers */
222 static constexpr PhysReg m0{124};
223 static constexpr PhysReg vcc{106};
224 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
225 static constexpr PhysReg exec{126};
226 static constexpr PhysReg exec_lo{126};
227 static constexpr PhysReg exec_hi{127};
228 static constexpr PhysReg scc{253};
229
230 /**
231 * Operand Class
232 * Initially, each Operand refers to either
233 * a temporary virtual register
234 * or to a constant value
235 * Temporary registers get mapped to physical registers during RA
236 * Constant values are inlined into the instruction sequence.
237 */
238 class Operand final
239 {
240 public:
241 constexpr Operand()
242 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
243 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
244
245 explicit Operand(Temp r) noexcept
246 {
247 data_.temp = r;
248 if (r.id()) {
249 isTemp_ = true;
250 } else {
251 isUndef_ = true;
252 setFixed(PhysReg{128});
253 }
254 };
255 explicit Operand(uint32_t v) noexcept
256 {
257 data_.i = v;
258 isConstant_ = true;
259 if (v <= 64)
260 setFixed(PhysReg{128 + v});
261 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
262 setFixed(PhysReg{192 - v});
263 else if (v == 0x3f000000) /* 0.5 */
264 setFixed(PhysReg{240});
265 else if (v == 0xbf000000) /* -0.5 */
266 setFixed(PhysReg{241});
267 else if (v == 0x3f800000) /* 1.0 */
268 setFixed(PhysReg{242});
269 else if (v == 0xbf800000) /* -1.0 */
270 setFixed(PhysReg{243});
271 else if (v == 0x40000000) /* 2.0 */
272 setFixed(PhysReg{244});
273 else if (v == 0xc0000000) /* -2.0 */
274 setFixed(PhysReg{245});
275 else if (v == 0x40800000) /* 4.0 */
276 setFixed(PhysReg{246});
277 else if (v == 0xc0800000) /* -4.0 */
278 setFixed(PhysReg{247});
279 else if (v == 0x3e22f983) /* 1/(2*PI) */
280 setFixed(PhysReg{248});
281 else /* Literal Constant */
282 setFixed(PhysReg{255});
283 };
284 explicit Operand(uint64_t v) noexcept
285 {
286 isConstant_ = true;
287 is64BitConst_ = true;
288 if (v <= 64)
289 setFixed(PhysReg{128 + (uint32_t) v});
290 else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
291 setFixed(PhysReg{192 - (uint32_t) v});
292 else if (v == 0x3FE0000000000000) /* 0.5 */
293 setFixed(PhysReg{240});
294 else if (v == 0xBFE0000000000000) /* -0.5 */
295 setFixed(PhysReg{241});
296 else if (v == 0x3FF0000000000000) /* 1.0 */
297 setFixed(PhysReg{242});
298 else if (v == 0xBFF0000000000000) /* -1.0 */
299 setFixed(PhysReg{243});
300 else if (v == 0x4000000000000000) /* 2.0 */
301 setFixed(PhysReg{244});
302 else if (v == 0xC000000000000000) /* -2.0 */
303 setFixed(PhysReg{245});
304 else if (v == 0x4010000000000000) /* 4.0 */
305 setFixed(PhysReg{246});
306 else if (v == 0xC010000000000000) /* -4.0 */
307 setFixed(PhysReg{247});
308 else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
309 setFixed(PhysReg{248});
310 else { /* Literal Constant: we don't know if it is a long or double.*/
311 isConstant_ = 0;
312 assert(false && "attempt to create a 64-bit literal constant");
313 }
314 };
315 explicit Operand(RegClass type) noexcept
316 {
317 isUndef_ = true;
318 data_.temp = Temp(0, type);
319 setFixed(PhysReg{128});
320 };
321 explicit Operand(PhysReg reg, RegClass type) noexcept
322 {
323 data_.temp = Temp(0, type);
324 setFixed(reg);
325 }
326
327 constexpr bool isTemp() const noexcept
328 {
329 return isTemp_;
330 }
331
332 constexpr void setTemp(Temp t) noexcept {
333 assert(!isConstant_);
334 isTemp_ = true;
335 data_.temp = t;
336 }
337
338 constexpr Temp getTemp() const noexcept
339 {
340 return data_.temp;
341 }
342
343 constexpr uint32_t tempId() const noexcept
344 {
345 return data_.temp.id();
346 }
347
348 constexpr bool hasRegClass() const noexcept
349 {
350 return isTemp() || isUndefined();
351 }
352
353 constexpr RegClass regClass() const noexcept
354 {
355 return data_.temp.regClass();
356 }
357
358 constexpr unsigned size() const noexcept
359 {
360 if (isConstant())
361 return is64BitConst_ ? 2 : 1;
362 else
363 return data_.temp.size();
364 }
365
366 constexpr bool isFixed() const noexcept
367 {
368 return isFixed_;
369 }
370
371 constexpr PhysReg physReg() const noexcept
372 {
373 return reg_;
374 }
375
376 constexpr void setFixed(PhysReg reg) noexcept
377 {
378 isFixed_ = reg != unsigned(-1);
379 reg_ = reg;
380 }
381
382 constexpr bool isConstant() const noexcept
383 {
384 return isConstant_;
385 }
386
387 constexpr bool isLiteral() const noexcept
388 {
389 return isConstant() && reg_ == 255;
390 }
391
392 constexpr bool isUndefined() const noexcept
393 {
394 return isUndef_;
395 }
396
397 constexpr uint32_t constantValue() const noexcept
398 {
399 return data_.i;
400 }
401
402 constexpr bool constantEquals(uint32_t cmp) const noexcept
403 {
404 return isConstant() && constantValue() == cmp;
405 }
406
407 constexpr void setKill(bool flag) noexcept
408 {
409 isKill_ = flag;
410 if (!flag)
411 setFirstKill(false);
412 }
413
414 constexpr bool isKill() const noexcept
415 {
416 return isKill_ || isFirstKill();
417 }
418
419 constexpr void setFirstKill(bool flag) noexcept
420 {
421 isFirstKill_ = flag;
422 if (flag)
423 setKill(flag);
424 }
425
426 /* When there are multiple operands killing the same temporary,
427 * isFirstKill() only returns true for the first one. */
428 constexpr bool isFirstKill() const noexcept
429 {
430 return isFirstKill_;
431 }
432
433 private:
434 union {
435 uint32_t i;
436 float f;
437 Temp temp = Temp(0, s1);
438 } data_;
439 PhysReg reg_;
440 union {
441 struct {
442 uint8_t isTemp_:1;
443 uint8_t isFixed_:1;
444 uint8_t isConstant_:1;
445 uint8_t isKill_:1;
446 uint8_t isUndef_:1;
447 uint8_t isFirstKill_:1;
448 uint8_t is64BitConst_:1;
449 };
450 /* can't initialize bit-fields in c++11, so work around using a union */
451 uint8_t control_ = 0;
452 };
453 };
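/* Usage sketch (illustrative only): small integer and common float constants
 * become inline constants (a fixed register in the 128..254 range); anything
 * else becomes a literal (register 255) that costs an extra dword to encode.
 *
 *   Operand a((uint32_t) 16);          // inline constant, physReg() == 128 + 16
 *   Operand b((uint32_t) 0x3f800000);  // inline constant 1.0, physReg() == 242
 *   Operand c((uint32_t) 1234567);     // literal, c.isLiteral() == true
 */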
454
455 /**
456 * Definition Class
457 * Definitions are the results of Instructions
458 * and refer to temporary virtual registers
459 * which are later mapped to physical registers
460 */
461 class Definition final
462 {
463 public:
464 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
465 Definition(uint32_t index, RegClass type) noexcept
466 : temp(index, type) {}
467 explicit Definition(Temp tmp) noexcept
468 : temp(tmp) {}
469 Definition(PhysReg reg, RegClass type) noexcept
470 : temp(Temp(0, type))
471 {
472 setFixed(reg);
473 }
474 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
475 : temp(Temp(tmpId, type))
476 {
477 setFixed(reg);
478 }
479
480 constexpr bool isTemp() const noexcept
481 {
482 return tempId() > 0;
483 }
484
485 constexpr Temp getTemp() const noexcept
486 {
487 return temp;
488 }
489
490 constexpr uint32_t tempId() const noexcept
491 {
492 return temp.id();
493 }
494
495 constexpr void setTemp(Temp t) noexcept {
496 temp = t;
497 }
498
499 constexpr RegClass regClass() const noexcept
500 {
501 return temp.regClass();
502 }
503
504 constexpr unsigned size() const noexcept
505 {
506 return temp.size();
507 }
508
509 constexpr bool isFixed() const noexcept
510 {
511 return isFixed_;
512 }
513
514 constexpr PhysReg physReg() const noexcept
515 {
516 return reg_;
517 }
518
519 constexpr void setFixed(PhysReg reg) noexcept
520 {
521 isFixed_ = 1;
522 reg_ = reg;
523 }
524
525 constexpr void setHint(PhysReg reg) noexcept
526 {
527 hasHint_ = 1;
528 reg_ = reg;
529 }
530
531 constexpr bool hasHint() const noexcept
532 {
533 return hasHint_;
534 }
535
536 constexpr void setKill(bool flag) noexcept
537 {
538 isKill_ = flag;
539 }
540
541 constexpr bool isKill() const noexcept
542 {
543 return isKill_;
544 }
545
546 private:
547 Temp temp = Temp(0, s1);
548 PhysReg reg_;
549 union {
550 struct {
551 uint8_t isFixed_:1;
552 uint8_t hasHint_:1;
553 uint8_t isKill_:1;
554 };
555 /* can't initialize bit-fields in c++11, so work around using a union */
556 uint8_t control_ = 0;
557 };
558 };
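/* Usage sketch (illustrative only): a Definition normally just names a
 * temporary and RA picks the physical register, unless it is fixed up front
 * or given a hint. The SSA id below is made up for the example.
 *
 *   Definition def(Temp(42, s2));  // hypothetical wave-mask sized temporary
 *   def.setHint(vcc);              // RA will try, but is not forced, to use vcc
 *   // def.hasHint() == true, def.isFixed() == false
 */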
559
560 class Block;
561
562 struct Instruction {
563 aco_opcode opcode;
564 Format format;
565 uint32_t pass_flags;
566
567 aco::span<Operand> operands;
568 aco::span<Definition> definitions;
569
570 constexpr bool isVALU() const noexcept
571 {
572 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
573 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
574 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
575 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
576 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
577 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
578 }
579
580 constexpr bool isSALU() const noexcept
581 {
582 return format == Format::SOP1 ||
583 format == Format::SOP2 ||
584 format == Format::SOPC ||
585 format == Format::SOPK ||
586 format == Format::SOPP;
587 }
588
589 constexpr bool isVMEM() const noexcept
590 {
591 return format == Format::MTBUF ||
592 format == Format::MUBUF ||
593 format == Format::MIMG;
594 }
595
596 constexpr bool isDPP() const noexcept
597 {
598 return (uint16_t) format & (uint16_t) Format::DPP;
599 }
600
601 constexpr bool isVOP3() const noexcept
602 {
603 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
604 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
605 format == Format::VOP3P;
606 }
607
608 constexpr bool isSDWA() const noexcept
609 {
610 return (uint16_t) format & (uint16_t) Format::SDWA;
611 }
612
613 constexpr bool isFlatOrGlobal() const noexcept
614 {
615 return format == Format::FLAT || format == Format::GLOBAL;
616 }
617 };
618
619 struct SOPK_instruction : public Instruction {
620 uint16_t imm;
621 };
622
623 struct SOPP_instruction : public Instruction {
624 uint32_t imm;
625 int block;
626 };
627
628 struct SOPC_instruction : public Instruction {
629 };
630
631 struct SOP1_instruction : public Instruction {
632 };
633
634 struct SOP2_instruction : public Instruction {
635 };
636
637 /**
638 * Scalar Memory Format:
639 * For s_(buffer_)load_dword*:
640 * Operand(0): SBASE - SGPR-pair which provides base address
641 * Operand(1): Offset - immediate (un)signed offset or SGPR
642 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
643 * Operand(n-1): SOffset - SGPR offset (Vega only)
644 *
645 * Having no operands is also valid for instructions such as s_dcache_inv.
646 *
647 */
648 struct SMEM_instruction : public Instruction {
649 bool glc; /* VI+: globally coherent */
650 bool dlc; /* NAVI: device level coherent */
651 bool nv; /* VEGA only: Non-volatile */
652 bool can_reorder;
653 bool disable_wqm;
654 barrier_interaction barrier;
655 };
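/* Layout sketch for a simple scalar load (illustrative only, following the
 * operand description above):
 *
 *   s_load_dwordx2  %dst, %sbase, %offset
 *     Operand(0):    %sbase  - s2 temporary holding the base address pair
 *     Operand(1):    %offset - inline constant or SGPR byte offset
 *     Definition(0): %dst    - s2 result
 */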
656
657 struct VOP1_instruction : public Instruction {
658 };
659
660 struct VOP2_instruction : public Instruction {
661 };
662
663 struct VOPC_instruction : public Instruction {
664 };
665
666 struct VOP3A_instruction : public Instruction {
667 bool abs[3];
668 bool opsel[4];
669 bool clamp;
670 unsigned omod;
671 bool neg[3];
672 };
673
674 /**
675 * Data Parallel Primitives Format:
676 * This format can be used for VOP1, VOP2 or VOPC instructions.
677 * The swizzle applies to the src0 operand.
678 *
679 */
680 struct DPP_instruction : public Instruction {
681 uint16_t dpp_ctrl;
682 uint8_t row_mask;
683 uint8_t bank_mask;
684 bool abs[2];
685 bool neg[2];
686 bool bound_ctrl;
687 };
688
689 struct Interp_instruction : public Instruction {
690 unsigned attribute;
691 unsigned component;
692 };
693
694 /**
695 * Local and Global Data Sharing instructions
696 * Operand(0): ADDR - VGPR which supplies the address.
697 * Operand(1): DATA0 - First data VGPR.
698 * Operand(2): DATA1 - Second data VGPR.
699 * Operand(n-1): M0 - LDS size.
700 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
701 *
702 */
703 struct DS_instruction : public Instruction {
704 int16_t offset0;
705 int8_t offset1;
706 bool gds;
707 };
708
709 /**
710 * Vector Memory Untyped-buffer Instructions
711 * Operand(0): VADDR - Address source. Can carry an index and/or offset
712 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
713 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
714 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
715 *
716 */
717 struct MUBUF_instruction : public Instruction {
718 unsigned offset; /* Unsigned byte offset - 12 bit */
719 bool offen; /* Supply an offset from VGPR (VADDR) */
720 bool idxen; /* Supply an index from VGPR (VADDR) */
721 bool glc; /* globally coherent */
722 bool dlc; /* NAVI: device level coherent */
723 bool slc; /* system level coherent */
724 bool tfe; /* texture fail enable */
725 bool lds; /* Return read-data to LDS instead of VGPRs */
726 bool disable_wqm; /* Require an exec mask without helper invocations */
727 bool can_reorder;
728 barrier_interaction barrier;
729 };
730
731 /**
732 * Vector Memory Typed-buffer Instructions
733 * Operand(0): VADDR - Address source. Can carry an index and/or offset
734 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
735 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
736 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
737 *
738 */
739 struct MTBUF_instruction : public Instruction {
740 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
741 uint8_t nfmt : 3; /* Numeric format of data in memory */
742 unsigned offset; /* Unsigned byte offset - 12 bit */
743 bool offen; /* Supply an offset from VGPR (VADDR) */
744 bool idxen; /* Supply an index from VGPR (VADDR) */
745 bool glc; /* globally coherent */
746 bool dlc; /* NAVI: device level coherent */
747 bool slc; /* system level coherent */
748 bool tfe; /* texture fail enable */
749 bool disable_wqm; /* Require an exec mask without helper invocations */
750 bool can_reorder;
751 barrier_interaction barrier;
752 };
753
754 /**
755 * Vector Memory Image Instructions
756 * Operand(0): VADDR - Address source. Can carry an offset or an index.
757 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
758 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
759 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
760 *
761 */
762 struct MIMG_instruction : public Instruction {
763 unsigned dmask; /* Data VGPR enable mask */
764 unsigned dim; /* NAVI: dimensionality */
765 bool unrm; /* Force address to be un-normalized */
766 bool dlc; /* NAVI: device level coherent */
767 bool glc; /* globally coherent */
768 bool slc; /* system level coherent */
769 bool tfe; /* texture fail enable */
770 bool da; /* declare an array */
771 bool lwe; /* LOD warn enable */
772 bool r128; /* NAVI: Texture resource size */
773 bool a16; /* VEGA, NAVI: Address components are 16-bits */
774 bool d16; /* Convert 32-bit data to 16-bit data */
775 bool disable_wqm; /* Require an exec mask without helper invocations */
776 bool can_reorder;
777 barrier_interaction barrier;
778 };
779
780 /**
781 * Flat/Scratch/Global Instructions
782 * Operand(0): ADDR
783 * Operand(1): SADDR
784 * Operand(2) / Definition(0): DATA/VDST
785 *
786 */
787 struct FLAT_instruction : public Instruction {
788 uint16_t offset; /* Vega only */
789 bool slc; /* system level coherent */
790 bool glc; /* globally coherent */
791 bool dlc; /* NAVI: device level coherent */
792 bool lds;
793 bool nv;
794 };
795
796 struct Export_instruction : public Instruction {
797 unsigned enabled_mask;
798 unsigned dest;
799 bool compressed;
800 bool done;
801 bool valid_mask;
802 };
803
804 struct Pseudo_instruction : public Instruction {
805 bool tmp_in_scc;
806 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
807 };
808
809 struct Pseudo_branch_instruction : public Instruction {
810 /* target[0] is the block index of the branch target.
811 * For conditional branches, target[1] contains the fall-through alternative.
812 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
813 */
814 uint32_t target[2];
815 };
816
817 struct Pseudo_barrier_instruction : public Instruction {
818 };
819
820 enum ReduceOp {
821 iadd32, iadd64,
822 imul32, imul64,
823 fadd32, fadd64,
824 fmul32, fmul64,
825 imin32, imin64,
826 imax32, imax64,
827 umin32, umin64,
828 umax32, umax64,
829 fmin32, fmin64,
830 fmax32, fmax64,
831 iand32, iand64,
832 ior32, ior64,
833 ixor32, ixor64,
834 };
835
836 /**
837 * Subgroup Reduction Instructions: everything except the data to be
838 * reduced and the result is inserted by setup_reduce_temp().
839 * Operand(0): data to be reduced
840 * Operand(1): reduce temporary
841 * Operand(2): vector temporary
842 * Definition(0): result
843 * Definition(1): scalar temporary
844 * Definition(2): scalar identity temporary
845 * Definition(3): scc clobber
846 * Definition(4): vcc clobber
847 *
848 */
849 struct Pseudo_reduction_instruction : public Instruction {
850 ReduceOp reduce_op;
851 unsigned cluster_size; // must be 0 for scans
852 };
853
854 struct instr_deleter_functor {
855 void operator()(void* p) {
856 free(p);
857 }
858 };
859
860 template<typename T>
861 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
862
863 template<typename T>
864 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
865 {
866 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
867 char *data = (char*) calloc(1, size);
868 T* inst = (T*) data;
869
870 inst->opcode = opcode;
871 inst->format = format;
872
873 inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
874 inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);
875
876 return inst;
877 }
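/* Usage sketch (illustrative only): operands and definitions live in the same
 * allocation as the instruction, so they stay valid exactly as long as the
 * instruction does. The SSA ids below are made up for the example.
 *
 *   aco_ptr<VOP2_instruction> add{create_instruction<VOP2_instruction>(
 *         aco_opcode::v_add_f32, Format::VOP2, 2, 1)};
 *   add->operands[0] = Operand((uint32_t) 0x3f800000);  // 1.0
 *   add->operands[1] = Operand(Temp(7, v1));
 *   add->definitions[0] = Definition(Temp(8, v1));
 */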
878
879 constexpr bool is_phi(Instruction* instr)
880 {
881 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
882 }
883
884 static inline bool is_phi(aco_ptr<Instruction>& instr)
885 {
886 return is_phi(instr.get());
887 }
888
889 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
890 {
891 switch (instr->format) {
892 case Format::SMEM:
893 return static_cast<SMEM_instruction*>(instr)->barrier;
894 case Format::MUBUF:
895 return static_cast<MUBUF_instruction*>(instr)->barrier;
896 case Format::MIMG:
897 return static_cast<MIMG_instruction*>(instr)->barrier;
898 case Format::FLAT:
899 case Format::GLOBAL:
900 return barrier_buffer;
901 case Format::DS:
902 return barrier_shared;
903 default:
904 return barrier_none;
905 }
906 }
907
908 enum block_kind {
909 /* uniform indicates that when leaving this block,
910 * all active lanes stay active */
911 block_kind_uniform = 1 << 0,
912 block_kind_top_level = 1 << 1,
913 block_kind_loop_preheader = 1 << 2,
914 block_kind_loop_header = 1 << 3,
915 block_kind_loop_exit = 1 << 4,
916 block_kind_continue = 1 << 5,
917 block_kind_break = 1 << 6,
918 block_kind_continue_or_break = 1 << 7,
919 block_kind_discard = 1 << 8,
920 block_kind_branch = 1 << 9,
921 block_kind_merge = 1 << 10,
922 block_kind_invert = 1 << 11,
923 block_kind_uses_discard_if = 1 << 12,
924 block_kind_needs_lowering = 1 << 13,
925 block_kind_uses_demote = 1 << 14,
926 };
927
928
929 struct RegisterDemand {
930 constexpr RegisterDemand() = default;
931 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
932 : vgpr{v}, sgpr{s} {}
933 int16_t vgpr = 0;
934 int16_t sgpr = 0;
935
936 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
937 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
938 }
939
940 constexpr bool exceeds(const RegisterDemand other) const noexcept {
941 return vgpr > other.vgpr || sgpr > other.sgpr;
942 }
943
944 constexpr RegisterDemand operator+(const Temp t) const noexcept {
945 if (t.type() == RegType::sgpr)
946 return RegisterDemand( vgpr, sgpr + t.size() );
947 else
948 return RegisterDemand( vgpr + t.size(), sgpr );
949 }
950
951 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
952 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
953 }
954
955 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
956 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
957 }
958
959 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
960 vgpr += other.vgpr;
961 sgpr += other.sgpr;
962 return *this;
963 }
964
965 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
966 vgpr -= other.vgpr;
967 sgpr -= other.sgpr;
968 return *this;
969 }
970
971 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
972 if (t.type() == RegType::sgpr)
973 sgpr += t.size();
974 else
975 vgpr += t.size();
976 return *this;
977 }
978
979 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
980 if (t.type() == RegType::sgpr)
981 sgpr -= t.size();
982 else
983 vgpr -= t.size();
984 return *this;
985 }
986
987 constexpr void update(const RegisterDemand other) noexcept {
988 vgpr = std::max(vgpr, other.vgpr);
989 sgpr = std::max(sgpr, other.sgpr);
990 }
991
992 };
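/* Usage sketch (illustrative only): RegisterDemand tracks SGPR and VGPR
 * pressure separately and is updated by adding/removing temporaries.
 *
 *   RegisterDemand demand(4, 2);   // 4 VGPRs and 2 SGPRs live
 *   demand += Temp(9, v2);         // hypothetical 64-bit VGPR temp -> 6 VGPRs
 *   RegisterDemand limit(24, 102);
 *   bool spilling_needed = demand.exceeds(limit);  // false here
 */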
993
994 /* CFG */
995 struct Block {
996 unsigned index;
997 unsigned offset = 0;
998 std::vector<aco_ptr<Instruction>> instructions;
999 std::vector<unsigned> logical_preds;
1000 std::vector<unsigned> linear_preds;
1001 std::vector<unsigned> logical_succs;
1002 std::vector<unsigned> linear_succs;
1003 RegisterDemand register_demand = RegisterDemand();
1004 uint16_t loop_nest_depth = 0;
1005 uint16_t kind = 0;
1006 int logical_idom = -1;
1007 int linear_idom = -1;
1008 Temp live_out_exec = Temp();
1009
1010 /* this information is needed for predecessors to blocks with phis when
1011 * moving out of ssa */
1012 bool scc_live_out = false;
1013 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1014
1015 Block(unsigned idx) : index(idx) {}
1016 Block() : index(0) {}
1017 };
1018
1019 using Stage = uint16_t;
1020
1021 /* software stages */
1022 static constexpr Stage sw_vs = 1 << 0;
1023 static constexpr Stage sw_gs = 1 << 1;
1024 static constexpr Stage sw_tcs = 1 << 2;
1025 static constexpr Stage sw_tes = 1 << 3;
1026 static constexpr Stage sw_fs = 1 << 4;
1027 static constexpr Stage sw_cs = 1 << 5;
1028 static constexpr Stage sw_mask = 0x3f;
1029
1030 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1031 static constexpr Stage hw_vs = 1 << 6;
1032 static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1033 static constexpr Stage hw_gs = 1 << 8;
1034 static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1035 static constexpr Stage hw_hs = 1 << 10;
1036 static constexpr Stage hw_fs = 1 << 11;
1037 static constexpr Stage hw_cs = 1 << 12;
1038 static constexpr Stage hw_mask = 0x7f << 6;
1039
1040 /* possible settings of Program::stage */
1041 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1042 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1043 static constexpr Stage compute_cs = sw_cs | hw_cs;
1044 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1045 /* GFX10/NGG */
1046 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1047 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1048 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1049 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1050 /* GFX9 (and GFX10 if NGG isn't used) */
1051 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1052 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1053 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1054 /* pre-GFX9 */
1055 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1056 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1057 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1058 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1059 static constexpr Stage geometry_gs = sw_gs | hw_gs;
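/* Usage sketch (illustrative only): Program::stage combines the hardware stage
 * the code runs on with the software stage(s) merged into it, so both kinds of
 * bits can be tested independently.
 *
 *   Stage stage = vertex_geometry_gs;   // merged VS+GS, e.g. on GFX9
 *   bool on_hw_gs   = stage & hw_gs;    // true
 *   bool has_sw_vs  = stage & sw_vs;    // true
 *   bool has_sw_tes = stage & sw_tes;   // false
 */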
1060
1061 class Program final {
1062 public:
1063 std::vector<Block> blocks;
1064 RegisterDemand max_reg_demand = RegisterDemand();
1065 uint16_t sgpr_limit = 0;
1066 uint16_t num_waves = 0;
1067 ac_shader_config* config;
1068 struct radv_shader_info *info;
1069 enum chip_class chip_class;
1070 enum radeon_family family;
1071 unsigned wave_size;
1072 Stage stage; /* Stage */
1073 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1074 bool needs_wqm = false; /* there exists a p_wqm instruction */
1075 bool wb_smem_l1_on_end = false;
1076
1077 std::vector<uint8_t> constant_data;
1078
1079 uint32_t allocateId()
1080 {
1081 assert(allocationID <= 16777215);
1082 return allocationID++;
1083 }
1084
1085 uint32_t peekAllocationId()
1086 {
1087 return allocationID;
1088 }
1089
1090 void setAllocationId(uint32_t id)
1091 {
1092 allocationID = id;
1093 }
1094
1095 Block* create_and_insert_block() {
1096 blocks.emplace_back(blocks.size());
1097 return &blocks.back();
1098 }
1099
1100 Block* insert_block(Block&& block) {
1101 block.index = blocks.size();
1102 blocks.emplace_back(std::move(block));
1103 return &blocks.back();
1104 }
1105
1106 private:
1107 uint32_t allocationID = 1;
1108 };
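/* Usage sketch (illustrative only): a Program is built as a flat list of
 * blocks, and every temporary gets its SSA id from allocateId(). The opcode
 * below is just an arbitrary example.
 *
 *   Program program;
 *   Block* entry = program.create_and_insert_block();
 *   Temp tmp = Temp(program.allocateId(), v1);
 *   entry->instructions.emplace_back(
 *         create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
 *   entry->instructions.back()->operands[0] = Operand((uint32_t) 0);
 *   entry->instructions.back()->definitions[0] = Definition(tmp);
 */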
1109
1110 struct live {
1111 /* live temps out per block */
1112 std::vector<std::set<Temp>> live_out;
1113 /* register demand (sgpr/vgpr) per instruction per block */
1114 std::vector<std::vector<RegisterDemand>> register_demand;
1115 };
1116
1117 void select_program(Program *program,
1118 unsigned shader_count,
1119 struct nir_shader *const *shaders,
1120 ac_shader_config* config,
1121 struct radv_shader_info *info,
1122 struct radv_nir_compiler_options *options);
1123
1124 void lower_wqm(Program* program, live& live_vars,
1125 const struct radv_nir_compiler_options *options);
1126 void lower_bool_phis(Program* program);
1127 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1128 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1129 std::vector<uint16_t> dead_code_analysis(Program *program);
1130 void dominator_tree(Program* program);
1131 void insert_exec_mask(Program *program);
1132 void value_numbering(Program* program);
1133 void optimize(Program* program);
1134 void setup_reduce_temp(Program* program);
1135 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1136 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1137 void ssa_elimination(Program* program);
1138 void lower_to_hw_instr(Program* program);
1139 void schedule_program(Program* program, live& live_vars);
1140 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1141 void insert_wait_states(Program* program);
1142 void insert_NOPs(Program* program);
1143 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1144 void print_asm(Program *program, std::vector<uint32_t>& binary,
1145 unsigned exec_size, std::ostream& out);
1146 void validate(Program* program, FILE *output);
1147 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1148 #ifndef NDEBUG
1149 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1150 #else
1151 #define perfwarn(program, cond, msg, ...)
1152 #endif
1153
1154 void aco_print_instr(Instruction *instr, FILE *output);
1155 void aco_print_program(Program *program, FILE *output);
1156
1157 typedef struct {
1158 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1159 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1160 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1161 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1162 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1163 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1164 } Info;
1165
1166 extern const Info instr_info;
1167
1168 }
1169
1170 #endif /* ACO_IR_H */
1171