aco: check if SALU instructions are preceded by exec when calculating WQM needs
[mesa.git] / src / amd / compiler / aco_ir.h
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#ifndef ACO_IR_H
#define ACO_IR_H

#include <vector>
#include <set>
#include <bitset>
#include <memory>

#include "nir.h"
#include "ac_binary.h"
#include "amd_family.h"
#include "aco_opcodes.h"
#include "aco_util.h"

struct radv_nir_compiler_options;
struct radv_shader_info;

namespace aco {

extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
};

/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};

enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   barrier_count = 4,
};

constexpr Format asVOP3(Format format) {
   return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
};
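
/* Editor's example (illustrative only): a VOP2 instruction promoted to the
 * VOP3A encoding carries both format bits, so encoding checks reduce to
 * simple bit tests:
 *
 *   Format f = asVOP3(Format::VOP2);
 *   bool is_vop2 = (uint16_t) f & (uint16_t) Format::VOP2;  // true
 *   bool is_vop3 = (uint16_t) f & (uint16_t) Format::VOP3A; // true
 */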

enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};

/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
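
/* Editor's usage sketch (illustrative only): RegClass packs the register file
 * and size into one byte, so the helpers above compare equal to explicitly
 * constructed classes:
 *
 *   RegClass(RegType::vgpr, 2) == v2;  // true
 *   v2.size() == 2;                    // two 32-bit registers
 *   v2.type() == RegType::vgpr;        // true
 *   v2.as_linear().is_linear();        // linear VGPRs are used for WWM/spills
 *   s2.is_linear();                    // SGPR classes are always linear
 */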

/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_:24;
   RegClass reg_class;
};

/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg(r) {}
   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};

/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};

/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value.
 * Temporary registers get mapped to physical registers during RA;
 * constant values are inlined into the instruction sequence.
 */
class Operand final
{
public:
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   explicit Operand(uint32_t v) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3e22f983) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else /* Literal Constant */
         setFixed(PhysReg{255});
   };
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + (uint32_t) v});
      else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - (uint32_t) v});
      else if (v == 0x3FE0000000000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xBFE0000000000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3FF0000000000000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xBFF0000000000000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x4000000000000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xC000000000000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x4010000000000000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xC010000000000000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else { /* Literal Constant: we don't know if it is a long or double. */
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
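
/* Editor's example (illustrative only): how constant Operands are fixed to the
 * hardware's inline-constant "registers" (128..254) or flagged as a literal (255):
 *
 *   Operand a(Temp(7, v1));           // temporary operand, a.isTemp() == true
 *   Operand b(uint32_t(1));           // inline constant, b.physReg() == 129
 *   Operand c(uint32_t(0x3f800000));  // 1.0f -> inline constant 242
 *   Operand d(uint32_t(0x12345678));  // no inline encoding -> d.isLiteral()
 *   Operand e(v1);                    // undefined operand of class v1
 */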

/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};

class Block;

struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
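
/* Editor's example (illustrative only): the predicates above understand
 * combined formats, so given some Instruction *instr whose format was built
 * with asVOP3():
 *
 *   instr->format = asVOP3(Format::VOP2);
 *   instr->isVALU();  // true
 *   instr->isVOP3();  // true
 *   instr->isSALU();  // false
 */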

struct SOPK_instruction : public Instruction {
   uint16_t imm;
};

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};

struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};

/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   bool glc; /* VI+: globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool nv; /* VEGA only: Non-volatile */
   bool can_reorder;
   bool disable_wqm;
   barrier_interaction barrier;
};
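
/* Editor's construction sketch (illustrative only; uses the
 * create_instruction<> helper defined further below, and assumes the opcode
 * name s_load_dwordx2 plus hypothetical s2 temporaries `base` and `dst`):
 *
 *   aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(
 *         aco_opcode::s_load_dwordx2, Format::SMEM, 2, 1)};
 *   load->operands[0] = Operand(base);          // SBASE
 *   load->operands[1] = Operand(uint32_t(16));  // immediate byte offset
 *   load->definitions[0] = Definition(dst);     // SDATA
 *   load->glc = false;
 */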

struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};

struct VOP3A_instruction : public Instruction {
   bool abs[3];
   bool opsel[4];
   bool clamp;
   unsigned omod;
   bool neg[3];
};

/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   uint16_t dpp_ctrl;
   uint8_t row_mask;
   uint8_t bank_mask;
   bool abs[2];
   bool neg[2];
   bool bound_ctrl;
};

struct Interp_instruction : public Instruction {
   unsigned attribute;
   unsigned component;
};

/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0;
   int8_t offset1;
   bool gds;
};

/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool lds; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder;
   barrier_interaction barrier;
};

/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder;
   barrier_interaction barrier;
};

/**
 * Vector Memory Image Instructions
 * Operand(0): VADDR - Address source. Can carry an offset or an index.
 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
 *
 */
struct MIMG_instruction : public Instruction {
   unsigned dmask; /* Data VGPR enable mask */
   unsigned dim; /* NAVI: dimensionality */
   bool unrm; /* Force address to be un-normalized */
   bool dlc; /* NAVI: device level coherent */
   bool glc; /* globally coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool da; /* declare an array */
   bool lwe; /* LOD warning enable */
   bool r128; /* NAVI: Texture resource size */
   bool a16; /* VEGA, NAVI: Address components are 16-bits */
   bool d16; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder;
   barrier_interaction barrier;
};

/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega only */
   bool slc; /* system level coherent */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool lds;
   bool nv;
};

struct Export_instruction : public Instruction {
   unsigned enabled_mask;
   unsigned dest;
   bool compressed;
   bool done;
   bool valid_mask;
};

struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};

struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};

struct Pseudo_barrier_instruction : public Instruction {
};

enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute
};

/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};

struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
   inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);

   return inst;
}
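
/* Editor's usage sketch (illustrative only): create_instruction() allocates the
 * instruction and its operand/definition spans in one calloc'd block, so the
 * pointer is normally wrapped in an aco_ptr which releases it with free().
 * `dst_temp` (an s1 temporary) and `block` are hypothetical:
 *
 *   aco_ptr<SOP1_instruction> mov{create_instruction<SOP1_instruction>(
 *         aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
 *   mov->operands[0] = Operand(uint32_t(42));
 *   mov->definitions[0] = Definition(dst_temp);
 *   block->instructions.emplace_back(std::move(mov));
 */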

constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
         return true;
   }
   return vop3->opsel[3] || vop3->clamp || vop3->omod;
}

constexpr bool is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

static inline bool is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}

constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
      return barrier_buffer;
   case Format::DS:
      return barrier_shared;
   default:
      return barrier_none;
   }
}

enum block_kind {
   /* uniform indicates that when leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};


struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
      : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   constexpr bool exceeds(const RegisterDemand other) const noexcept {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   constexpr RegisterDemand operator+(const Temp t) const noexcept {
      if (t.type() == RegType::sgpr)
         return RegisterDemand( vgpr, sgpr + t.size() );
      else
         return RegisterDemand( vgpr + t.size(), sgpr );
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   constexpr void update(const RegisterDemand other) noexcept {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }

};
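
/* Editor's example (illustrative only): RegisterDemand tracks SGPR and VGPR
 * pressure separately; update() takes the component-wise maximum:
 *
 *   RegisterDemand demand(4, 10);           // 4 VGPRs, 10 SGPRs
 *   demand += Temp(17, v2);                 // some VGPR pair -> 6 VGPRs, 10 SGPRs
 *   demand.update(RegisterDemand(2, 16));   // -> 6 VGPRs, 16 SGPRs
 *   demand.exceeds(RegisterDemand(8, 16));  // false
 */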

/* CFG */
struct Block {
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;
   int logical_idom = -1;
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};

using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not a separate stage on GFX9: merged into GS on GFX9 (and GFX10/legacy) */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not a separate stage on GFX9: merged into HS on GFX9 (and GFX10/legacy) */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;

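/* Editor's example (illustrative only): Program::stage combines software and
 * hardware stage bits, so both can be tested independently:
 *
 *   Stage stage = vertex_geometry_gs;    // merged VS+GS (GFX9+)
 *   bool has_gs_code = stage & sw_gs;    // true
 *   bool runs_on_hw_gs = stage & hw_gs;  // true
 *   bool runs_on_hw_vs = stage & hw_vs;  // false
 */
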
class Program final {
public:
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */

   bool needs_vcc = false;
   bool needs_xnack_mask = false;
   bool needs_flat_scr = false;

   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      return &blocks.back();
   }

   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
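
/* Editor's usage sketch (illustrative only): blocks are appended through the
 * Program so that Block::index always matches the position in Program::blocks,
 * and new SSA ids come from allocateId():
 *
 *   Block *entry = program->create_and_insert_block();
 *   entry->kind |= block_kind_top_level | block_kind_uniform;
 *   Temp tmp = Temp(program->allocateId(), s1);  // fresh s1 temporary
 */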

struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_info *info,
                    struct radv_nir_compiler_options *options);

void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
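
/* Editor's note (illustrative sketch, not normative): the authoritative pass
 * ordering lives in the compiler driver, not in this header. Roughly, the
 * passes above are run as: instruction selection, CFG/analysis passes,
 * optimizations, spilling, scheduling, register allocation, lowering, and
 * finally encoding, e.g.:
 *
 *   select_program(program, shader_count, shaders, config, info, options);
 *   dominator_tree(program);
 *   lower_bool_phis(program);
 *   insert_exec_mask(program);
 *   value_numbering(program);
 *   optimize(program);
 *   setup_reduce_temp(program);
 *   live live_vars = live_var_analysis(program, options);
 *   spill(program, live_vars, options);
 *   schedule_program(program, live_vars);
 *   register_allocation(program, live_vars.live_out);
 *   ssa_elimination(program);
 *   lower_to_hw_instr(program);
 *   insert_wait_states(program);
 *   insert_NOPs(program);
 *   std::vector<uint32_t> code;
 *   unsigned exec_size = emit_program(program, code);
 */
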
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...)
#endif

void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of allocated sgprs required to address a given number of sgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);

/* return number of addressable SGPRs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);

typedef struct {
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;

}

#endif /* ACO_IR_H */