aco: move some register demand helpers into aco_live_var_analysis.cpp
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
/* Bit flags stored in the debug_flags bitmask above. */
enum {
   DEBUG_VALIDATE = 0x1,    /* run IR validation */
   DEBUG_VALIDATE_RA = 0x2, /* validate the result of register allocation */
   DEBUG_PERFWARN = 0x4,    /* emit performance warnings */
};
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats: single bits in the upper byte so they can be
    * OR-combined with each other (see the comment above this enum). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   /* VOP3/VOP3A/VOP3B intentionally share the same bit */
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
104
/* Bitmask describing which kinds of memory/synchronization hazards an
 * instruction interacts with; used to constrain instruction reordering. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* NOTE(review): there are 7 flag bits above (0x1..0x40) but this count is
    * 6 — confirm whether barrier_barrier is deliberately excluded from loops
    * that iterate barrier_count bits. */
   barrier_count = 6,
};
120
/* Rounding modes; values match the hardware MODE register encoding. */
enum fp_round {
   fp_round_ne = 0, /* nearest even */
   fp_round_pi = 1, /* toward +inf */
   fp_round_ni = 2, /* toward -inf */
   fp_round_tz = 3, /* toward zero */
};

/* Denormal handling; values match the hardware MODE register encoding. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
134
/* Per-block/program float controls: the hardware MODE register fields plus
 * flags restricting which float optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly, and this mode
    * must be at least as conservative in every restriction flag. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
171
/* Register file a temporary lives in. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar registers */
   vgpr,        /* vector registers */
   linear_vgpr, /* vector registers with linear (non-divergent) lifetime */
};
178
/* Register class: packs type and size (in dwords) into one byte.
 * Encoding: bits 0-4 = size in dwords, bit 5 = vgpr, bit 6 = linear vgpr.
 * SGPR classes (values <= s16) are always considered linear. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* note: linear vgprs also report RegType::vgpr here */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
219
220 /* transitional helper expressions */
221 static constexpr RegClass s1{RegClass::s1};
222 static constexpr RegClass s2{RegClass::s2};
223 static constexpr RegClass s3{RegClass::s3};
224 static constexpr RegClass s4{RegClass::s4};
225 static constexpr RegClass s8{RegClass::s8};
226 static constexpr RegClass s16{RegClass::s16};
227 static constexpr RegClass v1{RegClass::v1};
228 static constexpr RegClass v2{RegClass::v2};
229 static constexpr RegClass v3{RegClass::v3};
230 static constexpr RegClass v4{RegClass::v4};
231 static constexpr RegClass v5{RegClass::v5};
232 static constexpr RegClass v6{RegClass::v6};
233 static constexpr RegClass v7{RegClass::v7};
234 static constexpr RegClass v8{RegClass::v8};
235
236 /**
237 * Temp Class
238 * Each temporary virtual register has a
239 * register class (i.e. size and type)
240 * and SSA id.
241 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   /* note: all comparisons look at the SSA id only; the register class is
    * ignored (two temps with equal ids compare equal). */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   /* 24-bit id keeps Temp at 4 bytes together with the one-byte RegClass */
   uint32_t id_:24;
   RegClass reg_class;
};
262
263 /**
264 * PhysReg
265 * Represents the physical register for each
266 * Operand and Definition.
267 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg(r) {}
   /* implicit conversion so PhysReg can be compared/used as a plain index */
   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};
275
/* helper expressions for special registers (SGPR encoding indices) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};      /* same index as exec_lo */
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
287
288 /**
289 * Operand Class
290 * Initially, each Operand refers to either
291 * a temporary virtual register
292 * or to a constant value
293 * Temporary registers get mapped to physical register during RA
294 * Constant values are inlined into the instruction sequence.
295 */
class Operand final
{
public:
   /* Default: an undefined operand, fixed to PhysReg{128} (the encoding of
    * inline constant 0, also used as the "undefined" position). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}

   /* Operand referring to a temporary; an id of 0 yields an undefined operand. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant operand. Values representable as hardware inline
    * constants are fixed to the matching constant "register" (128..247);
    * everything else becomes a literal (register 255). */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v}); /* unsigned wrap-around yields 193..208 */
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant operand. Only hardware inline constants are allowed —
    * 64-bit literals do not exist; data_.i holds the equivalent 32-bit
    * inline encoding for the float values. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand pinned to a physical register (no temporary, id 0). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* true if regClass() may be queried (temp or undefined operands) */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   /* size in dwords; constants count as 1 (or 2 for 64-bit constants) */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* passing unsigned(-1) clears the fixed flag */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a constant that must be emitted as a literal dword (register 255) */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* setting first-kill also sets the kill flag */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

private:
   /* either the referenced temporary or the raw constant bits */
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
521
522 /**
523 * Definition Class
524 * Definitions are the results of Instructions
525 * and refer to temporary virtual registers
526 * which are later mapped to physical registers
527 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition pinned to a physical register (no temporary, id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 means "no temporary" (e.g. a fixed-register-only definition) */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* note: hints share reg_ with setFixed() — a later setFixed() overwrites
    * the hinted register (and vice versa) */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   /* marks a definition whose value is dead (never used) */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
626
627 class Block;
628
/* Base of all instruction types. Operand/definition storage lives in the
 * same allocation as the instruction itself (see create_instruction()). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch field for passes to tag instructions with */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* The VALU format bits can be OR-combined (e.g. VOP2|SDWA), so these
    * predicates test individual bits rather than compare for equality. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory instructions; note FLAT/GLOBAL are not included */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
696
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* the 16-bit inline constant of the SOPK encoding */
};

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   /* block index, used by branch-style SOPP instructions — NOTE(review):
    * the value for non-branch SOPPs is not visible here; confirm. */
   int block;
};

struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
714
715 /**
716 * Scalar Memory Format:
717 * For s_(buffer_)load_dword*:
718 * Operand(0): SBASE - SGPR-pair which provides base address
719 * Operand(1): Offset - immediate (un)signed offset or SGPR
720 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
721 * Operand(n-1): SOffset - SGPR offset (Vega only)
722 *
723 * Having no operands is also valid for instructions such as s_dcache_inv.
724 *
725 */
struct SMEM_instruction : public Instruction {
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1;  /* false for loads that must keep program order */
   bool disable_wqm : 1;  /* require an exec mask without helper invocations */
   barrier_interaction barrier;
};
734
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};

/* VOP3A encoding: per-source input modifiers plus output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];        /* per-source absolute value */
   bool neg[3];        /* per-source negation */
   uint8_t opsel : 4;  /* 16-bit operand selection bits */
   uint8_t omod : 2;   /* output modifier (*2, *4, /2) */
   bool clamp : 1;     /* clamp result */
};
751
752 /**
753 * Data Parallel Primitives Format:
754 * This format can be used for VOP1, VOP2 or VOPC instructions.
755 * The swizzle applies to the src0 operand.
756 *
757 */
struct DPP_instruction : public Instruction {
   bool abs[2];          /* input modifiers for the two DPP-capable sources */
   bool neg[2];
   uint16_t dpp_ctrl;    /* lane-swizzle control field */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;  /* behavior for reads from invalid lanes */
};

/* VINTRP (parameter interpolation) instruction. */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
};
771
772 /**
773 * Local and Global Data Sharing instructions
774 * Operand(0): ADDR - VGPR which supplies the address.
775 * Operand(1): DATA0 - First data VGPR.
776 * Operand(2): DATA1 - Second data VGPR.
777 * Operand(n-1): M0 - LDS size.
778 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
779 *
780 */
struct DS_instruction : public Instruction {
   /* NOTE(review): the hardware offset fields are unsigned; these are
    * declared signed — confirm negative values are never stored here. */
   int16_t offset0;
   int8_t offset1;
   bool gds; /* address GDS instead of LDS */
};
786
787 /**
788 * Vector Memory Untyped-buffer Instructions
789 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
790 * Operand(1): VADDR - Address source. Can carry an index and/or offset
791 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
792 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
793 *
794 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* false if program order must be kept w.r.t. other memory ops */
   barrier_interaction barrier;
};
809
810 /**
811 * Vector Memory Typed-buffer Instructions
812 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
813 * Operand(1): VADDR - Address source. Can carry an index and/or offset
814 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
815 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
816 *
817 */
struct MTBUF_instruction : public Instruction {
   /* NOTE(review): declared as a full 16-bit field although the hardware
    * offset is 12 bits (MUBUF uses a :12 bitfield) — confirm intent. */
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   barrier_interaction barrier;
};
832
833 /**
834 * Vector Memory Image Instructions
835 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
836 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
837 * or VDATA - Vector GPR for write data.
838 * Operand(2): VADDR - Address source. Can carry an offset or an index.
839 * Definition(0): VDATA - Vector GPR for read result.
840 *
841 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warn enable (previous comment was a copy-paste of unrm's) */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   barrier_interaction barrier;
};
859
860 /**
861 * Flat/Scratch/Global Instructions
862 * Operand(0): ADDR
863 * Operand(1): SADDR
864 * Operand(2) / Definition(0): DATA/VDST
865 *
866 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* return read-data to LDS instead of VGPRs */
   bool nv : 1;  /* non-volatile */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   barrier_interaction barrier;
};
878
struct Export_instruction : public Instruction {
   uint8_t enabled_mask;   /* which of the 4 data VGPRs are written */
   uint8_t dest;           /* export target (MRT / position / parameter) */
   bool compressed : 1;    /* data is 16-bit compressed */
   bool done : 1;          /* last export of this type */
   bool valid_mask : 1;
};

/* Generic pseudo instruction (copies, parallelcopies, etc.). */
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;        /* lowering clobbers scc */
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
891
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};

struct Pseudo_barrier_instruction : public Instruction {
};
902
/* Reduction operations; each comes in a 32-bit and a 64-bit variant. */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction: GFX10 wave64 bpermute lowering */
};
919
920 /**
921 * Subgroup Reduction Instructions, everything except for the data to be
922 * reduced and the result as inserted by setup_reduce_temp().
923 * Operand(0): data to be reduced
924 * Operand(1): reduce temporary
925 * Operand(2): vector temporary
926 * Definition(0): result
927 * Definition(1): scalar temporary
928 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
929 * Definition(3): scc clobber
930 * Definition(4): vcc clobber
931 *
932 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
937
/* Deleter for aco_ptr: instructions come from create_instruction()'s calloc,
 * so they are released with free() and no destructor runs (relies on
 * instruction types not needing destruction). */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* Owning pointer type for instructions. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
946
/* Allocates and zero-initializes an instruction of type T together with its
 * operand and definition arrays in one contiguous block:
 *    [ T | Operand[num_operands] | Definition[num_definitions] ]
 * The result must be freed with free() (see instr_deleter_functor). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* NOTE(review): calloc return value is not checked */
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span stores a 16-bit offset relative to its own address: the
    * operands start right after T, the definitions right after the operands */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
964
/* Whether the instruction uses any VALU modifiers (DPP/SDWA encoding, or
 * VOP3 abs/neg/opsel/clamp/omod). */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   /* NOTE(review): abs/neg have 3 entries — this assumes VOP3 instructions
    * never have more than 3 operands; confirm. */
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->neg[i])
         return true;
   }
   return vop3->opsel || vop3->clamp || vop3->omod;
}
978
979 constexpr bool is_phi(Instruction* instr)
980 {
981 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
982 }
983
984 static inline bool is_phi(aco_ptr<Instruction>& instr)
985 {
986 return is_phi(instr.get());
987 }
988
989 barrier_interaction get_barrier_interaction(Instruction* instr);
990
991 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
992
/* Flags describing a block's role in the control-flow graph; a Block's
 * `kind` field is a mask of these. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1013
1014
1015 struct RegisterDemand {
1016 constexpr RegisterDemand() = default;
1017 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1018 : vgpr{v}, sgpr{s} {}
1019 int16_t vgpr = 0;
1020 int16_t sgpr = 0;
1021
1022 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1023 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1024 }
1025
1026 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1027 return vgpr > other.vgpr || sgpr > other.sgpr;
1028 }
1029
1030 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1031 if (t.type() == RegType::sgpr)
1032 return RegisterDemand( vgpr, sgpr + t.size() );
1033 else
1034 return RegisterDemand( vgpr + t.size(), sgpr );
1035 }
1036
1037 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1038 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1039 }
1040
1041 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1042 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1043 }
1044
1045 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1046 vgpr += other.vgpr;
1047 sgpr += other.sgpr;
1048 return *this;
1049 }
1050
1051 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1052 vgpr -= other.vgpr;
1053 sgpr -= other.sgpr;
1054 return *this;
1055 }
1056
1057 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1058 if (t.type() == RegType::sgpr)
1059 sgpr += t.size();
1060 else
1061 vgpr += t.size();
1062 return *this;
1063 }
1064
1065 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1066 if (t.type() == RegType::sgpr)
1067 sgpr -= t.size();
1068 else
1069 vgpr -= t.size();
1070 return *this;
1071 }
1072
1073 constexpr void update(const RegisterDemand other) noexcept {
1074 vgpr = std::max(vgpr, other.vgpr);
1075 sgpr = std::max(sgpr, other.sgpr);
1076 }
1077
1078 };
1079
1080 /* CFG */
/* A basic block of the CFG. */
struct Block {
   float_mode fp_mode;   /* float controls in effect for this block */
   unsigned index;       /* position of this block in the program */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* NOTE(review): two CFGs appear to be maintained (logical and linear
    * edges) — confirm exact semantics against the CFG construction code. */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;        /* mask of block_kind flags */
   int logical_idom = -1;    /* immediate dominator block index; -1 = unset */
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1105
/* A Stage is a bitmask combining one or more software stages (the API's view
 * of the shader, as fed in as NIR) with exactly one hardware stage (the
 * shader type the code is actually compiled for). Merged stages (e.g. GFX9
 * VS+GS) set several software bits. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f; /* covers all software-stage bits above */

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9;
static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 11;
static constexpr Stage hw_fs = 1 << 12;
static constexpr Stage hw_cs = 1 << 13;
static constexpr Stage hw_mask = 0x7f << 7; /* covers all hardware-stage bits above */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1149
/* Top-level container for one compiled shader: the CFG, target/limit
 * information and the SSA id allocator. */
class Program final {
public:
   float_mode next_fp_mode;  /* fp_mode assigned to the next block created via create_and_insert_block()/insert_block() */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand(); /* maximum register demand over the whole program */
   uint16_t num_waves = 0;   /* wave count with the current register usage (assumed; set by update_vgpr_sgpr_demand) */
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;   /* GPU generation being compiled for */
   enum radeon_family family;    /* specific GPU family being compiled for */
   unsigned wave_size;           /* lanes per wave (presumably 32 or 64 — confirm in compiler setup) */
   RegClass lane_mask;           /* register class used for per-lane boolean masks */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false; /* write back the SMEM L1 cache at the end of the shader */

   std::vector<uint8_t> constant_data; /* raw constant data emitted alongside the code */
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;       /* minimum wave count required (see calc_min_waves) */
   uint16_t lds_alloc_granule;   /* LDS allocation granularity */
   uint32_t lds_limit; /* in bytes */
   uint16_t vgpr_limit;          /* maximum addressable VGPRs */
   uint16_t sgpr_limit;          /* maximum addressable SGPRs */
   uint16_t physical_sgprs;      /* SGPRs physically present on the hardware */
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */

   bool needs_vcc = false;        /* program uses VCC */
   bool needs_xnack_mask = false; /* program uses the XNACK mask registers */
   bool needs_flat_scr = false;   /* program uses FLAT_SCRATCH */

   /* Return a fresh temporary id and advance the counter. */
   uint32_t allocateId()
   {
      /* ids must fit in 24 bits (assumed: ids are packed into a 24-bit field — confirm in Temp) */
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Return the id the next allocateId() call would hand out, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Reset the id counter, e.g. after renumbering temporaries. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Append a new block (index = current block count) and return it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Append an existing block, re-assigning its index and fp_mode, and return it. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next temporary id; 0 is never handed out */
};
1217
/* Result of live-variable analysis (see live_var_analysis()), indexed by
 * block index. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1224
/* instruction selection: translate the NIR shaders into ACO IR */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
/* instruction selection for the dedicated GS copy shader */
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* lower WQM/exact pseudo-instructions */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
/* lower boolean phis to lane-mask operations */
void lower_bool_phis(Program* program);
/* compute Program::min_waves */
void calc_min_waves(Program* program);
/* recompute Program::num_waves/max_reg_demand for a new register demand */
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
/* compute live-out sets and per-instruction register demand */
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
/* returns per-temporary use information for dead-code elimination */
std::vector<uint16_t> dead_code_analysis(Program *program);
/* compute logical/linear immediate dominators for all blocks */
void dominator_tree(Program* program);
/* insert exec-mask manipulation code for control flow */
void insert_exec_mask(Program *program);
/* global value numbering (CSE) */
void value_numbering(Program* program);
/* general optimizations (combining, propagation, ...) */
void optimize(Program* program);
/* set up temporaries for cross-lane reductions */
void setup_reduce_temp(Program* program);
/* convert to CSSA form in preparation for out-of-SSA translation */
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
/* assign physical registers to all temporaries */
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
/* remove phis, inserting parallelcopies in predecessors */
void ssa_elimination(Program* program);
/* lower pseudo-instructions to real hardware instructions */
void lower_to_hw_instr(Program* program);
/* instruction scheduling to reduce stalls / register pressure */
void schedule_program(Program* program, live& live_vars);
/* spill temporaries to keep the demand within the register limits */
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
/* insert s_waitcnt instructions */
void insert_wait_states(Program* program);
/* insert NOPs to work around hardware hazards */
void insert_NOPs(Program* program);
/* assemble the program; returns the exec size (assumed — confirm in aco_assembler) */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/* disassemble the binary into 'out' */
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
1257 void validate(Program* program, FILE *output);
1258 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1259 #ifndef NDEBUG
1260 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1261 #else
1262 #define perfwarn(program, cond, msg, ...) do {} while(0)
1263 #endif
1264
/* print a single instruction / the whole program in IR form */
void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of sgprs/vgprs that must be allocated to address a given number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1283
/* Static per-opcode information tables, indexed by aco_opcode (presumably
 * generated from the opcode definitions in aco_opcodes.py — confirm there). */
typedef struct {
   /* hardware encoding of each opcode per GPU generation (assumed: negative
    * when the opcode doesn't exist on that generation — confirm in the
    * generator) */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)]; /* human-readable mnemonic */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; /* default encoding format */
} Info;

extern const Info instr_info;
1296
1297 }
1298
1299 #endif /* ACO_IR_H */
1300