aco: add a late kill flag
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
47 enum {
48 DEBUG_VALIDATE = 0x1,
49 DEBUG_VALIDATE_RA = 0x2,
50 DEBUG_PERFWARN = 0x4,
51 };
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
62 enum class Format : std::uint16_t {
63 /* Pseudo Instruction Format */
64 PSEUDO = 0,
65 /* Scalar ALU & Control Formats */
66 SOP1 = 1,
67 SOP2 = 2,
68 SOPK = 3,
69 SOPP = 4,
70 SOPC = 5,
71 /* Scalar Memory Format */
72 SMEM = 6,
73 /* LDS/GDS Format */
74 DS = 8,
75 /* Vector Memory Buffer Formats */
76 MTBUF = 9,
77 MUBUF = 10,
78 /* Vector Memory Image Format */
79 MIMG = 11,
80 /* Export Format */
81 EXP = 12,
82 /* Flat Formats */
83 FLAT = 13,
84 GLOBAL = 14,
85 SCRATCH = 15,
86
87 PSEUDO_BRANCH = 16,
88 PSEUDO_BARRIER = 17,
89 PSEUDO_REDUCTION = 18,
90
91 /* Vector ALU Formats */
92 VOP1 = 1 << 8,
93 VOP2 = 1 << 9,
94 VOPC = 1 << 10,
95 VOP3 = 1 << 11,
96 VOP3A = 1 << 11,
97 VOP3B = 1 << 11,
98 VOP3P = 1 << 12,
99 /* Vector Parameter Interpolation Format */
100 VINTRP = 1 << 13,
101 DPP = 1 << 14,
102 SDWA = 1 << 15,
103 };
104
105 enum barrier_interaction : uint8_t {
106 barrier_none = 0,
107 barrier_buffer = 0x1,
108 barrier_image = 0x2,
109 barrier_atomic = 0x4,
110 barrier_shared = 0x8,
111 /* used for geometry shaders to ensure vertex data writes are before the
112 * GS_DONE s_sendmsg. */
113 barrier_gs_data = 0x10,
114 /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
115 barrier_gs_sendmsg = 0x20,
116 /* used by barriers. created by s_barrier */
117 barrier_barrier = 0x40,
118   barrier_count = 7, /* number of barrier_* flag bits above */
119 };
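/* Illustrative sketch (not part of the original header): these are bit flags,
 * so a single instruction can interact with several classes at once, e.g. an
 * atomic operating on a buffer:
 *
 *    barrier_interaction b =
 *       (barrier_interaction)(barrier_buffer | barrier_atomic);
 */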
120
121 enum fp_round {
122 fp_round_ne = 0,
123 fp_round_pi = 1,
124 fp_round_ni = 2,
125 fp_round_tz = 3,
126 };
127
128 enum fp_denorm {
129 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
130 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
131 fp_denorm_flush = 0x0,
132 fp_denorm_keep = 0x3,
133 };
134
135 struct float_mode {
136 /* matches encoding of the MODE register */
137 union {
138 struct {
139 fp_round round32:2;
140 fp_round round16_64:2;
141 unsigned denorm32:2;
142 unsigned denorm16_64:2;
143 };
144 uint8_t val = 0;
145 };
146 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
147 bool preserve_signed_zero_inf_nan32:1;
148 bool preserve_signed_zero_inf_nan16_64:1;
149 /* if false, optimizations which may remove denormal flushing can be done */
150 bool must_flush_denorms32:1;
151 bool must_flush_denorms16_64:1;
152 bool care_about_round32:1;
153 bool care_about_round16_64:1;
154
155 /* Returns true if instructions using the mode "other" can safely use the
156 * current one instead. */
157 bool canReplace(float_mode other) const noexcept {
158 return val == other.val &&
159 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
160 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
161 (must_flush_denorms32 || !other.must_flush_denorms32) &&
162 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
163 (care_about_round32 || !other.care_about_round32) &&
164 (care_about_round16_64 || !other.care_about_round16_64);
165 }
166 };
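/* Illustrative sketch of canReplace() (hypothetical values, names `strict` and
 * `relaxed` are only for illustration): a mode can replace another one if the
 * MODE register bits match and it is at least as strict in every respect.
 *
 *    float_mode strict, relaxed;
 *    strict.val = relaxed.val = 0;
 *    strict.preserve_signed_zero_inf_nan32  = true;
 *    relaxed.preserve_signed_zero_inf_nan32 = false;
 *    // ... all remaining flags equal ...
 *    // strict.canReplace(relaxed) == true
 *    // relaxed.canReplace(strict) == false
 */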
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
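/* Illustrative sketch: asVOP3() keeps the base encoding bit and adds the VOP3
 * bit, so a VOP2 instruction promoted to its VOP3A encoding still reports both.
 *
 *    Format f = asVOP3(Format::VOP2);
 *    // (uint16_t)f == ((uint16_t)Format::VOP2 | (uint16_t)Format::VOP3A)
 *    // an Instruction with format f satisfies both isVALU() and isVOP3()
 */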
171
172 enum class RegType {
173 none = 0,
174 sgpr,
175 vgpr,
176 linear_vgpr,
177 };
178
179 struct RegClass {
180
181 enum RC : uint8_t {
182 s1 = 1,
183 s2 = 2,
184 s3 = 3,
185 s4 = 4,
186 s6 = 6,
187 s8 = 8,
188 s16 = 16,
189 v1 = s1 | (1 << 5),
190 v2 = s2 | (1 << 5),
191 v3 = s3 | (1 << 5),
192 v4 = s4 | (1 << 5),
193 v5 = 5 | (1 << 5),
194 v6 = 6 | (1 << 5),
195 v7 = 7 | (1 << 5),
196 v8 = 8 | (1 << 5),
197 /* these are used for WWM and spills to vgpr */
198 v1_linear = v1 | (1 << 6),
199 v2_linear = v2 | (1 << 6),
200 };
201
202 RegClass() = default;
203 constexpr RegClass(RC rc)
204 : rc(rc) {}
205 constexpr RegClass(RegType type, unsigned size)
206 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
207
208 constexpr operator RC() const { return rc; }
209 explicit operator bool() = delete;
210
211 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
212 constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
213 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
214 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
215
216 private:
217 RC rc;
218 };
219
220 /* transitional helper expressions */
221 static constexpr RegClass s1{RegClass::s1};
222 static constexpr RegClass s2{RegClass::s2};
223 static constexpr RegClass s3{RegClass::s3};
224 static constexpr RegClass s4{RegClass::s4};
225 static constexpr RegClass s8{RegClass::s8};
226 static constexpr RegClass s16{RegClass::s16};
227 static constexpr RegClass v1{RegClass::v1};
228 static constexpr RegClass v2{RegClass::v2};
229 static constexpr RegClass v3{RegClass::v3};
230 static constexpr RegClass v4{RegClass::v4};
231 static constexpr RegClass v5{RegClass::v5};
232 static constexpr RegClass v6{RegClass::v6};
233 static constexpr RegClass v7{RegClass::v7};
234 static constexpr RegClass v8{RegClass::v8};
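/* Illustrative sketch of the RC encoding above: the low five bits hold the
 * size in dwords, bit 5 marks vgpr, bit 6 marks linear.
 *
 *    RegClass(RegType::vgpr, 2) == v2        // true
 *    v2.size() == 2                          // true
 *    v2.type() == RegType::vgpr              // true
 *    v1.as_linear() == RegClass::v1_linear   // true; used for WWM/spill temps
 */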
235
236 /**
237 * Temp Class
238 * Each temporary virtual register has a
239 * register class (i.e. size and type)
240  * and an SSA id.
241 */
242 struct Temp {
243 Temp() = default;
244 constexpr Temp(uint32_t id, RegClass cls) noexcept
245 : id_(id), reg_class(cls) {}
246
247 constexpr uint32_t id() const noexcept { return id_; }
248 constexpr RegClass regClass() const noexcept { return reg_class; }
249
250 constexpr unsigned size() const noexcept { return reg_class.size(); }
251 constexpr RegType type() const noexcept { return reg_class.type(); }
252 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
253
254 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
255 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
256 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
257
258 private:
259 uint32_t id_:24;
260 RegClass reg_class;
261 };
262
263 /**
264 * PhysReg
265 * Represents the physical register for each
266 * Operand and Definition.
267 */
268 struct PhysReg {
269 constexpr PhysReg() = default;
270 explicit constexpr PhysReg(unsigned r) : reg(r) {}
271 constexpr operator unsigned() const { return reg; }
272
273 uint16_t reg = 0;
274 };
275
276 /* helper expressions for special registers */
277 static constexpr PhysReg m0{124};
278 static constexpr PhysReg vcc{106};
279 static constexpr PhysReg vcc_hi{107};
280 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
281 static constexpr PhysReg exec{126};
282 static constexpr PhysReg exec_lo{126};
283 static constexpr PhysReg exec_hi{127};
284 static constexpr PhysReg vccz{251};
285 static constexpr PhysReg execz{252};
286 static constexpr PhysReg scc{253};
287
288 /**
289 * Operand Class
290  * Initially, each Operand refers either
291  * to a temporary virtual register
292  * or to a constant value.
293  * Temporary registers get mapped to physical registers during RA.
294  * Constant values are inlined into the instruction sequence.
295 */
296 class Operand final
297 {
298 public:
299 constexpr Operand()
300 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
301 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
302 isLateKill_(false) {}
303
304 explicit Operand(Temp r) noexcept
305 {
306 data_.temp = r;
307 if (r.id()) {
308 isTemp_ = true;
309 } else {
310 isUndef_ = true;
311 setFixed(PhysReg{128});
312 }
313 };
314 explicit Operand(uint32_t v, bool is64bit = false) noexcept
315 {
316 data_.i = v;
317 isConstant_ = true;
318 is64BitConst_ = is64bit;
319 if (v <= 64)
320 setFixed(PhysReg{128 + v});
321 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
322 setFixed(PhysReg{192 - v});
323 else if (v == 0x3f000000) /* 0.5 */
324 setFixed(PhysReg{240});
325 else if (v == 0xbf000000) /* -0.5 */
326 setFixed(PhysReg{241});
327 else if (v == 0x3f800000) /* 1.0 */
328 setFixed(PhysReg{242});
329 else if (v == 0xbf800000) /* -1.0 */
330 setFixed(PhysReg{243});
331 else if (v == 0x40000000) /* 2.0 */
332 setFixed(PhysReg{244});
333 else if (v == 0xc0000000) /* -2.0 */
334 setFixed(PhysReg{245});
335 else if (v == 0x40800000) /* 4.0 */
336 setFixed(PhysReg{246});
337 else if (v == 0xc0800000) /* -4.0 */
338 setFixed(PhysReg{247});
339 else { /* Literal Constant */
340 assert(!is64bit && "attempt to create a 64-bit literal constant");
341 setFixed(PhysReg{255});
342 }
343 };
344 explicit Operand(uint64_t v) noexcept
345 {
346 isConstant_ = true;
347 is64BitConst_ = true;
348 if (v <= 64) {
349 data_.i = (uint32_t) v;
350 setFixed(PhysReg{128 + (uint32_t) v});
351 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
352 data_.i = (uint32_t) v;
353 setFixed(PhysReg{192 - (uint32_t) v});
354 } else if (v == 0x3FE0000000000000) { /* 0.5 */
355 data_.i = 0x3f000000;
356 setFixed(PhysReg{240});
357 } else if (v == 0xBFE0000000000000) { /* -0.5 */
358 data_.i = 0xbf000000;
359 setFixed(PhysReg{241});
360 } else if (v == 0x3FF0000000000000) { /* 1.0 */
361 data_.i = 0x3f800000;
362 setFixed(PhysReg{242});
363 } else if (v == 0xBFF0000000000000) { /* -1.0 */
364 data_.i = 0xbf800000;
365 setFixed(PhysReg{243});
366 } else if (v == 0x4000000000000000) { /* 2.0 */
367 data_.i = 0x40000000;
368 setFixed(PhysReg{244});
369 } else if (v == 0xC000000000000000) { /* -2.0 */
370 data_.i = 0xc0000000;
371 setFixed(PhysReg{245});
372 } else if (v == 0x4010000000000000) { /* 4.0 */
373 data_.i = 0x40800000;
374 setFixed(PhysReg{246});
375 } else if (v == 0xC010000000000000) { /* -4.0 */
376 data_.i = 0xc0800000;
377 setFixed(PhysReg{247});
378 } else { /* Literal Constant: we don't know if it is a long or double.*/
379 isConstant_ = 0;
380 assert(false && "attempt to create a 64-bit literal constant");
381 }
382 };
383 explicit Operand(RegClass type) noexcept
384 {
385 isUndef_ = true;
386 data_.temp = Temp(0, type);
387 setFixed(PhysReg{128});
388 };
389 explicit Operand(PhysReg reg, RegClass type) noexcept
390 {
391 data_.temp = Temp(0, type);
392 setFixed(reg);
393 }
394
395 constexpr bool isTemp() const noexcept
396 {
397 return isTemp_;
398 }
399
400 constexpr void setTemp(Temp t) noexcept {
401 assert(!isConstant_);
402 isTemp_ = true;
403 data_.temp = t;
404 }
405
406 constexpr Temp getTemp() const noexcept
407 {
408 return data_.temp;
409 }
410
411 constexpr uint32_t tempId() const noexcept
412 {
413 return data_.temp.id();
414 }
415
416 constexpr bool hasRegClass() const noexcept
417 {
418 return isTemp() || isUndefined();
419 }
420
421 constexpr RegClass regClass() const noexcept
422 {
423 return data_.temp.regClass();
424 }
425
426 constexpr unsigned size() const noexcept
427 {
428 if (isConstant())
429 return is64BitConst_ ? 2 : 1;
430 else
431 return data_.temp.size();
432 }
433
434 constexpr bool isFixed() const noexcept
435 {
436 return isFixed_;
437 }
438
439 constexpr PhysReg physReg() const noexcept
440 {
441 return reg_;
442 }
443
444 constexpr void setFixed(PhysReg reg) noexcept
445 {
446 isFixed_ = reg != unsigned(-1);
447 reg_ = reg;
448 }
449
450 constexpr bool isConstant() const noexcept
451 {
452 return isConstant_;
453 }
454
455 constexpr bool isLiteral() const noexcept
456 {
457 return isConstant() && reg_ == 255;
458 }
459
460 constexpr bool isUndefined() const noexcept
461 {
462 return isUndef_;
463 }
464
465 constexpr uint32_t constantValue() const noexcept
466 {
467 return data_.i;
468 }
469
470 constexpr bool constantEquals(uint32_t cmp) const noexcept
471 {
472 return isConstant() && constantValue() == cmp;
473 }
474
475 /* Indicates that the killed operand's live range intersects with the
476 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
477 * not set by liveness analysis. */
478 constexpr void setLateKill(bool flag) noexcept
479 {
480 isLateKill_ = flag;
481 }
482
483 constexpr bool isLateKill() const noexcept
484 {
485 return isLateKill_;
486 }
487
488 constexpr void setKill(bool flag) noexcept
489 {
490 isKill_ = flag;
491 if (!flag)
492 setFirstKill(false);
493 }
494
495 constexpr bool isKill() const noexcept
496 {
497 return isKill_ || isFirstKill();
498 }
499
500 constexpr void setFirstKill(bool flag) noexcept
501 {
502 isFirstKill_ = flag;
503 if (flag)
504 setKill(flag);
505 }
506
507 /* When there are multiple operands killing the same temporary,
508  * isFirstKill() only returns true for the first one. */
509 constexpr bool isFirstKill() const noexcept
510 {
511 return isFirstKill_;
512 }
513
514 constexpr bool isKillBeforeDef() const noexcept
515 {
516 return isKill() && !isLateKill();
517 }
518
519 constexpr bool isFirstKillBeforeDef() const noexcept
520 {
521 return isFirstKill() && !isLateKill();
522 }
523
524 private:
525 union {
526 uint32_t i;
527 float f;
528 Temp temp = Temp(0, s1);
529 } data_;
530 PhysReg reg_;
531 union {
532 struct {
533 uint8_t isTemp_:1;
534 uint8_t isFixed_:1;
535 uint8_t isConstant_:1;
536 uint8_t isKill_:1;
537 uint8_t isUndef_:1;
538 uint8_t isFirstKill_:1;
539 uint8_t is64BitConst_:1;
540 uint8_t isLateKill_:1;
541 };
542 /* can't initialize bit-fields in c++11, so work around using a union */
543 uint8_t control_ = 0;
544 };
545 };
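/* Illustrative sketch (SSA ids are hypothetical): how Operands are typically
 * constructed, and what the late-kill flag added in this change is for.
 *
 *    Operand a(Temp(5, v1));      // temporary; RA assigns the physical register
 *    Operand b(4u);               // inline constant, pre-fixed to PhysReg{132}
 *    Operand c(0x12345678u);      // literal constant, pre-fixed to PhysReg{255}
 *
 *    // Late kill: the operand is killed by this instruction, but its register
 *    // must stay intact until after the definitions are written, so RA must
 *    // not assign any definition to the same register:
 *    a.setLateKill(true);
 *    // a.isKillBeforeDef() then stays false even once liveness marks a killed
 */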
546
547 /**
548 * Definition Class
549 * Definitions are the results of Instructions
550 * and refer to temporary virtual registers
551 * which are later mapped to physical registers
552 */
553 class Definition final
554 {
555 public:
556 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
557 Definition(uint32_t index, RegClass type) noexcept
558 : temp(index, type) {}
559 explicit Definition(Temp tmp) noexcept
560 : temp(tmp) {}
561 Definition(PhysReg reg, RegClass type) noexcept
562 : temp(Temp(0, type))
563 {
564 setFixed(reg);
565 }
566 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
567 : temp(Temp(tmpId, type))
568 {
569 setFixed(reg);
570 }
571
572 constexpr bool isTemp() const noexcept
573 {
574 return tempId() > 0;
575 }
576
577 constexpr Temp getTemp() const noexcept
578 {
579 return temp;
580 }
581
582 constexpr uint32_t tempId() const noexcept
583 {
584 return temp.id();
585 }
586
587 constexpr void setTemp(Temp t) noexcept {
588 temp = t;
589 }
590
591 constexpr RegClass regClass() const noexcept
592 {
593 return temp.regClass();
594 }
595
596 constexpr unsigned size() const noexcept
597 {
598 return temp.size();
599 }
600
601 constexpr bool isFixed() const noexcept
602 {
603 return isFixed_;
604 }
605
606 constexpr PhysReg physReg() const noexcept
607 {
608 return reg_;
609 }
610
611 constexpr void setFixed(PhysReg reg) noexcept
612 {
613 isFixed_ = 1;
614 reg_ = reg;
615 }
616
617 constexpr void setHint(PhysReg reg) noexcept
618 {
619 hasHint_ = 1;
620 reg_ = reg;
621 }
622
623 constexpr bool hasHint() const noexcept
624 {
625 return hasHint_;
626 }
627
628 constexpr void setKill(bool flag) noexcept
629 {
630 isKill_ = flag;
631 }
632
633 constexpr bool isKill() const noexcept
634 {
635 return isKill_;
636 }
637
638 private:
639 Temp temp = Temp(0, s1);
640 PhysReg reg_;
641 union {
642 struct {
643 uint8_t isFixed_:1;
644 uint8_t hasHint_:1;
645 uint8_t isKill_:1;
646 };
647 /* can't initialize bit-fields in c++11, so work around using a union */
648 uint8_t control_ = 0;
649 };
650 };
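/* Illustrative sketch (SSA ids are hypothetical): a Definition usually wraps a
 * fresh Temp; fixing one to a physical register is used e.g. for a carry-out
 * that must land in VCC.
 *
 *    Definition dst(Temp(7, v1));   // virtual register, allocated by RA later
 *    Definition carry(vcc, s2);     // pre-fixed: carry.isFixed() is true and
 *                                   // carry.physReg() == vcc
 */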
651
652 class Block;
653
654 struct Instruction {
655 aco_opcode opcode;
656 Format format;
657 uint32_t pass_flags;
658
659 aco::span<Operand> operands;
660 aco::span<Definition> definitions;
661
662 constexpr bool isVALU() const noexcept
663 {
664 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
665 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
666 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
667 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
668 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
669 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
670 }
671
672 constexpr bool isSALU() const noexcept
673 {
674 return format == Format::SOP1 ||
675 format == Format::SOP2 ||
676 format == Format::SOPC ||
677 format == Format::SOPK ||
678 format == Format::SOPP;
679 }
680
681 constexpr bool isVMEM() const noexcept
682 {
683 return format == Format::MTBUF ||
684 format == Format::MUBUF ||
685 format == Format::MIMG;
686 }
687
688 constexpr bool isDPP() const noexcept
689 {
690 return (uint16_t) format & (uint16_t) Format::DPP;
691 }
692
693 constexpr bool isVOP3() const noexcept
694 {
695 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
696 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
697 format == Format::VOP3P;
698 }
699
700 constexpr bool isSDWA() const noexcept
701 {
702 return (uint16_t) format & (uint16_t) Format::SDWA;
703 }
704
705 constexpr bool isFlatOrGlobal() const noexcept
706 {
707 return format == Format::FLAT || format == Format::GLOBAL;
708 }
709
710 constexpr bool usesModifiers() const noexcept;
711
712 constexpr bool reads_exec() const noexcept
713 {
714 for (const Operand& op : operands) {
715 if (op.isFixed() && op.physReg() == exec)
716 return true;
717 }
718 return false;
719 }
720 };
721
722 struct SOPK_instruction : public Instruction {
723 uint16_t imm;
724 };
725
726 struct SOPP_instruction : public Instruction {
727 uint32_t imm;
728 int block;
729 };
730
731 struct SOPC_instruction : public Instruction {
732 };
733
734 struct SOP1_instruction : public Instruction {
735 };
736
737 struct SOP2_instruction : public Instruction {
738 };
739
740 /**
741 * Scalar Memory Format:
742 * For s_(buffer_)load_dword*:
743 * Operand(0): SBASE - SGPR-pair which provides base address
744 * Operand(1): Offset - immediate (un)signed offset or SGPR
745 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
746 * Operand(n-1): SOffset - SGPR offset (Vega only)
747 *
748 * Having no operands is also valid for instructions such as s_dcache_inv.
749 *
750 */
751 struct SMEM_instruction : public Instruction {
752 bool glc : 1; /* VI+: globally coherent */
753 bool dlc : 1; /* NAVI: device level coherent */
754 bool nv : 1; /* VEGA only: Non-volatile */
755 bool can_reorder : 1;
756 bool disable_wqm : 1;
757 barrier_interaction barrier;
758 };
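/* Illustrative sketch of the operand layout described above, assuming
 * hypothetical s2 temporaries `base` (the SGPR-pair base address) and `dst`
 * (the result); create_instruction and aco_ptr are declared further below.
 *
 *    aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(
 *       aco_opcode::s_load_dwordx2, Format::SMEM, 2, 1)};
 *    load->operands[0] = Operand(base);       // SBASE
 *    load->operands[1] = Operand(16u);        // immediate byte offset
 *    load->definitions[0] = Definition(dst);  // SDATA
 */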
759
760 struct VOP1_instruction : public Instruction {
761 };
762
763 struct VOP2_instruction : public Instruction {
764 };
765
766 struct VOPC_instruction : public Instruction {
767 };
768
769 struct VOP3A_instruction : public Instruction {
770 bool abs[3];
771 bool neg[3];
772 uint8_t opsel : 4;
773 uint8_t omod : 2;
774 bool clamp : 1;
775 };
776
777 /**
778 * Data Parallel Primitives Format:
779 * This format can be used for VOP1, VOP2 or VOPC instructions.
780 * The swizzle applies to the src0 operand.
781 *
782 */
783 struct DPP_instruction : public Instruction {
784 bool abs[2];
785 bool neg[2];
786 uint16_t dpp_ctrl;
787 uint8_t row_mask : 4;
788 uint8_t bank_mask : 4;
789 bool bound_ctrl : 1;
790 };
791
792 struct Interp_instruction : public Instruction {
793 uint8_t attribute;
794 uint8_t component;
795 };
796
797 /**
798 * Local and Global Data Sharing instructions
799 * Operand(0): ADDR - VGPR which supplies the address.
800 * Operand(1): DATA0 - First data VGPR.
801 * Operand(2): DATA1 - Second data VGPR.
802 * Operand(n-1): M0 - LDS size.
803 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
804 *
805 */
806 struct DS_instruction : public Instruction {
807 int16_t offset0;
808 int8_t offset1;
809 bool gds;
810 };
811
812 /**
813 * Vector Memory Untyped-buffer Instructions
814 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
815 * Operand(1): VADDR - Address source. Can carry an index and/or offset
816 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
817 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
818 *
819 */
820 struct MUBUF_instruction : public Instruction {
821 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
822 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
823 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
824 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
825 bool glc : 1; /* globally coherent */
826 bool dlc : 1; /* NAVI: device level coherent */
827 bool slc : 1; /* system level coherent */
828 bool tfe : 1; /* texture fail enable */
829 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
830 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
831 bool can_reorder : 1;
832 barrier_interaction barrier;
833 };
834
835 /**
836 * Vector Memory Typed-buffer Instructions
837 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
838 * Operand(1): VADDR - Address source. Can carry an index and/or offset
839 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
840 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
841 *
842 */
843 struct MTBUF_instruction : public Instruction {
844 uint16_t offset; /* Unsigned byte offset - 12 bit */
845 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
846 uint8_t nfmt : 3; /* Numeric format of data in memory */
847 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
848 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
849 bool glc : 1; /* globally coherent */
850 bool dlc : 1; /* NAVI: device level coherent */
851 bool slc : 1; /* system level coherent */
852 bool tfe : 1; /* texture fail enable */
853 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
854 bool can_reorder : 1;
855 barrier_interaction barrier;
856 };
857
858 /**
859 * Vector Memory Image Instructions
860  * Operand(0): SRSRC - Scalar GPR that specifies the resource constant.
861 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
862 * or VDATA - Vector GPR for write data.
863 * Operand(2): VADDR - Address source. Can carry an offset or an index.
864 * Definition(0): VDATA - Vector GPR for read result.
865 *
866 */
867 struct MIMG_instruction : public Instruction {
868 uint8_t dmask; /* Data VGPR enable mask */
869 uint8_t dim : 3; /* NAVI: dimensionality */
870 bool unrm : 1; /* Force address to be un-normalized */
871 bool dlc : 1; /* NAVI: device level coherent */
872 bool glc : 1; /* globally coherent */
873 bool slc : 1; /* system level coherent */
874 bool tfe : 1; /* texture fail enable */
875 bool da : 1; /* declare an array */
876   bool lwe : 1; /* LOD warning enable */
877 bool r128 : 1; /* NAVI: Texture resource size */
878 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
879 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
880 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
881 bool can_reorder : 1;
882 barrier_interaction barrier;
883 };
884
885 /**
886 * Flat/Scratch/Global Instructions
887 * Operand(0): ADDR
888 * Operand(1): SADDR
889 * Operand(2) / Definition(0): DATA/VDST
890 *
891 */
892 struct FLAT_instruction : public Instruction {
893 uint16_t offset; /* Vega/Navi only */
894 bool slc : 1; /* system level coherent */
895 bool glc : 1; /* globally coherent */
896 bool dlc : 1; /* NAVI: device level coherent */
897 bool lds : 1;
898 bool nv : 1;
899 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
900 bool can_reorder : 1;
901 barrier_interaction barrier;
902 };
903
904 struct Export_instruction : public Instruction {
905 uint8_t enabled_mask;
906 uint8_t dest;
907 bool compressed : 1;
908 bool done : 1;
909 bool valid_mask : 1;
910 };
911
912 struct Pseudo_instruction : public Instruction {
913 bool tmp_in_scc;
914 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
915 };
916
917 struct Pseudo_branch_instruction : public Instruction {
918 /* target[0] is the block index of the branch target.
919 * For conditional branches, target[1] contains the fall-through alternative.
920 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
921 */
922 uint32_t target[2];
923 };
924
925 struct Pseudo_barrier_instruction : public Instruction {
926 };
927
928 enum ReduceOp {
929 iadd32, iadd64,
930 imul32, imul64,
931 fadd32, fadd64,
932 fmul32, fmul64,
933 imin32, imin64,
934 imax32, imax64,
935 umin32, umin64,
936 umax32, umax64,
937 fmin32, fmin64,
938 fmax32, fmax64,
939 iand32, iand64,
940 ior32, ior64,
941 ixor32, ixor64,
942 gfx10_wave64_bpermute
943 };
944
945 /**
946  * Subgroup Reduction Instructions: everything except for the data to be
947  * reduced and the result is inserted by setup_reduce_temp().
948 * Operand(0): data to be reduced
949 * Operand(1): reduce temporary
950 * Operand(2): vector temporary
951 * Definition(0): result
952 * Definition(1): scalar temporary
953 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
954 * Definition(3): scc clobber
955 * Definition(4): vcc clobber
956 *
957 */
958 struct Pseudo_reduction_instruction : public Instruction {
959 ReduceOp reduce_op;
960 unsigned cluster_size; // must be 0 for scans
961 };
962
963 struct instr_deleter_functor {
964 void operator()(void* p) {
965 free(p);
966 }
967 };
968
969 template<typename T>
970 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
971
972 template<typename T>
973 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
974 {
975 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
976 char *data = (char*) calloc(1, size);
977 T* inst = (T*) data;
978
979 inst->opcode = opcode;
980 inst->format = format;
981
982 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
983 inst->operands = aco::span<Operand>(operands_offset, num_operands);
984 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
985 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
986
987 return inst;
988 }
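/* Illustrative sketch of typical usage: the Operand and Definition arrays live
 * in the same allocation directly behind the instruction struct, and the
 * result is normally owned by an aco_ptr so a single free() releases
 * everything at once. The SSA id below is hypothetical.
 *
 *    aco_ptr<VOP1_instruction> mov{create_instruction<VOP1_instruction>(
 *       aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
 *    mov->operands[0] = Operand(0u);                 // inline constant 0
 *    mov->definitions[0] = Definition(Temp(1, v1));
 */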
989
990 constexpr bool Instruction::usesModifiers() const noexcept
991 {
992 if (isDPP() || isSDWA())
993 return true;
994 if (!isVOP3())
995 return false;
996 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
997 for (unsigned i = 0; i < operands.size(); i++) {
998 if (vop3->abs[i] || vop3->neg[i])
999 return true;
1000 }
1001 return vop3->opsel || vop3->clamp || vop3->omod;
1002 }
1003
1004 constexpr bool is_phi(Instruction* instr)
1005 {
1006 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1007 }
1008
1009 static inline bool is_phi(aco_ptr<Instruction>& instr)
1010 {
1011 return is_phi(instr.get());
1012 }
1013
1014 barrier_interaction get_barrier_interaction(Instruction* instr);
1015
1016 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1017
1018 enum block_kind {
1019   /* uniform indicates that when leaving this block,
1020    * all active lanes stay active */
1021 block_kind_uniform = 1 << 0,
1022 block_kind_top_level = 1 << 1,
1023 block_kind_loop_preheader = 1 << 2,
1024 block_kind_loop_header = 1 << 3,
1025 block_kind_loop_exit = 1 << 4,
1026 block_kind_continue = 1 << 5,
1027 block_kind_break = 1 << 6,
1028 block_kind_continue_or_break = 1 << 7,
1029 block_kind_discard = 1 << 8,
1030 block_kind_branch = 1 << 9,
1031 block_kind_merge = 1 << 10,
1032 block_kind_invert = 1 << 11,
1033 block_kind_uses_discard_if = 1 << 12,
1034 block_kind_needs_lowering = 1 << 13,
1035 block_kind_uses_demote = 1 << 14,
1036 block_kind_export_end = 1 << 15,
1037 };
1038
1039
1040 struct RegisterDemand {
1041 constexpr RegisterDemand() = default;
1042 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1043 : vgpr{v}, sgpr{s} {}
1044 int16_t vgpr = 0;
1045 int16_t sgpr = 0;
1046
1047 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1048 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1049 }
1050
1051 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1052 return vgpr > other.vgpr || sgpr > other.sgpr;
1053 }
1054
1055 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1056 if (t.type() == RegType::sgpr)
1057 return RegisterDemand( vgpr, sgpr + t.size() );
1058 else
1059 return RegisterDemand( vgpr + t.size(), sgpr );
1060 }
1061
1062 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1063 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1064 }
1065
1066 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1067 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1068 }
1069
1070 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1071 vgpr += other.vgpr;
1072 sgpr += other.sgpr;
1073 return *this;
1074 }
1075
1076 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1077 vgpr -= other.vgpr;
1078 sgpr -= other.sgpr;
1079 return *this;
1080 }
1081
1082 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1083 if (t.type() == RegType::sgpr)
1084 sgpr += t.size();
1085 else
1086 vgpr += t.size();
1087 return *this;
1088 }
1089
1090 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1091 if (t.type() == RegType::sgpr)
1092 sgpr -= t.size();
1093 else
1094 vgpr -= t.size();
1095 return *this;
1096 }
1097
1098 constexpr void update(const RegisterDemand other) noexcept {
1099 vgpr = std::max(vgpr, other.vgpr);
1100 sgpr = std::max(sgpr, other.sgpr);
1101 }
1102
1103 };
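/* Illustrative sketch of the demand arithmetic: adding a Temp bumps the
 * matching component by its size in dwords, update() takes the per-component
 * maximum, and exceeds() compares against a limit. Values are hypothetical.
 *
 *    RegisterDemand demand(2, 4);          // 2 vgprs, 4 sgprs
 *    demand += Temp(1, v2);                // vgpr = 4, sgpr = 4
 *    demand.update(RegisterDemand(3, 6));  // vgpr = 4, sgpr = 6
 *    bool too_big = demand.exceeds(RegisterDemand(256, 104));  // false
 */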
1104
1105 /* CFG */
1106 struct Block {
1107 float_mode fp_mode;
1108 unsigned index;
1109 unsigned offset = 0;
1110 std::vector<aco_ptr<Instruction>> instructions;
1111 std::vector<unsigned> logical_preds;
1112 std::vector<unsigned> linear_preds;
1113 std::vector<unsigned> logical_succs;
1114 std::vector<unsigned> linear_succs;
1115 RegisterDemand register_demand = RegisterDemand();
1116 uint16_t loop_nest_depth = 0;
1117 uint16_t kind = 0;
1118 int logical_idom = -1;
1119 int linear_idom = -1;
1120 Temp live_out_exec = Temp();
1121
1122 /* this information is needed for predecessors to blocks with phis when
1123 * moving out of ssa */
1124 bool scc_live_out = false;
1125   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out is set */
1126
1127 Block(unsigned idx) : index(idx) {}
1128 Block() : index(0) {}
1129 };
1130
1131 using Stage = uint16_t;
1132
1133 /* software stages */
1134 static constexpr Stage sw_vs = 1 << 0;
1135 static constexpr Stage sw_gs = 1 << 1;
1136 static constexpr Stage sw_tcs = 1 << 2;
1137 static constexpr Stage sw_tes = 1 << 3;
1138 static constexpr Stage sw_fs = 1 << 4;
1139 static constexpr Stage sw_cs = 1 << 5;
1140 static constexpr Stage sw_gs_copy = 1 << 6;
1141 static constexpr Stage sw_mask = 0x7f;
1142
1143 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1144 static constexpr Stage hw_vs = 1 << 7;
1145 static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1146 static constexpr Stage hw_gs = 1 << 9;
1147 static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1148 static constexpr Stage hw_hs = 1 << 11;
1149 static constexpr Stage hw_fs = 1 << 12;
1150 static constexpr Stage hw_cs = 1 << 13;
1151 static constexpr Stage hw_mask = 0x7f << 7;
1152
1153 /* possible settings of Program::stage */
1154 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1155 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1156 static constexpr Stage compute_cs = sw_cs | hw_cs;
1157 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1158 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1159 /* GFX10/NGG */
1160 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1161 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1162 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1163 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1164 /* GFX9 (and GFX10 if NGG isn't used) */
1165 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1166 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1167 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1168 /* pre-GFX9 */
1169 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1170 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1171 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1172 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1173 static constexpr Stage geometry_gs = sw_gs | hw_gs;
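/* Illustrative sketch: a Stage value combines the software stage(s) being
 * compiled with the hardware stage they run on, so both halves can be tested
 * independently.
 *
 *    Stage s = vertex_geometry_gs;        // merged VS+GS (GFX9+, non-NGG)
 *    bool has_gs      = s & sw_gs;        // true
 *    bool runs_as_gs  = s & hw_gs;        // true
 *    bool is_plain_vs = s == vertex_vs;   // false
 */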
1174
1175 class Program final {
1176 public:
1177 float_mode next_fp_mode;
1178 std::vector<Block> blocks;
1179 RegisterDemand max_reg_demand = RegisterDemand();
1180 uint16_t num_waves = 0;
1181 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1182 ac_shader_config* config;
1183 struct radv_shader_info *info;
1184 enum chip_class chip_class;
1185 enum radeon_family family;
1186 unsigned wave_size;
1187 RegClass lane_mask;
1188 Stage stage; /* Stage */
1189 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1190 bool needs_wqm = false; /* there exists a p_wqm instruction */
1191 bool wb_smem_l1_on_end = false;
1192
1193 std::vector<uint8_t> constant_data;
1194 Temp private_segment_buffer;
1195 Temp scratch_offset;
1196
1197 uint16_t min_waves = 0;
1198 uint16_t lds_alloc_granule;
1199 uint32_t lds_limit; /* in bytes */
1200 uint16_t vgpr_limit;
1201 uint16_t sgpr_limit;
1202 uint16_t physical_sgprs;
1203 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1204 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1205
1206 bool needs_vcc = false;
1207 bool needs_xnack_mask = false;
1208 bool needs_flat_scr = false;
1209
1210 uint32_t allocateId()
1211 {
1212 assert(allocationID <= 16777215);
1213 return allocationID++;
1214 }
1215
1216 uint32_t peekAllocationId()
1217 {
1218 return allocationID;
1219 }
1220
1221 void setAllocationId(uint32_t id)
1222 {
1223 allocationID = id;
1224 }
1225
1226 Block* create_and_insert_block() {
1227 blocks.emplace_back(blocks.size());
1228 blocks.back().fp_mode = next_fp_mode;
1229 return &blocks.back();
1230 }
1231
1232 Block* insert_block(Block&& block) {
1233 block.index = blocks.size();
1234 block.fp_mode = next_fp_mode;
1235 blocks.emplace_back(std::move(block));
1236 return &blocks.back();
1237 }
1238
1239 private:
1240 uint32_t allocationID = 1;
1241 };
1242
1243 struct live {
1244 /* live temps out per block */
1245 std::vector<std::set<Temp>> live_out;
1246 /* register demand (sgpr/vgpr) per instruction per block */
1247 std::vector<std::vector<RegisterDemand>> register_demand;
1248 };
1249
1250 void select_program(Program *program,
1251 unsigned shader_count,
1252 struct nir_shader *const *shaders,
1253 ac_shader_config* config,
1254 struct radv_shader_args *args);
1255 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1256 ac_shader_config* config,
1257 struct radv_shader_args *args);
1258
1259 void lower_wqm(Program* program, live& live_vars,
1260 const struct radv_nir_compiler_options *options);
1261 void lower_bool_phis(Program* program);
1262 void calc_min_waves(Program* program);
1263 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1264 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1265 std::vector<uint16_t> dead_code_analysis(Program *program);
1266 void dominator_tree(Program* program);
1267 void insert_exec_mask(Program *program);
1268 void value_numbering(Program* program);
1269 void optimize(Program* program);
1270 void setup_reduce_temp(Program* program);
1271 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1272 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1273 void ssa_elimination(Program* program);
1274 void lower_to_hw_instr(Program* program);
1275 void schedule_program(Program* program, live& live_vars);
1276 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1277 void insert_wait_states(Program* program);
1278 void insert_NOPs(Program* program);
1279 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1280 void print_asm(Program *program, std::vector<uint32_t>& binary,
1281 unsigned exec_size, std::ostream& out);
1282 void validate(Program* program, FILE *output);
1283 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1284 #ifndef NDEBUG
1285 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1286 #else
1287 #define perfwarn(cond, msg, ...) do {} while(0)
1288 #endif
1289
1290 void aco_print_instr(Instruction *instr, FILE *output);
1291 void aco_print_program(Program *program, FILE *output);
1292
1293 /* utilities for dealing with register demand */
1294 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1295 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1296 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1297
1298 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1299 uint16_t get_extra_sgprs(Program *program);
1300
1301 /* get the number of sgprs/vgprs that must be allocated to address a given number of sgprs/vgprs */
1302 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1303 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1304
1305 /* return number of addressable sgprs/vgprs for max_waves */
1306 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1307 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1308
1309 typedef struct {
1310 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1311 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1312 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1313 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1314 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1315 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1316 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1317 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1318 } Info;
1319
1320 extern const Info instr_info;
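/* Illustrative sketch: instr_info is indexed by opcode, e.g. to query the
 * printable name or base encoding of an instruction:
 *
 *    const char *name = instr_info.name[(int)aco_opcode::v_mov_b32];
 *    Format base_fmt  = instr_info.format[(int)aco_opcode::v_mov_b32];
 */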
1321
1322 }
1323
1324 #endif /* ACO_IR_H */
1325