fb0c9beb2082166dbf712708a6b448cbd078fbc8
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
/* runtime-selectable debug behavior, tested via (debug_flags & DEBUG_*) */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,     /* run IR validation between passes */
   DEBUG_VALIDATE_RA = 0x2,  /* validate the register assignment */
   DEBUG_PERFWARN = 0x4,     /* emit performance warnings */
};
53
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats: single bits so they can be OR'ed with a base
    * format as described above */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   /* note: VOP3, VOP3A and VOP3B deliberately share the same bit */
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
105
/* Bitmask describing which kinds of memory an instruction interacts with;
 * carried by the memory-instruction structs below (SMEM, MUBUF, MTBUF,
 * MIMG, FLAT). */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* NOTE(review): there are 7 flag bits above (0x1..0x40) but the count is
    * 6 — confirm whether barrier_barrier is deliberately excluded from
    * per-barrier-kind loops by consumers of this value. */
   barrier_count = 6,
};
121
/* rounding modes as encoded in the hardware MODE register (2 bits each) */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE register bits must be identical, and any
    * guarantee that "other" demands must also be provided by this mode. */
   bool canReplace(float_mode other) const noexcept {
      if (val != other.val)
         return false;
      if (other.preserve_signed_zero_inf_nan32 && !preserve_signed_zero_inf_nan32)
         return false;
      if (other.preserve_signed_zero_inf_nan16_64 && !preserve_signed_zero_inf_nan16_64)
         return false;
      if (other.must_flush_denorms32 && !must_flush_denorms32)
         return false;
      if (other.must_flush_denorms16_64 && !must_flush_denorms16_64)
         return false;
      if (other.care_about_round32 && !care_about_round32)
         return false;
      if (other.care_about_round16_64 && !care_about_round16_64)
         return false;
      return true;
   }
};
168
169 constexpr Format asVOP3(Format format) {
170 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
171 };
172
173 constexpr Format asSDWA(Format format) {
174 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
175 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
176 }
177
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

/**
 * Register class: encodes the type and size of a value.
 * Bit layout of the underlying RC byte:
 *   bits 0-4: size (in dwords, or in bytes for sub-dword classes)
 *   bit 5:    vgpr (otherwise sgpr)
 *   bit 6:    linear (used for WWM and spills to vgpr)
 *   bit 7:    sub-dword (byte-sized) class
 */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* all sgpr classes have values <= s16; everything above carries the vgpr bit */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* the size field counts bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* sgprs are always linear */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

private:
   RC rc;
};

/* transitional helper expressions
 * Fix: s6 was declared in RegClass::RC but missing from this list — added
 * for consistency with the other enumerators. */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s6{RegClass::s6};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
258
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 * Packed into 32 bits: 24-bit id, 8-bit register class.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   /* convenience forwarders to the register class */
   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* comparisons consider only the SSA id, not the register class */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;
   uint32_t reg_class : 8;
};
286
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * Stored with byte granularity: reg_b is a byte address, so the register
 * index is reg_b / 4 and the sub-dword byte offset is reg_b % 4.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r * 4) {}
   /* dword register index */
   constexpr unsigned reg() const { return reg_b / 4u; }
   /* byte offset within the register, 0-3 */
   constexpr unsigned byte() const { return reg_b % 4u; }
   constexpr operator unsigned() const { return reg(); }
   /* equality and ordering compare the full byte address */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }

   uint16_t reg_b = 0;
};
304
/* helper expressions for special registers: fixed hardware SGPR / pseudo
 * register indices used by the operand/definition encodings below */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126}; /* same index as exec: exec is an SGPR pair */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
316
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 *
 * Constants use the hardware's scalar-source encoding in reg_:
 *   128..192 -> integers 0..64
 *   193..208 -> integers -1..-16
 *   240..247 -> +-0.5, +-1.0, +-2.0, +-4.0
 *   255      -> literal constant (takes an extra dword in the encoding)
 */
class Operand final
{
public:
   /* default: an undefined operand, fixed to PhysReg 128 (encoding of the
    * inline constant 0) */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   /* operand referring to a temporary; id 0 means "no temporary" and
    * produces an undefined operand instead */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant; picks the inline-constant register encoding when
    * possible, otherwise a literal (reg 255) */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v}); /* unsigned wrap-around: 192 - 0xFFFFFFFF == 193 */
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant; only values expressible as hardware inline constants
    * are supported (64-bit literals don't exist). data_.i keeps the
    * equivalent 32-bit pattern so constantValue() stays meaningful. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* undefined operand of a given register class */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* non-temp operand fixed to a specific register (e.g. exec, m0, scc) */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* a register class is only meaningful for temps and undefined operands */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      /* NOTE(review): PhysReg converts to unsigned via reg(), which is at
       * most 0x3FFF after the uint16_t truncation, so this comparison with
       * unsigned(-1) appears to always yield true — confirm intent. */
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* literal = constant that needs an extra encoding dword (reg 255) */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* reconstructs the 64-bit value from the inline-constant register
    * encoding; for 32-bit constants, optionally sign-extends data_.i */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128;                             /* 0..64 */
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);      /* -1..-16 */

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* setting first-kill implies kill */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   /* structural equality: same kind of operand (literal / inline constant /
    * undef / temp) with matching size, fixed register and kill state */
   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
630
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition fixed to a specific register without an SSA temp */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 is reserved for "no temporary" */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* note: the hint shares reg_ with setFixed(), so a later setFixed()
    * overwrites the hinted register (and vice versa) */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   /* set when the definition's value is unused (dead) */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
740
741 class Block;
742
/* Base of all IR instructions; the concrete *_instruction structs below
 * derive from it and append format-specific fields. The format field may
 * combine a VALU base format with the VOP3/DPP/SDWA bits (see Format). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch space for passes; meaning depends on the current pass */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* true for any vector-ALU encoding, including VOP3/VOP3P combinations */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory instructions (FLAT/GLOBAL/SCRATCH are separate) */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   /* VOP3A and VOP3B are the same bit in Format, so the first two tests
    * are equivalent */
   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* only detects operands explicitly fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16);
811
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit inline immediate of the SOPK encoding */
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4);

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   /* block index for branching SOPPs — presumably unused/-1 otherwise;
    * confirm against emission code */
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8);

struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0);

struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0);

struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0);
835
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   barrier_interaction barrier; /* memory-hazard classes this access touches */
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1;
   bool disable_wqm : 1;
   uint32_t padding: 19;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4);
857
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0);

struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0);

struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0);

struct VOP3A_instruction : public Instruction {
   bool abs[3];      /* per-source absolute-value modifier */
   bool neg[3];      /* per-source negate modifier */
   uint8_t opsel : 4; /* 16-bit operand selection */
   uint8_t omod : 2;  /* output modifier (*2, *4, /2) */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8);

/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;        /* lane-swizzle control pattern */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8);
896
/* SDWA source/destination selection: which bytes/words of the dword are
 * read or written, plus sign-extension and register-allocated variants.
 * The low bits select the byte/word number; flags mark the granularity. */
enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated: the concrete byte/word position is determined by
    * the assigned sub-dword register; size encoded in the low bits */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
932
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];   /* source byte/word selection (sdwa_sel values) */
   uint8_t dst_sel;  /* destination byte/word selection */
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1; /* keep the unwritten bytes of the destination */
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8);

struct Interp_instruction : public Instruction {
   uint8_t attribute; /* interpolated attribute index */
   uint8_t component; /* component (x/y/z/w) within the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4);
961
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   /* NOTE(review): offsets are declared signed here although the hardware
    * offset fields are unsigned immediates — confirm against the emitter. */
   int16_t offset0;
   int8_t offset1;
   bool gds; /* operate on GDS instead of LDS */
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4);
977
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 2;
   barrier_interaction barrier; /* memory-hazard classes this access touches */
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4);

/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   barrier_interaction barrier;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint32_t padding : 25;
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8);
1027
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable — original "un-normalized" note looks
                  * copy-pasted from unrm; confirm against the ISA docs */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4);

/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4);
1077
/* Export of pixel/vertex results (exp encoding). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which of the data operands are valid/written */
   uint8_t dest; /* export target slot */
   bool compressed : 1; /* data is 16-bit compressed (two components per operand) */
   bool done : 1; /* this is the final export of the shader */
   bool valid_mask : 1;
   uint32_t padding : 13; /* pads the bit-fields; keeps the static_assert below true */
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4);
1087
/* Generic pseudo instruction (lowered to real instructions in lower_to_hw_instr). */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* whether a temporary value lives in scc and must be preserved while lowering */
   uint8_t padding; /* explicit padding; keeps the static_assert below true */
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4);
1094
/* Pseudo branch, lowered to a real branch once block offsets are known. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8);
1103
/* Barrier pseudo instruction; carries no extra payload beyond the opcode. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0);
1107
/* Operation performed by a subgroup reduction; the numeric suffix is the
 * bit width of the reduced data. Order/values must stay stable — they are
 * stored in Pseudo_reduction_instruction::reduce_op. */
enum ReduceOp : uint16_t {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* NOTE(review): not an arithmetic reduction — presumably a wave64 bpermute helper on GFX10; confirm */
};
1124
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op; /* which reduction to perform (see ReduceOp) */
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4);
1143
/* Instructions are allocated with calloc (see create_instruction), so they
 * must be released with free() instead of operator delete. This deleter makes
 * that work with std::unique_ptr. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};

/* Owning pointer to a malloc/calloc-allocated instruction. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1152
/* Allocates and zero-initializes (calloc) an instruction of type T plus its
 * Operand and Definition arrays in one contiguous block:
 *   [T][Operand x num_operands][Definition x num_definitions]
 * The result must be freed with free() (see instr_deleter_functor / aco_ptr).
 * The operands/definitions spans store 16-bit offsets relative to the span
 * member's own address, pointing into the trailing storage (see aco_util.h). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* the pointer difference must fit in 16 bits; the operand array starts
    * immediately after the T object */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   /* definitions start right after the last operand */
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1170
1171 constexpr bool Instruction::usesModifiers() const noexcept
1172 {
1173 if (isDPP() || isSDWA())
1174 return true;
1175 if (!isVOP3())
1176 return false;
1177 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1178 for (unsigned i = 0; i < operands.size(); i++) {
1179 if (vop3->abs[i] || vop3->neg[i])
1180 return true;
1181 }
1182 return vop3->opsel || vop3->clamp || vop3->omod;
1183 }
1184
1185 constexpr bool is_phi(Instruction* instr)
1186 {
1187 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1188 }
1189
1190 static inline bool is_phi(aco_ptr<Instruction>& instr)
1191 {
1192 return is_phi(instr.get());
1193 }
1194
/* Classifies which barrier_interaction bits apply to 'instr'. */
barrier_interaction get_barrier_interaction(Instruction* instr);

/* Returns whether 'instr' can be removed; 'uses' holds per-temp use counts
 * (see dead_code_analysis) — NOTE(review): presumably indexed by temp id; confirm. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1198
/* Bit flags describing a block's role in structured control flow.
 * OR-ed together into Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11, /* second half of a divergent if: exec is inverted here */
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1219
1220
1221 struct RegisterDemand {
1222 constexpr RegisterDemand() = default;
1223 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1224 : vgpr{v}, sgpr{s} {}
1225 int16_t vgpr = 0;
1226 int16_t sgpr = 0;
1227
1228 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1229 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1230 }
1231
1232 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1233 return vgpr > other.vgpr || sgpr > other.sgpr;
1234 }
1235
1236 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1237 if (t.type() == RegType::sgpr)
1238 return RegisterDemand( vgpr, sgpr + t.size() );
1239 else
1240 return RegisterDemand( vgpr + t.size(), sgpr );
1241 }
1242
1243 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1244 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1245 }
1246
1247 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1248 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1249 }
1250
1251 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1252 vgpr += other.vgpr;
1253 sgpr += other.sgpr;
1254 return *this;
1255 }
1256
1257 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1258 vgpr -= other.vgpr;
1259 sgpr -= other.sgpr;
1260 return *this;
1261 }
1262
1263 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1264 if (t.type() == RegType::sgpr)
1265 sgpr += t.size();
1266 else
1267 vgpr += t.size();
1268 return *this;
1269 }
1270
1271 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1272 if (t.type() == RegType::sgpr)
1273 sgpr -= t.size();
1274 else
1275 vgpr -= t.size();
1276 return *this;
1277 }
1278
1279 constexpr void update(const RegisterDemand other) noexcept {
1280 vgpr = std::max(vgpr, other.vgpr);
1281 sgpr = std::max(sgpr, other.sgpr);
1282 }
1283
1284 };
1285
/* CFG: one basic block of the program. */
struct Block {
   float_mode fp_mode; /* float behavior for this block (copied from Program::next_fp_mode on creation) */
   unsigned index; /* position of this block in Program::blocks */
   unsigned offset = 0; /* NOTE(review): presumably the code offset, filled in during emission — confirm */
   std::vector<aco_ptr<Instruction>> instructions;
   /* edges of the logical (divergence-aware) CFG and of the linear CFG */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* register pressure recorded for this block */
   uint16_t loop_nest_depth = 0; /* 0 = not inside any loop */
   uint16_t kind = 0; /* bitmask of block_kind flags */
   int logical_idom = -1; /* immediate dominator index in the logical CFG; -1 = none/not computed */
   int linear_idom = -1; /* immediate dominator index in the linear CFG; -1 = none/not computed */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1311
/* A shader stage is a combination of software-stage bits (which API shaders
 * are contained in the program) and exactly one hardware-stage bit (which
 * hardware pipeline stage executes it). */
using Stage = uint16_t;

/* software stages (bits 0-6, can be OR'd on merged-shader hardware) */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1356
/* Indices into Program::statistics (filled when Program::collect_statistics
 * is set); num_statistics is the array size, not a statistic. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1371
/* A complete shader program: the CFG plus target information and limits
 * shared by all compilation passes. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to blocks created by create_and_insert_block/insert_block */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand(); /* maximum register pressure over the whole program */
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* register class of an exec-mask sized value — NOTE(review): presumably s1 for wave32, s2 for wave64; confirm */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false; /* write back the scalar L1 cache at the end of the shader */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   /* hardware resource limits for the selected chip */
   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false; /* when set, passes fill 'statistics' (see enum statistic) */
   uint32_t statistics[num_statistics];

   /* returns a fresh temporary id; ids must fit in 24 bits (16777215 == 2^24 - 1) */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* the id the next allocateId() call would return, without consuming it */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* appends a new block; its index and fp_mode are filled in here */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* moves an existing block to the end of the program, overwriting its
    * index and fp_mode */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* id 0 is never handed out */
};
1445
/* Hashes a Temp by its id (ids are the natural key for temporaries). */
struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
using TempSet = std::unordered_set<Temp, TempHash>;
1452
/* Result of live_var_analysis(), indexed by block. */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1459
/* instruction selection: NIR -> ACO IR */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* compilation passes, roughly in pipeline order */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);

/* validation / debugging */
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
/* NOTE(review): this macro takes a leading 'program' argument that the debug
 * declaration above does not — only one of the two can match a given call
 * site; confirm and unify the signatures. */
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

/* shader statistics (see enum statistic / Program::statistics) */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of sgprs/vgprs that must be allocated to address the given number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1522
/* Per-opcode information tables, indexed by aco_opcode. The opcode_gfx*
 * arrays map an aco_opcode to its hardware encoding on that generation. */
typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* the single table instance, generated from the opcode definitions */
extern const Info instr_info;
1535
1536 }
1537
1538 #endif /* ACO_IR_H */
1539