aco: add and use RegClass::get() helper
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
/* Bitmask of debug options, combinable via the flags below. */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,    /* validate the IR after passes */
   DEBUG_VALIDATE_RA = 0x2, /* validate the result of register allocation */
   DEBUG_PERFWARN = 0x4,    /* report performance-related warnings */
};
53
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 *
 * The combinable VALU formats below occupy distinct single bits (1 << 8 and
 * up) so they can be OR'd together; all other formats are plain enumerators.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   /* VOP3, VOP3A and VOP3B are aliases for the same encoding bit */
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Kinds of memory the scheduler must not reorder an instruction across.
 * Combinable bit flags; barrier_count is the number of flag bits above. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   barrier_count = 7,
};
121
/* Floating-point rounding modes, encoded as in the hardware MODE register. */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero (truncate) */
};
128
/* Denormal handling, encoded as the 2-bit fields of the MODE register. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
135
/* Per-instruction floating-point mode plus the precision guarantees the
 * surrounding code relies on. The union's bitfields mirror the low byte of
 * the hardware MODE register so `val` can be compared/written directly. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE register bytes must match exactly, and
    * every guarantee "other" requires (each boolean flag) must also be
    * provided by this mode ((flag || !other.flag) == other implies this). */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
168
169 constexpr Format asVOP3(Format format) {
170 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
171 };
172
/* Returns the given format with the SDWA encoding bit set.
 * Only VOP1/VOP2/VOPC can carry sub-dword addressing. */
constexpr Format asSDWA(Format format) {
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
}
177
/* The register file a value lives in. linear_vgpr is a vgpr whose live range
 * is treated like an sgpr's (used for WWM and vgpr spills — see
 * RegClass::v1_linear/v2_linear). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
184
/* Register class: the type and size of a value, packed into one byte.
 *
 * Encoding of RC:
 *   bits [4:0]: size — in dwords, or in bytes for sub-dword classes
 *   bit 5:      vgpr (clear means sgpr)
 *   bit 6:      linear (sgpr classes are implicitly linear, see is_linear())
 *   bit 7:      sub-dword (size counted in bytes)
 */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   /* size is in dwords here; use get() below for byte granularity */
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* all sgpr classes compare <= s16; everything above has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up for sub-dword classes */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   /* Returns the smallest register class of `type` able to hold `bytes` bytes:
    * sgprs are rounded up to whole dwords, vgprs get a sub-dword class when
    * the size is not a multiple of 4. */
   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                             RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
245
/* transitional helper expressions — named shorthands for the common register
 * classes (e.g. v2 = 2-dword vgpr, v2b = 2-byte sub-dword vgpr) */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
267
268 /**
269 * Temp Class
270 * Each temporary virtual register has a
271 * register class (i.e. size and type)
272 * and SSA id.
273 */
274 struct Temp {
275 Temp() noexcept : id_(0), reg_class(0) {}
276 constexpr Temp(uint32_t id, RegClass cls) noexcept
277 : id_(id), reg_class(uint8_t(cls)) {}
278
279 constexpr uint32_t id() const noexcept { return id_; }
280 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
281
282 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
283 constexpr unsigned size() const noexcept { return regClass().size(); }
284 constexpr RegType type() const noexcept { return regClass().type(); }
285 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
286
287 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
288 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
289 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
290
291 private:
292 uint32_t id_: 24;
293 uint32_t reg_class : 8;
294 };
295
/**
 * PhysReg
 *
 * A physical register location for Operands and Definitions, stored with
 * byte granularity: reg_b = register_index * 4 + byte_offset, which allows
 * sub-dword locations to be represented.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r * 4) {}

   /* dword-aligned register index */
   constexpr unsigned reg() const { return reg_b / 4; }
   /* byte offset within the register (0-3) */
   constexpr unsigned byte() const { return reg_b % 4; }
   constexpr operator unsigned() const { return reg(); }

   /* comparisons use the full byte-granular encoding */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }

   uint16_t reg_b = 0;
};
313
/* helper expressions for special registers (scalar register indices and
 * pseudo source operands, numbered as in the hardware encoding) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};        /* vector condition code (low half) */
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125};  /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};       /* reads as "vcc == 0" */
static constexpr PhysReg execz{252};      /* reads as "exec == 0" */
static constexpr PhysReg scc{253};        /* scalar condition code */
325
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 *
 * Constants are fixed to the hardware inline-constant source encoding where
 * possible (regs 128..192 = integers 0..64, 193..208 = -1..-16,
 * 240..247 = selected float values, 255 = literal); reg 128 doubles as the
 * marker for undefined operands.
 */
class Operand final
{
public:
   /* creates an undefined operand */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   /* creates an operand referring to a temporary; a Temp with id 0 yields an
    * undefined operand instead */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* creates a 32-bit constant operand, fixed to the matching inline-constant
    * register, or to reg 255 if the value can only be a literal */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* creates a 64-bit constant operand; only values representable as inline
    * constants are allowed (64-bit literals do not exist). Note that data_.i
    * stores the equivalent 32-bit encoding; the full value is reconstructed
    * by constantValue64() from the fixed register. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* creates an undefined operand of the given register class */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* creates an operand fixed to a physical register (no temporary) */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* undefined operands still carry a register class */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
      else
         return data_.temp.bytes();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a constant which cannot be encoded inline and must be emitted as a
    * literal dword after the instruction */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Returns the constant as a 64-bit value: for 64-bit constants the value is
    * reconstructed from the fixed inline-constant register; for 32-bit
    * constants the (optionally sign-extended) 32-bit value is returned. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
639
640 /**
641 * Definition Class
642 * Definitions are the results of Instructions
643 * and refer to temporary virtual registers
644 * which are later mapped to physical registers
645 */
646 class Definition final
647 {
648 public:
649 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
650 Definition(uint32_t index, RegClass type) noexcept
651 : temp(index, type) {}
652 explicit Definition(Temp tmp) noexcept
653 : temp(tmp) {}
654 Definition(PhysReg reg, RegClass type) noexcept
655 : temp(Temp(0, type))
656 {
657 setFixed(reg);
658 }
659 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
660 : temp(Temp(tmpId, type))
661 {
662 setFixed(reg);
663 }
664
665 constexpr bool isTemp() const noexcept
666 {
667 return tempId() > 0;
668 }
669
670 constexpr Temp getTemp() const noexcept
671 {
672 return temp;
673 }
674
675 constexpr uint32_t tempId() const noexcept
676 {
677 return temp.id();
678 }
679
680 constexpr void setTemp(Temp t) noexcept {
681 temp = t;
682 }
683
684 constexpr RegClass regClass() const noexcept
685 {
686 return temp.regClass();
687 }
688
689 constexpr unsigned bytes() const noexcept
690 {
691 return temp.bytes();
692 }
693
694 constexpr unsigned size() const noexcept
695 {
696 return temp.size();
697 }
698
699 constexpr bool isFixed() const noexcept
700 {
701 return isFixed_;
702 }
703
704 constexpr PhysReg physReg() const noexcept
705 {
706 return reg_;
707 }
708
709 constexpr void setFixed(PhysReg reg) noexcept
710 {
711 isFixed_ = 1;
712 reg_ = reg;
713 }
714
715 constexpr void setHint(PhysReg reg) noexcept
716 {
717 hasHint_ = 1;
718 reg_ = reg;
719 }
720
721 constexpr bool hasHint() const noexcept
722 {
723 return hasHint_;
724 }
725
726 constexpr void setKill(bool flag) noexcept
727 {
728 isKill_ = flag;
729 }
730
731 constexpr bool isKill() const noexcept
732 {
733 return isKill_;
734 }
735
736 private:
737 Temp temp = Temp(0, s1);
738 PhysReg reg_;
739 union {
740 struct {
741 uint8_t isFixed_:1;
742 uint8_t hasHint_:1;
743 uint8_t isKill_:1;
744 };
745 /* can't initialize bit-fields in c++11, so work around using a union */
746 uint8_t control_ = 0;
747 };
748 };
749
class Block;

/* Base class of all instructions; the concrete *_instruction structs below
 * append their format-specific fields. operands/definitions point into
 * storage allocated alongside the instruction (aco::span). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch bits — NOTE(review): meaning appears to depend on the pass
    * using them; confirm against the individual passes */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* true for any vector-ALU encoding, including combined VOP3/DPP/SDWA
    * forms. VOP3A and VOP3B alias the same format bit, so the fifth test is
    * redundant but kept for symmetry. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory instructions (FLAT/GLOBAL/SCRATCH are separate) */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* only detects operands already fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16);
819
/* Scalar ALU with a 16-bit inline immediate (SIMM16). */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4);
825
/* Scalar program-control instruction with a 16-bit immediate. */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   /* block index for branching SOPPs — NOTE(review): confirm the sentinel
    * value used for non-branch instructions */
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8);
831
/* Scalar compare — no fields beyond the base Instruction. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0);

/* Scalar ALU, one source — no fields beyond the base Instruction. */
struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0);

/* Scalar ALU, two sources — no fields beyond the base Instruction. */
struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0);
843
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   /* memory the scheduler must not reorder this access across */
   barrier_interaction barrier;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1;
   bool disable_wqm : 1;
   uint32_t padding: 19;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4);
865
/* Vector ALU, one source — no fields beyond the base Instruction. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0);

/* Vector ALU, two sources — no fields beyond the base Instruction. */
struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0);

/* Vector compare — no fields beyond the base Instruction. */
struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0);
877
/* Vector ALU in the 64-bit VOP3A encoding: per-source input modifiers
 * plus output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];         /* per-source absolute value */
   bool neg[3];         /* per-source negation */
   uint8_t opsel : 4;   /* operand half selection */
   uint8_t omod : 2;    /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8);
887
/* Packed-math vector ALU (VOP3P): modifiers apply separately to the low
 * and high halves of each packed source. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8);
897
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;       /* lane-swizzle control */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8);
914
/* SDWA sub-dword selection: encodes which byte/word of a dword an SDWA
 * instruction reads or writes, plus sign-extension. Compose the specific
 * values; use the masks/flags to query them. */
enum sdwa_sel : uint8_t {
    /* masks */
    sdwa_wordnum = 0x1,
    sdwa_bytenum = 0x3,
    sdwa_asuint = 0x7 | 0x10,
    sdwa_rasize = 0x3,

    /* flags */
    sdwa_isword = 0x4,
    sdwa_sext = 0x8,
    sdwa_isra = 0x10,

    /* specific values */
    sdwa_ubyte0 = 0,
    sdwa_ubyte1 = 1,
    sdwa_ubyte2 = 2,
    sdwa_ubyte3 = 3,
    sdwa_uword0 = sdwa_isword | 0,
    sdwa_uword1 = sdwa_isword | 1,
    sdwa_udword = 6,

    sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
    sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
    sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
    sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
    sdwa_sword0 = sdwa_uword0 | sdwa_sext,
    sdwa_sword1 = sdwa_uword1 | sdwa_sext,
    sdwa_sdword = sdwa_udword | sdwa_sext,

    /* register-allocated */
    sdwa_ubyte = 1 | sdwa_isra,
    sdwa_uword = 2 | sdwa_isra,
    sdwa_sbyte = sdwa_ubyte | sdwa_sext,
    sdwa_sword = sdwa_uword | sdwa_sext,
};
950
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];          /* sdwa_sel for each source */
   uint8_t dst_sel;         /* sdwa_sel for the destination */
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8);
972
/* Vector parameter interpolation (VINTRP): selects the attribute and its
 * component to interpolate. */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4);
979
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0;
   int8_t offset1;
   bool gds;        /* use GDS instead of LDS */
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4);
995
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 2;
   barrier_interaction barrier;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4);
1020
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   barrier_interaction barrier;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint32_t padding : 25;
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8);
1045
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* Force data to be un-normalized */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4);
1074
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4);
1095
/* Export instruction: writes VGPR data to a render target, depth/stencil,
 * position or parameter export target. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* per-component write-enable mask (hw EN field) */
   uint8_t dest; /* export target (hw TGT field) */
   bool compressed : 1; /* operands hold packed 16-bit data (hw COMPR) */
   bool done : 1; /* last export of the program (hw DONE) */
   bool valid_mask : 1; /* hw VM bit */
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4);
1105
/* Generic pseudo instruction: IR-internal operation without a direct hardware
 * encoding; presumably lowered by lower_to_hw_instr(). */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* whether scc holds a value that the lowered sequence must preserve -- TODO confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4);
1112
/* Branch pseudo instruction: targets are block indices, presumably resolved
 * to real jumps late in compilation. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8);
1121
/* Barrier pseudo instruction: carries no extra fields; the opcode alone
 * identifies the barrier kind. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0);
1125
/* Operation performed by a subgroup reduction (see Pseudo_reduction_instruction);
 * every arithmetic op comes in a 32-bit and a 64-bit variant. */
enum ReduceOp : uint16_t {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction: helper for wave64 bpermute lowering on GFX10 -- TODO confirm */
};
1142
1143 /**
1144 * Subgroup Reduction Instructions, everything except for the data to be
1145 * reduced and the result as inserted by setup_reduce_temp().
1146 * Operand(0): data to be reduced
1147 * Operand(1): reduce temporary
1148 * Operand(2): vector temporary
1149 * Definition(0): result
1150 * Definition(1): scalar temporary
1151 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1152 * Definition(3): scc clobber
1153 * Definition(4): vcc clobber
1154 *
1155 */
1156 struct Pseudo_reduction_instruction : public Instruction {
1157 ReduceOp reduce_op;
1158 uint16_t cluster_size; // must be 0 for scans
1159 };
1160 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4);
1161
/* Deleter for instructions created by create_instruction(): they come from
 * calloc(), so they must be released with free(), not delete. Marked const
 * and noexcept since deallocation neither mutates the deleter nor throws;
 * free(NULL) is a no-op, matching std::default_delete semantics. */
struct instr_deleter_functor {
   void operator()(void* p) const noexcept {
      free(p);
   }
};
1167
/* Owning pointer for instructions allocated with create_instruction(); uses
 * instr_deleter_functor because the storage comes from calloc(), not new. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1170
/* Allocates and zero-initializes an instruction of derived type T together
 * with trailing storage for its operands and definitions in a single
 * allocation. The result must be freed with free() (see aco_ptr /
 * instr_deleter_functor). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* calloc so all format-specific fields and flags start out zeroed */
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span appears to store a 16-bit offset relative to the span member's
    * own address: point operands at the storage directly after the struct,
    * and definitions directly after the operands. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1188
1189 constexpr bool Instruction::usesModifiers() const noexcept
1190 {
1191 if (isDPP() || isSDWA())
1192 return true;
1193
1194 if (format == Format::VOP3P) {
1195 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1196 for (unsigned i = 0; i < operands.size(); i++) {
1197 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1198 return true;
1199 }
1200 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1201 } else if (isVOP3()) {
1202 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1203 for (unsigned i = 0; i < operands.size(); i++) {
1204 if (vop3->abs[i] || vop3->neg[i])
1205 return true;
1206 }
1207 return vop3->opsel || vop3->clamp || vop3->omod;
1208 }
1209 return false;
1210 }
1211
1212 constexpr bool is_phi(Instruction* instr)
1213 {
1214 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1215 }
1216
1217 static inline bool is_phi(aco_ptr<Instruction>& instr)
1218 {
1219 return is_phi(instr.get());
1220 }
1221
/* Returns the memory barrier classes this instruction interacts with. */
barrier_interaction get_barrier_interaction(const Instruction* instr);

/* True if the instruction's results are unused and it can be removed;
 * 'uses' is presumably indexed by temp id (see dead_code_analysis()). */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1225
/* Flags describing the control-flow role of a basic block;
 * stored as a bitmask in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1246
1247
1248 struct RegisterDemand {
1249 constexpr RegisterDemand() = default;
1250 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1251 : vgpr{v}, sgpr{s} {}
1252 int16_t vgpr = 0;
1253 int16_t sgpr = 0;
1254
1255 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1256 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1257 }
1258
1259 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1260 return vgpr > other.vgpr || sgpr > other.sgpr;
1261 }
1262
1263 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1264 if (t.type() == RegType::sgpr)
1265 return RegisterDemand( vgpr, sgpr + t.size() );
1266 else
1267 return RegisterDemand( vgpr + t.size(), sgpr );
1268 }
1269
1270 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1271 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1272 }
1273
1274 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1275 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1276 }
1277
1278 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1279 vgpr += other.vgpr;
1280 sgpr += other.sgpr;
1281 return *this;
1282 }
1283
1284 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1285 vgpr -= other.vgpr;
1286 sgpr -= other.sgpr;
1287 return *this;
1288 }
1289
1290 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1291 if (t.type() == RegType::sgpr)
1292 sgpr += t.size();
1293 else
1294 vgpr += t.size();
1295 return *this;
1296 }
1297
1298 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1299 if (t.type() == RegType::sgpr)
1300 sgpr -= t.size();
1301 else
1302 vgpr -= t.size();
1303 return *this;
1304 }
1305
1306 constexpr void update(const RegisterDemand other) noexcept {
1307 vgpr = std::max(vgpr, other.vgpr);
1308 sgpr = std::max(sgpr, other.sgpr);
1309 }
1310
1311 };
1312
/* CFG */
struct Block {
   float_mode fp_mode; /* floating-point mode for instructions in this block */
   unsigned index; /* position within Program::blocks */
   unsigned offset = 0; /* presumably set late (e.g. at assembly) -- TODO confirm unit */
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds; /* predecessor block indices in the logical CFG */
   std::vector<unsigned> linear_preds; /* predecessor block indices in the linear CFG */
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* peak demand in this block -- TODO confirm */
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   int logical_idom = -1; /* immediate dominator in the logical CFG, -1 if unset */
   int linear_idom = -1; /* immediate dominator in the linear CFG, -1 if unset */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1338
/* A Stage combines the software (API) stage bit(s) with the hardware stage
 * the code actually runs on; see the named combinations below. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1383
/* Indices into Program::statistics; values are gathered only when
 * Program::collect_statistics is set. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1398
1399 class Program final {
1400 public:
1401 float_mode next_fp_mode;
1402 std::vector<Block> blocks;
1403 RegisterDemand max_reg_demand = RegisterDemand();
1404 uint16_t num_waves = 0;
1405 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1406 ac_shader_config* config;
1407 struct radv_shader_info *info;
1408 enum chip_class chip_class;
1409 enum radeon_family family;
1410 unsigned wave_size;
1411 RegClass lane_mask;
1412 Stage stage; /* Stage */
1413 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1414 bool needs_wqm = false; /* there exists a p_wqm instruction */
1415 bool wb_smem_l1_on_end = false;
1416
1417 std::vector<uint8_t> constant_data;
1418 Temp private_segment_buffer;
1419 Temp scratch_offset;
1420
1421 uint16_t min_waves = 0;
1422 uint16_t lds_alloc_granule;
1423 uint32_t lds_limit; /* in bytes */
1424 bool has_16bank_lds;
1425 uint16_t vgpr_limit;
1426 uint16_t sgpr_limit;
1427 uint16_t physical_sgprs;
1428 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1429 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1430 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1431
1432 bool xnack_enabled = false;
1433
1434 bool needs_vcc = false;
1435 bool needs_flat_scr = false;
1436
1437 bool collect_statistics = false;
1438 uint32_t statistics[num_statistics];
1439
1440 uint32_t allocateId()
1441 {
1442 assert(allocationID <= 16777215);
1443 return allocationID++;
1444 }
1445
1446 uint32_t peekAllocationId()
1447 {
1448 return allocationID;
1449 }
1450
1451 void setAllocationId(uint32_t id)
1452 {
1453 allocationID = id;
1454 }
1455
1456 Block* create_and_insert_block() {
1457 blocks.emplace_back(blocks.size());
1458 blocks.back().fp_mode = next_fp_mode;
1459 return &blocks.back();
1460 }
1461
1462 Block* insert_block(Block&& block) {
1463 block.index = blocks.size();
1464 block.fp_mode = next_fp_mode;
1465 blocks.emplace_back(std::move(block));
1466 return &blocks.back();
1467 }
1468
1469 private:
1470 uint32_t allocationID = 1;
1471 };
1472
/* Temp ids are unique (see Program::allocateId()), so the id itself serves
 * as the hash. */
struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
using TempSet = std::unordered_set<Temp, TempHash>;
1479
/* Result of live_var_analysis(), both vectors indexed by block. */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1486
/* instruction selection: builds the IR from NIR */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* compilation passes, roughly in pipeline order */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
/* IR validation (see the DEBUG_VALIDATE* flags) */
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1521 #ifndef NDEBUG
1522 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1523 #else
1524 #define perfwarn(program, cond, msg, ...) do {} while(0)
1525 #endif
1526
/* shader statistics collection (fills Program::statistics) */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

/* IR pretty-printers */
void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1538
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1549
/* Static per-opcode information tables, indexed by aco_opcode. */
typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)]; /* hw encoding per generation; negative presumably means unsupported */
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)]; /* human-readable opcode name */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; /* encoding format of each opcode */
} Info;

extern const Info instr_info;
1562
1563 }
1564
1565 #endif /* ACO_IR_H */
1566