aco: refactor try_remove_trivial_phi() in RA
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
#include <bitset>
#include <cassert>
#include <cstdint>
#include <memory>
#include <set>
#include <vector>

#include "nir.h"
#include "ac_binary.h"
#include "amd_family.h"
#include "aco_opcodes.h"
#include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
/* Bit flags tested against the global debug_flags bitmask above. */
enum {
   DEBUG_VALIDATE = 0x1,    /* validate the IR */
   DEBUG_VALIDATE_RA = 0x2, /* validate the register-allocation result */
   DEBUG_PERFWARN = 0x4,    /* emit performance warnings */
};
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats: each is a single bit so it can be OR'd onto a
    * VOP1/VOP2/VOPC base format (see the header comment above). Note that
    * VOP3, VOP3A and VOP3B deliberately share the same bit. */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
104
/* Bitmask describing which kinds of memory operations an instruction
 * interacts with; used to constrain reordering across it. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* NOTE(review): there are seven flag bits above (0x1..0x40) but
    * barrier_count is 6 — looks stale since barrier_barrier was added;
    * confirm against the users of barrier_count. */
   barrier_count = 6,
};
120
/* Float rounding modes; values match the MODE register encoding
 * (ne = nearest-even, pi = +inf, ni = -inf, tz = toward zero). */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
127
/* Denormal handling; values match the 2-bit MODE register fields. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
134
/* Floating-point environment required by a block of instructions: the
 * hardware MODE register bits plus software-only optimization constraints. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0; /* all MODE bits at once, for cheap comparison */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly and this mode
    * must be at least as strict in every software constraint flag. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
171
/* Combines the given base format with the SDWA encoding bit. Only
 * VOP1/VOP2/VOPC support sub-dword addressing, hence the assert. */
constexpr Format asSDWA(Format format) {
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
}
176
/* Kind of register file a register class lives in. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
183
/* Register class: type and size packed into one byte.
 * bits 0-4: size (dwords normally, bytes when the sub-dword bit is set)
 * bit 5: vgpr   bit 6: linear   bit 7: sub-dword */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   /* NOTE(review): this ctor only distinguishes sgpr vs vgpr; it cannot
    * produce linear or sub-dword classes — use as_linear()/as_subdword(). */
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* sgpr classes are all <= s16; everything above has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up for sub-dword classes */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* all sgpr classes are linear; vgpr classes need the linear bit */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

private:
   RC rc;
};
235
236 /* transitional helper expressions */
237 static constexpr RegClass s1{RegClass::s1};
238 static constexpr RegClass s2{RegClass::s2};
239 static constexpr RegClass s3{RegClass::s3};
240 static constexpr RegClass s4{RegClass::s4};
241 static constexpr RegClass s8{RegClass::s8};
242 static constexpr RegClass s16{RegClass::s16};
243 static constexpr RegClass v1{RegClass::v1};
244 static constexpr RegClass v2{RegClass::v2};
245 static constexpr RegClass v3{RegClass::v3};
246 static constexpr RegClass v4{RegClass::v4};
247 static constexpr RegClass v5{RegClass::v5};
248 static constexpr RegClass v6{RegClass::v6};
249 static constexpr RegClass v7{RegClass::v7};
250 static constexpr RegClass v8{RegClass::v8};
251 static constexpr RegClass v1b{RegClass::v1b};
252 static constexpr RegClass v2b{RegClass::v2b};
253 static constexpr RegClass v3b{RegClass::v3b};
254 static constexpr RegClass v4b{RegClass::v4b};
255 static constexpr RegClass v6b{RegClass::v6b};
256 static constexpr RegClass v8b{RegClass::v8b};
257
258 /**
259 * Temp Class
260 * Each temporary virtual register has a
261 * register class (i.e. size and type)
262 * and SSA id.
263 */
264 struct Temp {
265 Temp() noexcept : id_(0), reg_class(0) {}
266 constexpr Temp(uint32_t id, RegClass cls) noexcept
267 : id_(id), reg_class(uint8_t(cls)) {}
268
269 constexpr uint32_t id() const noexcept { return id_; }
270 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
271
272 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
273 constexpr unsigned size() const noexcept { return regClass().size(); }
274 constexpr RegType type() const noexcept { return regClass().type(); }
275 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
276
277 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
278 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
279 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
280
281 private:
282 uint32_t id_: 24;
283 uint32_t reg_class : 8;
284 };
285
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * Stored byte-granular: reg_b = register index * 4 + byte offset.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r * 4) {}
   /* dword-granular register index */
   constexpr unsigned reg() const { return reg_b / 4; }
   /* byte offset within the register, 0..3 */
   constexpr unsigned byte() const { return reg_b % 4; }
   constexpr operator unsigned() const { return reg(); }
   /* compare byte-exact, not just by register index */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }

   uint16_t reg_b = 0;
};
303
/* helper expressions for special registers (SGPR encoding numbers) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};   /* alias of exec's low half */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
315
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final
{
public:
   /* Default: an undefined operand, fixed to inline-constant slot 128 (zero). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   /* Operand referring to the temporary r; an SSA id of 0 produces an
    * undefined operand instead. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant operand. Values expressible as hardware inline
    * constants get fixed to the matching scalar-source encoding
    * (128..192 for 0..64, 193..208 for -16..-1, 240..247 for a few
    * floats); everything else becomes a literal (255). */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v}); /* unsigned wrap-around: 192 - 0xFFFFFFFF == 193 */
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant operand. Only values representable as inline
    * constants are supported; 64-bit literals do not exist, so anything
    * else asserts. The stored 32-bit payload is the equivalent 32-bit
    * inline-constant pattern. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand fixed to a physical register (no temporary attached). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* true for temporaries and undefined operands (both carry a RegClass) */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
      else
         return data_.temp.bytes();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* NOTE(review): reg converts via operator unsigned() (reg_b >> 2,
    * max 0x3FFF), so it can never equal unsigned(-1) and isFixed_ is
    * always set true here — the sentinel check looks dead; confirm. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a constant too large for an inline encoding (register slot 255) */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Returns the constant widened to 64 bits. For 64-bit inline constants
    * the value is decoded back from the fixed register slot; otherwise the
    * 32-bit payload is zero- or sign-extended as requested. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128;                         /* inline 0..64 */
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);  /* inline -1..-16 */

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* setting first-kill implies kill */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   /* Structural equality: same size/fixedness/kill behavior, then
    * compared by literal value, constant slot, reg class or temporary. */
   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;                /* constant payload */
      float f;
      Temp temp = Temp(0, s1);   /* temporary / reg-class carrier */
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
629
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   /* definition of a fresh temporary with the given SSA index */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition fixed to a physical register, without a temporary (id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   /* definition of an existing temporary already fixed to a register */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 is reserved, so a non-zero id means a real temporary */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* records a preferred register without fixing it; note that the hint
    * shares reg_ with the fixed assignment */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
739
740 class Block;
741
/* Base class of all IR instructions; concrete formats derive from it and
 * append format-specific fields. Operand/definition storage lives outside
 * the object (spans), keeping the base at a fixed 16 bytes. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch field — meaning depends on the pass currently running
    * (presumably; not determinable from this header alone) */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* true for any vector-ALU encoding, including VOP3/VOP3P promotions */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory formats only; FLAT/GLOBAL are separate (below) */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16);
810
/* SOPK: scalar op with a 16-bit immediate constant. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4);
816
/* SOPP: scalar program control; block is a branch-target block index
 * (presumably -1 when unused — confirm against emitters). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8);
822
/* SOPC: scalar compare — no fields beyond the base Instruction. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0);
826
/* SOP1: scalar one-operand op — no fields beyond the base Instruction. */
struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0);
830
/* SOP2: scalar two-operand op — no fields beyond the base Instruction. */
struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0);
834
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   barrier_interaction barrier; /* memory classes this access interacts with */
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1;
   bool disable_wqm : 1;
   uint32_t padding: 19;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4);
856
/* VOP1: vector one-operand op — no fields beyond the base Instruction. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0);
860
/* VOP2: vector two-operand op — no fields beyond the base Instruction. */
struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0);
864
/* VOPC: vector compare — no fields beyond the base Instruction. */
struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0);
868
/* VOP3A: three-source VALU encoding with per-source input modifiers and
 * destination output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];        /* per-source absolute-value modifier */
   bool neg[3];        /* per-source negate modifier */
   uint8_t opsel : 4;  /* half-selection bits for 16-bit sources/dest */
   uint8_t omod : 2;   /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8);
878
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];            /* per-source absolute-value modifier */
   bool neg[2];            /* per-source negate modifier */
   uint16_t dpp_ctrl;      /* hardware DPP control word (swizzle pattern) */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8);
895
/* Encoding of SDWA operand selection: the low bits index the byte/word
 * within the dword, with sdwa_isword/sdwa_sext/sdwa_isra OR'd as flags. */
enum sdwa_sel : uint8_t {
    /* masks */
    sdwa_wordnum = 0x1,
    sdwa_bytenum = 0x3,
    sdwa_asuint = 0x7 | 0x10,
    sdwa_rasize = 0x3,

    /* flags */
    sdwa_isword = 0x4,
    sdwa_sext = 0x8,
    sdwa_isra = 0x10,

    /* specific values */
    sdwa_ubyte0 = 0,
    sdwa_ubyte1 = 1,
    sdwa_ubyte2 = 2,
    sdwa_ubyte3 = 3,
    sdwa_uword0 = sdwa_isword | 0,
    sdwa_uword1 = sdwa_isword | 1,
    sdwa_udword = 6,

    sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
    sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
    sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
    sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
    sdwa_sword0 = sdwa_uword0 | sdwa_sext,
    sdwa_sword1 = sdwa_uword1 | sdwa_sext,
    sdwa_sdword = sdwa_udword | sdwa_sext,

    /* register-allocated */
    sdwa_ubyte = 1 | sdwa_isra,
    sdwa_uword = 2 | sdwa_isra,
    sdwa_sbyte = sdwa_ubyte | sdwa_sext,
    sdwa_sword = sdwa_uword | sdwa_sext,
};
931
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];      /* sdwa_sel for each source */
   uint8_t dst_sel;     /* sdwa_sel for the destination */
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8);
953
/* VINTRP: vector parameter interpolation of one attribute component. */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4);
960
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0; /* byte offset (first of the instruction's two) */
   int8_t offset1;
   bool gds;        /* global data share instead of local */
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4);
976
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 2;
   barrier_interaction barrier; /* memory classes this access interacts with */
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4);
1001
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   barrier_interaction barrier; /* memory classes this access interacts with */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint32_t padding : 25;
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8);
1026
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warn enable (presumably; the previous comment was a
                  * copy-paste of unrm's — confirm against the ISA docs) */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier; /* memory classes this access interacts with */
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4);
1055
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier; /* memory classes this access interacts with */
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4);
1076
/**
 * Export instruction (EXP).
 * Writes up to four VGPR operands to an export target (MRT, position, parameter, ...).
 */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which of the (up to four) data channels are valid */
   uint8_t dest; /* hardware export target */
   bool compressed : 1; /* data is 16-bit compressed (two channels per VGPR) */
   bool done : 1; /* this is the final export of the shader */
   bool valid_mask : 1;
   uint32_t padding : 13; /* pads the bit-field block; layout is pinned by the static_assert below */
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4);
1086
/* Generic pseudo-instruction (parallelcopies, p_create_vector, etc.),
 * lowered to real hardware instructions by lower_to_hw_instr(). */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* presumably: lowering needs to preserve scc around this instruction -- TODO confirm in lower_to_hw_instr */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4);
1093
/* Branch pseudo-instruction, lowered to real branches after SSA elimination. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8);
1102
/* Barrier pseudo-instruction; carries no extra fields, the opcode alone
 * identifies the barrier kind. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0);
1106
/* Operations implemented by the subgroup-reduction pseudo-instructions.
 * Enumerators come in (32-bit, 64-bit) pairs; their order/values are
 * relied upon elsewhere, so do not reorder. */
enum ReduceOp : uint16_t {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction; presumably reuses this machinery for GFX10 wave64 bpermute -- TODO confirm */
};
1123
1124 /**
1125 * Subgroup Reduction Instructions, everything except for the data to be
1126 * reduced and the result as inserted by setup_reduce_temp().
1127 * Operand(0): data to be reduced
1128 * Operand(1): reduce temporary
1129 * Operand(2): vector temporary
1130 * Definition(0): result
1131 * Definition(1): scalar temporary
1132 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1133 * Definition(3): scc clobber
1134 * Definition(4): vcc clobber
1135 *
1136 */
1137 struct Pseudo_reduction_instruction : public Instruction {
1138 ReduceOp reduce_op;
1139 uint16_t cluster_size; // must be 0 for scans
1140 };
1141 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4);
1142
/* Deleter for instructions: they are allocated with calloc() (see
 * create_instruction()), so they must be released with free() rather
 * than operator delete. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};

/* Owning instruction pointer; pairs unique_ptr with the free()-based deleter. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1151
/**
 * Allocates a zero-initialized instruction of derived type T together with
 * trailing storage for its operands and definitions in a single calloc()'d
 * chunk: [T][Operand * num_operands][Definition * num_definitions].
 * The result must be freed with free() (see instr_deleter_functor / aco_ptr).
 */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span appears to store a 16-bit offset relative to the span object
    * itself, so the pointer differences below must fit in uint16_t --
    * TODO confirm against aco::span in aco_util.h */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   /* definitions follow immediately after the operand array */
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1169
1170 constexpr bool Instruction::usesModifiers() const noexcept
1171 {
1172 if (isDPP() || isSDWA())
1173 return true;
1174 if (!isVOP3())
1175 return false;
1176 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1177 for (unsigned i = 0; i < operands.size(); i++) {
1178 if (vop3->abs[i] || vop3->neg[i])
1179 return true;
1180 }
1181 return vop3->opsel || vop3->clamp || vop3->omod;
1182 }
1183
1184 constexpr bool is_phi(Instruction* instr)
1185 {
1186 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1187 }
1188
1189 static inline bool is_phi(aco_ptr<Instruction>& instr)
1190 {
1191 return is_phi(instr.get());
1192 }
1193
/* Returns the barrier interaction of the given instruction. */
barrier_interaction get_barrier_interaction(Instruction* instr);

/* Returns whether the instruction can be removed given per-temp use counts. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1197
/* Bit-flags classifying CFG blocks; a block may carry several kinds
 * (stored OR'd together in Block::kind). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1218
1219
1220 struct RegisterDemand {
1221 constexpr RegisterDemand() = default;
1222 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1223 : vgpr{v}, sgpr{s} {}
1224 int16_t vgpr = 0;
1225 int16_t sgpr = 0;
1226
1227 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1228 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1229 }
1230
1231 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1232 return vgpr > other.vgpr || sgpr > other.sgpr;
1233 }
1234
1235 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1236 if (t.type() == RegType::sgpr)
1237 return RegisterDemand( vgpr, sgpr + t.size() );
1238 else
1239 return RegisterDemand( vgpr + t.size(), sgpr );
1240 }
1241
1242 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1243 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1244 }
1245
1246 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1247 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1248 }
1249
1250 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1251 vgpr += other.vgpr;
1252 sgpr += other.sgpr;
1253 return *this;
1254 }
1255
1256 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1257 vgpr -= other.vgpr;
1258 sgpr -= other.sgpr;
1259 return *this;
1260 }
1261
1262 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1263 if (t.type() == RegType::sgpr)
1264 sgpr += t.size();
1265 else
1266 vgpr += t.size();
1267 return *this;
1268 }
1269
1270 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1271 if (t.type() == RegType::sgpr)
1272 sgpr -= t.size();
1273 else
1274 vgpr -= t.size();
1275 return *this;
1276 }
1277
1278 constexpr void update(const RegisterDemand other) noexcept {
1279 vgpr = std::max(vgpr, other.vgpr);
1280 sgpr = std::max(sgpr, other.sgpr);
1281 }
1282
1283 };
1284
/* CFG */
struct Block {
   float_mode fp_mode; /* floating-point mode active in this block */
   unsigned index; /* position of this block within Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessors/successors in the logical (divergence-aware) and linear CFG */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* OR'd block_kind flags */
   /* immediate dominators; -1 while not yet computed (see dominator_tree()) */
   int logical_idom = -1;
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1310
/* Bitmask combining the API-level (software) shader stages compiled into
 * this program with the hardware stage it runs on. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1355
/* Indices into Program::statistics (filled when collect_statistics is set). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics /* must stay last: used as the array size */
};
1370
/* Top-level container for a compiled shader: the CFG, target information
 * and per-compilation state shared by all passes. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to newly created blocks */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* register class of exec-mask-sized values (wave32 vs wave64) */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   /* Returns a fresh Temp id. Ids are limited to 24 bits (<= 16777215). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would hand out, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Resets the id counter, e.g. after renumbering temporaries. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new block (inheriting next_fp_mode) and returns it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Moves an existing block to the end of the CFG, re-indexing it. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* id 0 is never handed out */
};
1444
/* Result of liveness analysis (see live_var_analysis()). */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1451
/* instruction selection (NIR -> ACO IR) */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* compilation passes, roughly in pipeline order */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
/* NOTE(review): the release-build macro below takes a leading 'program'
 * argument that this debug declaration does not -- a call site can only
 * match one of the two; verify against perfwarn()'s definition and users. */
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

/* shader statistics (see enum statistic) */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

/* IR printing for debugging */
void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1514
/* Per-opcode static information tables, indexed by aco_opcode. */
typedef struct {
   /* hardware opcode per generation; -1 presumably marks "not available" -- TODO confirm in aco_opcodes.py */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* The single generated instance of the tables above. */
extern const Info instr_info;
1527
1528 }
1529
1530 #endif /* ACO_IR_H */
1531