4e8aa372dff64e6ac6ad1db20b9e3255650c0e1a
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
46 extern uint64_t debug_flags;
47
48 enum {
49 DEBUG_VALIDATE = 0x1,
50 DEBUG_VALIDATE_RA = 0x2,
51 DEBUG_PERFWARN = 0x4,
52 };
53
54 /**
55 * Representation of the instruction's microcode encoding format
56 * Note: Some Vector ALU Formats can be combined, such that:
57 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
58 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
59 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
60 *
61 * (*) The same is applicable for VOP1 and VOPC instructions.
62 */
63 enum class Format : std::uint16_t {
64 /* Pseudo Instruction Format */
65 PSEUDO = 0,
66 /* Scalar ALU & Control Formats */
67 SOP1 = 1,
68 SOP2 = 2,
69 SOPK = 3,
70 SOPP = 4,
71 SOPC = 5,
72 /* Scalar Memory Format */
73 SMEM = 6,
74 /* LDS/GDS Format */
75 DS = 8,
76 /* Vector Memory Buffer Formats */
77 MTBUF = 9,
78 MUBUF = 10,
79 /* Vector Memory Image Format */
80 MIMG = 11,
81 /* Export Format */
82 EXP = 12,
83 /* Flat Formats */
84 FLAT = 13,
85 GLOBAL = 14,
86 SCRATCH = 15,
87
88 PSEUDO_BRANCH = 16,
89 PSEUDO_BARRIER = 17,
90 PSEUDO_REDUCTION = 18,
91
92 /* Vector ALU Formats */
93 VOP3P = 19,
94 VOP1 = 1 << 8,
95 VOP2 = 1 << 9,
96 VOPC = 1 << 10,
97 VOP3 = 1 << 11,
98 VOP3A = 1 << 11,
99 VOP3B = 1 << 11,
100 /* Vector Parameter Interpolation Format */
101 VINTRP = 1 << 12,
102 DPP = 1 << 13,
103 SDWA = 1 << 14,
104 };
105
106 enum barrier_interaction : uint8_t {
107 barrier_none = 0,
108 barrier_buffer = 0x1,
109 barrier_image = 0x2,
110 barrier_atomic = 0x4,
111 barrier_shared = 0x8,
112 /* used for geometry shaders to ensure vertex data writes are before the
113 * GS_DONE s_sendmsg. */
114 barrier_gs_data = 0x10,
115 /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
116 barrier_gs_sendmsg = 0x20,
117 /* used by barriers. created by s_barrier */
118 barrier_barrier = 0x40,
119 barrier_count = 7,
120 };
121
122 enum fp_round {
123 fp_round_ne = 0,
124 fp_round_pi = 1,
125 fp_round_ni = 2,
126 fp_round_tz = 3,
127 };
128
129 enum fp_denorm {
130 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
131 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
132 fp_denorm_flush = 0x0,
133 fp_denorm_keep = 0x3,
134 };
135
136 struct float_mode {
137 /* matches encoding of the MODE register */
138 union {
139 struct {
140 fp_round round32:2;
141 fp_round round16_64:2;
142 unsigned denorm32:2;
143 unsigned denorm16_64:2;
144 };
145 uint8_t val = 0;
146 };
147 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
148 bool preserve_signed_zero_inf_nan32:1;
149 bool preserve_signed_zero_inf_nan16_64:1;
150 /* if false, optimizations which may remove denormal flushing can be done */
151 bool must_flush_denorms32:1;
152 bool must_flush_denorms16_64:1;
153 bool care_about_round32:1;
154 bool care_about_round16_64:1;
155
156 /* Returns true if instructions using the mode "other" can safely use the
157 * current one instead. */
158 bool canReplace(float_mode other) const noexcept {
159 return val == other.val &&
160 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
161 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
162 (must_flush_denorms32 || !other.must_flush_denorms32) &&
163 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
164 (care_about_round32 || !other.care_about_round32) &&
165 (care_about_round16_64 || !other.care_about_round16_64);
166 }
167 };
168
169 constexpr Format asVOP3(Format format) {
170 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
171 };
172
173 constexpr Format asSDWA(Format format) {
174 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
175 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
176 }
177
178 enum class RegType {
179 none = 0,
180 sgpr,
181 vgpr,
182 linear_vgpr,
183 };
184
185 struct RegClass {
186
187 enum RC : uint8_t {
188 s1 = 1,
189 s2 = 2,
190 s3 = 3,
191 s4 = 4,
192 s6 = 6,
193 s8 = 8,
194 s16 = 16,
195 v1 = s1 | (1 << 5),
196 v2 = s2 | (1 << 5),
197 v3 = s3 | (1 << 5),
198 v4 = s4 | (1 << 5),
199 v5 = 5 | (1 << 5),
200 v6 = 6 | (1 << 5),
201 v7 = 7 | (1 << 5),
202 v8 = 8 | (1 << 5),
203 /* byte-sized register class */
204 v1b = v1 | (1 << 7),
205 v2b = v2 | (1 << 7),
206 v3b = v3 | (1 << 7),
207 v4b = v4 | (1 << 7),
208 v6b = v6 | (1 << 7),
209 v8b = v8 | (1 << 7),
210 /* these are used for WWM and spills to vgpr */
211 v1_linear = v1 | (1 << 6),
212 v2_linear = v2 | (1 << 6),
213 };
214
215 RegClass() = default;
216 constexpr RegClass(RC rc)
217 : rc(rc) {}
218 constexpr RegClass(RegType type, unsigned size)
219 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
220
221 constexpr operator RC() const { return rc; }
222 explicit operator bool() = delete;
223
224 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
225 constexpr bool is_subdword() const { return rc & (1 << 7); }
226 constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
227 //TODO: use size() less in favor of bytes()
228 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
229 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
230 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
231 constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
232
233 static constexpr RegClass get(RegType type, unsigned bytes) {
234 if (type == RegType::sgpr) {
235 return RegClass(type, DIV_ROUND_UP(bytes, 4u));
236 } else {
237 return bytes % 4u ? RegClass(type, bytes).as_subdword() :
238 RegClass(type, bytes / 4u);
239 }
240 }
241
242 private:
243 RC rc;
244 };
245
246 /* transitional helper expressions */
247 static constexpr RegClass s1{RegClass::s1};
248 static constexpr RegClass s2{RegClass::s2};
249 static constexpr RegClass s3{RegClass::s3};
250 static constexpr RegClass s4{RegClass::s4};
251 static constexpr RegClass s8{RegClass::s8};
252 static constexpr RegClass s16{RegClass::s16};
253 static constexpr RegClass v1{RegClass::v1};
254 static constexpr RegClass v2{RegClass::v2};
255 static constexpr RegClass v3{RegClass::v3};
256 static constexpr RegClass v4{RegClass::v4};
257 static constexpr RegClass v5{RegClass::v5};
258 static constexpr RegClass v6{RegClass::v6};
259 static constexpr RegClass v7{RegClass::v7};
260 static constexpr RegClass v8{RegClass::v8};
261 static constexpr RegClass v1b{RegClass::v1b};
262 static constexpr RegClass v2b{RegClass::v2b};
263 static constexpr RegClass v3b{RegClass::v3b};
264 static constexpr RegClass v4b{RegClass::v4b};
265 static constexpr RegClass v6b{RegClass::v6b};
266 static constexpr RegClass v8b{RegClass::v8b};
267
268 /**
269 * Temp Class
270 * Each temporary virtual register has a
271 * register class (i.e. size and type)
272 * and SSA id.
273 */
274 struct Temp {
275 Temp() noexcept : id_(0), reg_class(0) {}
276 constexpr Temp(uint32_t id, RegClass cls) noexcept
277 : id_(id), reg_class(uint8_t(cls)) {}
278
279 constexpr uint32_t id() const noexcept { return id_; }
280 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
281
282 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
283 constexpr unsigned size() const noexcept { return regClass().size(); }
284 constexpr RegType type() const noexcept { return regClass().type(); }
285 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
286
287 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
288 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
289 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
290
291 private:
292 uint32_t id_: 24;
293 uint32_t reg_class : 8;
294 };
295
296 /**
297 * PhysReg
298 * Represents the physical register for each
299 * Operand and Definition.
300 */
301 struct PhysReg {
302 constexpr PhysReg() = default;
303 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
304 constexpr unsigned reg() const { return reg_b >> 2; }
305 constexpr unsigned byte() const { return reg_b & 0x3; }
306 constexpr operator unsigned() const { return reg(); }
307 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
308 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
309 constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
310 constexpr PhysReg advance(unsigned bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }
311
312 uint16_t reg_b = 0;
313 };
314
315 /* helper expressions for special registers */
316 static constexpr PhysReg m0{124};
317 static constexpr PhysReg vcc{106};
318 static constexpr PhysReg vcc_hi{107};
319 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
320 static constexpr PhysReg exec{126};
321 static constexpr PhysReg exec_lo{126};
322 static constexpr PhysReg exec_hi{127};
323 static constexpr PhysReg vccz{251};
324 static constexpr PhysReg execz{252};
325 static constexpr PhysReg scc{253};
326
327 /**
328 * Operand Class
329 * Initially, each Operand refers to either
330 * a temporary virtual register
331 * or to a constant value
332 * Temporary registers get mapped to physical register during RA
333 * Constant values are inlined into the instruction sequence.
334 */
335 class Operand final
336 {
337 public:
338 constexpr Operand()
339 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
340 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
341 isLateKill_(false) {}
342
343 explicit Operand(Temp r) noexcept
344 {
345 data_.temp = r;
346 if (r.id()) {
347 isTemp_ = true;
348 } else {
349 isUndef_ = true;
350 setFixed(PhysReg{128});
351 }
352 };
353 explicit Operand(uint32_t v, bool is64bit = false) noexcept
354 {
355 data_.i = v;
356 isConstant_ = true;
357 is64BitConst_ = is64bit;
358 if (v <= 64)
359 setFixed(PhysReg{128 + v});
360 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
361 setFixed(PhysReg{192 - v});
362 else if (v == 0x3f000000) /* 0.5 */
363 setFixed(PhysReg{240});
364 else if (v == 0xbf000000) /* -0.5 */
365 setFixed(PhysReg{241});
366 else if (v == 0x3f800000) /* 1.0 */
367 setFixed(PhysReg{242});
368 else if (v == 0xbf800000) /* -1.0 */
369 setFixed(PhysReg{243});
370 else if (v == 0x40000000) /* 2.0 */
371 setFixed(PhysReg{244});
372 else if (v == 0xc0000000) /* -2.0 */
373 setFixed(PhysReg{245});
374 else if (v == 0x40800000) /* 4.0 */
375 setFixed(PhysReg{246});
376 else if (v == 0xc0800000) /* -4.0 */
377 setFixed(PhysReg{247});
378 else { /* Literal Constant */
379 assert(!is64bit && "attempt to create a 64-bit literal constant");
380 setFixed(PhysReg{255});
381 }
382 };
383 explicit Operand(uint64_t v) noexcept
384 {
385 isConstant_ = true;
386 is64BitConst_ = true;
387 if (v <= 64) {
388 data_.i = (uint32_t) v;
389 setFixed(PhysReg{128 + (uint32_t) v});
390 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
391 data_.i = (uint32_t) v;
392 setFixed(PhysReg{192 - (uint32_t) v});
393 } else if (v == 0x3FE0000000000000) { /* 0.5 */
394 data_.i = 0x3f000000;
395 setFixed(PhysReg{240});
396 } else if (v == 0xBFE0000000000000) { /* -0.5 */
397 data_.i = 0xbf000000;
398 setFixed(PhysReg{241});
399 } else if (v == 0x3FF0000000000000) { /* 1.0 */
400 data_.i = 0x3f800000;
401 setFixed(PhysReg{242});
402 } else if (v == 0xBFF0000000000000) { /* -1.0 */
403 data_.i = 0xbf800000;
404 setFixed(PhysReg{243});
405 } else if (v == 0x4000000000000000) { /* 2.0 */
406 data_.i = 0x40000000;
407 setFixed(PhysReg{244});
408 } else if (v == 0xC000000000000000) { /* -2.0 */
409 data_.i = 0xc0000000;
410 setFixed(PhysReg{245});
411 } else if (v == 0x4010000000000000) { /* 4.0 */
412 data_.i = 0x40800000;
413 setFixed(PhysReg{246});
414 } else if (v == 0xC010000000000000) { /* -4.0 */
415 data_.i = 0xc0800000;
416 setFixed(PhysReg{247});
417 } else { /* Literal Constant: we don't know if it is a long or double.*/
418 isConstant_ = 0;
419 assert(false && "attempt to create a 64-bit literal constant");
420 }
421 };
422 explicit Operand(RegClass type) noexcept
423 {
424 isUndef_ = true;
425 data_.temp = Temp(0, type);
426 setFixed(PhysReg{128});
427 };
428 explicit Operand(PhysReg reg, RegClass type) noexcept
429 {
430 data_.temp = Temp(0, type);
431 setFixed(reg);
432 }
433
434 constexpr bool isTemp() const noexcept
435 {
436 return isTemp_;
437 }
438
439 constexpr void setTemp(Temp t) noexcept {
440 assert(!isConstant_);
441 isTemp_ = true;
442 data_.temp = t;
443 }
444
445 constexpr Temp getTemp() const noexcept
446 {
447 return data_.temp;
448 }
449
450 constexpr uint32_t tempId() const noexcept
451 {
452 return data_.temp.id();
453 }
454
455 constexpr bool hasRegClass() const noexcept
456 {
457 return isTemp() || isUndefined();
458 }
459
460 constexpr RegClass regClass() const noexcept
461 {
462 return data_.temp.regClass();
463 }
464
465 constexpr unsigned bytes() const noexcept
466 {
467 if (isConstant())
468 return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
469 else
470 return data_.temp.bytes();
471 }
472
473 constexpr unsigned size() const noexcept
474 {
475 if (isConstant())
476 return is64BitConst_ ? 2 : 1;
477 else
478 return data_.temp.size();
479 }
480
481 constexpr bool isFixed() const noexcept
482 {
483 return isFixed_;
484 }
485
486 constexpr PhysReg physReg() const noexcept
487 {
488 return reg_;
489 }
490
491 constexpr void setFixed(PhysReg reg) noexcept
492 {
493 isFixed_ = reg != unsigned(-1);
494 reg_ = reg;
495 }
496
497 constexpr bool isConstant() const noexcept
498 {
499 return isConstant_;
500 }
501
502 constexpr bool isLiteral() const noexcept
503 {
504 return isConstant() && reg_ == 255;
505 }
506
507 constexpr bool isUndefined() const noexcept
508 {
509 return isUndef_;
510 }
511
512 constexpr uint32_t constantValue() const noexcept
513 {
514 return data_.i;
515 }
516
517 constexpr bool constantEquals(uint32_t cmp) const noexcept
518 {
519 return isConstant() && constantValue() == cmp;
520 }
521
522 constexpr uint64_t constantValue64(bool signext=false) const noexcept
523 {
524 if (is64BitConst_) {
525 if (reg_ <= 192)
526 return reg_ - 128;
527 else if (reg_ <= 208)
528 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
529
530 switch (reg_) {
531 case 240:
532 return 0x3FE0000000000000;
533 case 241:
534 return 0xBFE0000000000000;
535 case 242:
536 return 0x3FF0000000000000;
537 case 243:
538 return 0xBFF0000000000000;
539 case 244:
540 return 0x4000000000000000;
541 case 245:
542 return 0xC000000000000000;
543 case 246:
544 return 0x4010000000000000;
545 case 247:
546 return 0xC010000000000000;
547 }
548 }
549 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
550 }
551
552 /* Indicates that the killed operand's live range intersects with the
553 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
554 * not set by liveness analysis. */
555 constexpr void setLateKill(bool flag) noexcept
556 {
557 isLateKill_ = flag;
558 }
559
560 constexpr bool isLateKill() const noexcept
561 {
562 return isLateKill_;
563 }
564
565 constexpr void setKill(bool flag) noexcept
566 {
567 isKill_ = flag;
568 if (!flag)
569 setFirstKill(false);
570 }
571
572 constexpr bool isKill() const noexcept
573 {
574 return isKill_ || isFirstKill();
575 }
576
577 constexpr void setFirstKill(bool flag) noexcept
578 {
579 isFirstKill_ = flag;
580 if (flag)
581 setKill(flag);
582 }
583
584 /* When there are multiple operands killing the same temporary,
585 * isFirstKill() is only returns true for the first one. */
586 constexpr bool isFirstKill() const noexcept
587 {
588 return isFirstKill_;
589 }
590
591 constexpr bool isKillBeforeDef() const noexcept
592 {
593 return isKill() && !isLateKill();
594 }
595
596 constexpr bool isFirstKillBeforeDef() const noexcept
597 {
598 return isFirstKill() && !isLateKill();
599 }
600
601 constexpr bool operator == (Operand other) const noexcept
602 {
603 if (other.size() != size())
604 return false;
605 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
606 return false;
607 if (isFixed() && other.isFixed() && physReg() != other.physReg())
608 return false;
609 if (isLiteral())
610 return other.isLiteral() && other.constantValue() == constantValue();
611 else if (isConstant())
612 return other.isConstant() && other.physReg() == physReg();
613 else if (isUndefined())
614 return other.isUndefined() && other.regClass() == regClass();
615 else
616 return other.isTemp() && other.getTemp() == getTemp();
617 }
618 private:
619 union {
620 uint32_t i;
621 float f;
622 Temp temp = Temp(0, s1);
623 } data_;
624 PhysReg reg_;
625 union {
626 struct {
627 uint8_t isTemp_:1;
628 uint8_t isFixed_:1;
629 uint8_t isConstant_:1;
630 uint8_t isKill_:1;
631 uint8_t isUndef_:1;
632 uint8_t isFirstKill_:1;
633 uint8_t is64BitConst_:1;
634 uint8_t isLateKill_:1;
635 };
636 /* can't initialize bit-fields in c++11, so work around using a union */
637 uint8_t control_ = 0;
638 };
639 };
640
641 /**
642 * Definition Class
643 * Definitions are the results of Instructions
644 * and refer to temporary virtual registers
645 * which are later mapped to physical registers
646 */
647 class Definition final
648 {
649 public:
650 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
651 Definition(uint32_t index, RegClass type) noexcept
652 : temp(index, type) {}
653 explicit Definition(Temp tmp) noexcept
654 : temp(tmp) {}
655 Definition(PhysReg reg, RegClass type) noexcept
656 : temp(Temp(0, type))
657 {
658 setFixed(reg);
659 }
660 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
661 : temp(Temp(tmpId, type))
662 {
663 setFixed(reg);
664 }
665
666 constexpr bool isTemp() const noexcept
667 {
668 return tempId() > 0;
669 }
670
671 constexpr Temp getTemp() const noexcept
672 {
673 return temp;
674 }
675
676 constexpr uint32_t tempId() const noexcept
677 {
678 return temp.id();
679 }
680
681 constexpr void setTemp(Temp t) noexcept {
682 temp = t;
683 }
684
685 constexpr RegClass regClass() const noexcept
686 {
687 return temp.regClass();
688 }
689
690 constexpr unsigned bytes() const noexcept
691 {
692 return temp.bytes();
693 }
694
695 constexpr unsigned size() const noexcept
696 {
697 return temp.size();
698 }
699
700 constexpr bool isFixed() const noexcept
701 {
702 return isFixed_;
703 }
704
705 constexpr PhysReg physReg() const noexcept
706 {
707 return reg_;
708 }
709
710 constexpr void setFixed(PhysReg reg) noexcept
711 {
712 isFixed_ = 1;
713 reg_ = reg;
714 }
715
716 constexpr void setHint(PhysReg reg) noexcept
717 {
718 hasHint_ = 1;
719 reg_ = reg;
720 }
721
722 constexpr bool hasHint() const noexcept
723 {
724 return hasHint_;
725 }
726
727 constexpr void setKill(bool flag) noexcept
728 {
729 isKill_ = flag;
730 }
731
732 constexpr bool isKill() const noexcept
733 {
734 return isKill_;
735 }
736
737 private:
738 Temp temp = Temp(0, s1);
739 PhysReg reg_;
740 union {
741 struct {
742 uint8_t isFixed_:1;
743 uint8_t hasHint_:1;
744 uint8_t isKill_:1;
745 };
746 /* can't initialize bit-fields in c++11, so work around using a union */
747 uint8_t control_ = 0;
748 };
749 };
750
751 struct Block;
752
753 struct Instruction {
754 aco_opcode opcode;
755 Format format;
756 uint32_t pass_flags;
757
758 aco::span<Operand> operands;
759 aco::span<Definition> definitions;
760
761 constexpr bool isVALU() const noexcept
762 {
763 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
764 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
765 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
766 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
767 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
768 || format == Format::VOP3P;
769 }
770
771 constexpr bool isSALU() const noexcept
772 {
773 return format == Format::SOP1 ||
774 format == Format::SOP2 ||
775 format == Format::SOPC ||
776 format == Format::SOPK ||
777 format == Format::SOPP;
778 }
779
780 constexpr bool isVMEM() const noexcept
781 {
782 return format == Format::MTBUF ||
783 format == Format::MUBUF ||
784 format == Format::MIMG;
785 }
786
787 constexpr bool isDPP() const noexcept
788 {
789 return (uint16_t) format & (uint16_t) Format::DPP;
790 }
791
792 constexpr bool isVOP3() const noexcept
793 {
794 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
795 ((uint16_t) format & (uint16_t) Format::VOP3B);
796 }
797
798 constexpr bool isSDWA() const noexcept
799 {
800 return (uint16_t) format & (uint16_t) Format::SDWA;
801 }
802
803 constexpr bool isFlatOrGlobal() const noexcept
804 {
805 return format == Format::FLAT || format == Format::GLOBAL;
806 }
807
808 constexpr bool usesModifiers() const noexcept;
809
810 constexpr bool reads_exec() const noexcept
811 {
812 for (const Operand& op : operands) {
813 if (op.isFixed() && op.physReg() == exec)
814 return true;
815 }
816 return false;
817 }
818 };
819 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
820
821 struct SOPK_instruction : public Instruction {
822 uint16_t imm;
823 uint16_t padding;
824 };
825 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
826
827 struct SOPP_instruction : public Instruction {
828 uint32_t imm;
829 int block;
830 };
831 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
832
833 struct SOPC_instruction : public Instruction {
834 };
835 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
836
837 struct SOP1_instruction : public Instruction {
838 };
839 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
840
841 struct SOP2_instruction : public Instruction {
842 };
843 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
844
845 /**
846 * Scalar Memory Format:
847 * For s_(buffer_)load_dword*:
848 * Operand(0): SBASE - SGPR-pair which provides base address
849 * Operand(1): Offset - immediate (un)signed offset or SGPR
850 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
851 * Operand(n-1): SOffset - SGPR offset (Vega only)
852 *
853 * Having no operands is also valid for instructions such as s_dcache_inv.
854 *
855 */
856 struct SMEM_instruction : public Instruction {
857 barrier_interaction barrier;
858 bool glc : 1; /* VI+: globally coherent */
859 bool dlc : 1; /* NAVI: device level coherent */
860 bool nv : 1; /* VEGA only: Non-volatile */
861 bool can_reorder : 1;
862 bool disable_wqm : 1;
863 uint32_t padding: 19;
864 };
865 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
866
867 struct VOP1_instruction : public Instruction {
868 };
869 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
870
871 struct VOP2_instruction : public Instruction {
872 };
873 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
874
875 struct VOPC_instruction : public Instruction {
876 };
877 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
878
879 struct VOP3A_instruction : public Instruction {
880 bool abs[3];
881 bool neg[3];
882 uint8_t opsel : 4;
883 uint8_t omod : 2;
884 bool clamp : 1;
885 uint32_t padding : 9;
886 };
887 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
888
889 struct VOP3P_instruction : public Instruction {
890 bool neg_lo[3];
891 bool neg_hi[3];
892 uint8_t opsel_lo : 3;
893 uint8_t opsel_hi : 3;
894 bool clamp : 1;
895 uint32_t padding : 9;
896 };
897 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
898
899 /**
900 * Data Parallel Primitives Format:
901 * This format can be used for VOP1, VOP2 or VOPC instructions.
902 * The swizzle applies to the src0 operand.
903 *
904 */
905 struct DPP_instruction : public Instruction {
906 bool abs[2];
907 bool neg[2];
908 uint16_t dpp_ctrl;
909 uint8_t row_mask : 4;
910 uint8_t bank_mask : 4;
911 bool bound_ctrl : 1;
912 uint32_t padding : 7;
913 };
914 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
915
916 enum sdwa_sel : uint8_t {
917 /* masks */
918 sdwa_wordnum = 0x1,
919 sdwa_bytenum = 0x3,
920 sdwa_asuint = 0x7 | 0x10,
921 sdwa_rasize = 0x3,
922
923 /* flags */
924 sdwa_isword = 0x4,
925 sdwa_sext = 0x8,
926 sdwa_isra = 0x10,
927
928 /* specific values */
929 sdwa_ubyte0 = 0,
930 sdwa_ubyte1 = 1,
931 sdwa_ubyte2 = 2,
932 sdwa_ubyte3 = 3,
933 sdwa_uword0 = sdwa_isword | 0,
934 sdwa_uword1 = sdwa_isword | 1,
935 sdwa_udword = 6,
936
937 sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
938 sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
939 sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
940 sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
941 sdwa_sword0 = sdwa_uword0 | sdwa_sext,
942 sdwa_sword1 = sdwa_uword1 | sdwa_sext,
943 sdwa_sdword = sdwa_udword | sdwa_sext,
944
945 /* register-allocated */
946 sdwa_ubyte = 1 | sdwa_isra,
947 sdwa_uword = 2 | sdwa_isra,
948 sdwa_sbyte = sdwa_ubyte | sdwa_sext,
949 sdwa_sword = sdwa_uword | sdwa_sext,
950 };
951
952 /**
953 * Sub-Dword Addressing Format:
954 * This format can be used for VOP1, VOP2 or VOPC instructions.
955 *
956 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
957 * the definition doesn't have to be VCC on GFX9+.
958 *
959 */
960 struct SDWA_instruction : public Instruction {
961 /* these destination modifiers aren't available with VOPC except for
962 * clamp on GFX8 */
963 uint8_t sel[2];
964 uint8_t dst_sel;
965 bool neg[2];
966 bool abs[2];
967 bool dst_preserve : 1;
968 bool clamp : 1;
969 uint8_t omod : 2; /* GFX9+ */
970 uint32_t padding : 4;
971 };
972 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
973
974 struct Interp_instruction : public Instruction {
975 uint8_t attribute;
976 uint8_t component;
977 uint16_t padding;
978 };
979 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
980
981 /**
982 * Local and Global Data Sharing instructions
983 * Operand(0): ADDR - VGPR which supplies the address.
984 * Operand(1): DATA0 - First data VGPR.
985 * Operand(2): DATA1 - Second data VGPR.
986 * Operand(n-1): M0 - LDS size.
987 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
988 *
989 */
990 struct DS_instruction : public Instruction {
991 int16_t offset0;
992 int8_t offset1;
993 bool gds;
994 };
995 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
996
997 /**
998 * Vector Memory Untyped-buffer Instructions
999 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1000 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1001 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1002 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1003 *
1004 */
1005 struct MUBUF_instruction : public Instruction {
1006 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1007 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1008 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1009 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1010 bool glc : 1; /* globally coherent */
1011 bool dlc : 1; /* NAVI: device level coherent */
1012 bool slc : 1; /* system level coherent */
1013 bool tfe : 1; /* texture fail enable */
1014 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1015 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1016 bool can_reorder : 1;
1017 uint8_t padding : 2;
1018 barrier_interaction barrier;
1019 };
1020 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1021
1022 /**
1023 * Vector Memory Typed-buffer Instructions
1024 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1025 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1026 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1027 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1028 *
1029 */
1030 struct MTBUF_instruction : public Instruction {
1031 uint16_t offset; /* Unsigned byte offset - 12 bit */
1032 barrier_interaction barrier;
1033 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1034 uint8_t nfmt : 3; /* Numeric format of data in memory */
1035 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1036 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1037 bool glc : 1; /* globally coherent */
1038 bool dlc : 1; /* NAVI: device level coherent */
1039 bool slc : 1; /* system level coherent */
1040 bool tfe : 1; /* texture fail enable */
1041 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1042 bool can_reorder : 1;
1043 uint32_t padding : 25;
1044 };
1045 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1046
1047 /**
1048 * Vector Memory Image Instructions
1049 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1050 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1051 * or VDATA - Vector GPR for write data.
1052 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1053 * Definition(0): VDATA - Vector GPR for read result.
1054 *
1055 */
1056 struct MIMG_instruction : public Instruction {
1057 uint8_t dmask; /* Data VGPR enable mask */
1058 uint8_t dim : 3; /* NAVI: dimensionality */
1059 bool unrm : 1; /* Force address to be un-normalized */
1060 bool dlc : 1; /* NAVI: device level coherent */
1061 bool glc : 1; /* globally coherent */
1062 bool slc : 1; /* system level coherent */
1063 bool tfe : 1; /* texture fail enable */
1064 bool da : 1; /* declare an array */
1065 bool lwe : 1; /* Force data to be un-normalized */
1066 bool r128 : 1; /* NAVI: Texture resource size */
1067 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1068 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1069 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1070 bool can_reorder : 1;
1071 uint8_t padding : 1;
1072 barrier_interaction barrier;
1073 };
1074 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1075
1076 /**
1077 * Flat/Scratch/Global Instructions
1078 * Operand(0): ADDR
1079 * Operand(1): SADDR
1080 * Operand(2) / Definition(0): DATA/VDST
1081 *
1082 */
1083 struct FLAT_instruction : public Instruction {
1084 uint16_t offset; /* Vega/Navi only */
1085 bool slc : 1; /* system level coherent */
1086 bool glc : 1; /* globally coherent */
1087 bool dlc : 1; /* NAVI: device level coherent */
1088 bool lds : 1;
1089 bool nv : 1;
1090 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1091 bool can_reorder : 1;
1092 uint8_t padding : 1;
1093 barrier_interaction barrier;
1094 };
1095 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1096
1097 struct Export_instruction : public Instruction {
1098 uint8_t enabled_mask;
1099 uint8_t dest;
1100 bool compressed : 1;
1101 bool done : 1;
1102 bool valid_mask : 1;
1103 uint32_t padding : 13;
1104 };
1105 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1106
1107 struct Pseudo_instruction : public Instruction {
1108 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1109 bool tmp_in_scc;
1110 uint8_t padding;
1111 };
1112 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1113
1114 struct Pseudo_branch_instruction : public Instruction {
1115 /* target[0] is the block index of the branch target.
1116 * For conditional branches, target[1] contains the fall-through alternative.
1117 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1118 */
1119 uint32_t target[2];
1120 };
1121 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1122
1123 struct Pseudo_barrier_instruction : public Instruction {
1124 };
1125 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1126
1127 enum ReduceOp : uint16_t {
1128 iadd8, iadd16, iadd32, iadd64,
1129 imul8, imul16, imul32, imul64,
1130 fadd16, fadd32, fadd64,
1131 fmul16, fmul32, fmul64,
1132 imin8, imin16, imin32, imin64,
1133 imax8, imax16, imax32, imax64,
1134 umin8, umin16, umin32, umin64,
1135 umax8, umax16, umax32, umax64,
1136 fmin16, fmin32, fmin64,
1137 fmax16, fmax32, fmax64,
1138 iand8, iand16, iand32, iand64,
1139 ior8, ior16, ior32, ior64,
1140 ixor8, ixor16, ixor32, ixor64,
1141 };
1142
1143 /**
1144 * Subgroup Reduction Instructions, everything except for the data to be
1145 * reduced and the result as inserted by setup_reduce_temp().
1146 * Operand(0): data to be reduced
1147 * Operand(1): reduce temporary
1148 * Operand(2): vector temporary
1149 * Definition(0): result
1150 * Definition(1): scalar temporary
1151 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1152 * Definition(3): scc clobber
1153 * Definition(4): vcc clobber
1154 *
1155 */
1156 struct Pseudo_reduction_instruction : public Instruction {
1157 ReduceOp reduce_op;
1158 uint16_t cluster_size; // must be 0 for scans
1159 };
1160 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1161
1162 struct instr_deleter_functor {
1163 void operator()(void* p) {
1164 free(p);
1165 }
1166 };
1167
1168 template<typename T>
1169 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1170
1171 template<typename T>
1172 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1173 {
1174 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1175 char *data = (char*) calloc(1, size);
1176 T* inst = (T*) data;
1177
1178 inst->opcode = opcode;
1179 inst->format = format;
1180
1181 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1182 inst->operands = aco::span<Operand>(operands_offset, num_operands);
1183 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1184 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1185
1186 return inst;
1187 }
1188
1189 constexpr bool Instruction::usesModifiers() const noexcept
1190 {
1191 if (isDPP() || isSDWA())
1192 return true;
1193
1194 if (format == Format::VOP3P) {
1195 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1196 for (unsigned i = 0; i < operands.size(); i++) {
1197 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1198 return true;
1199 }
1200 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1201 } else if (isVOP3()) {
1202 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1203 for (unsigned i = 0; i < operands.size(); i++) {
1204 if (vop3->abs[i] || vop3->neg[i])
1205 return true;
1206 }
1207 return vop3->opsel || vop3->clamp || vop3->omod;
1208 }
1209 return false;
1210 }
1211
1212 constexpr bool is_phi(Instruction* instr)
1213 {
1214 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1215 }
1216
1217 static inline bool is_phi(aco_ptr<Instruction>& instr)
1218 {
1219 return is_phi(instr.get());
1220 }
1221
1222 barrier_interaction get_barrier_interaction(const Instruction* instr);
1223
1224 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1225
1226 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1227
1228 enum block_kind {
1229 /* uniform indicates that leaving this block,
1230 * all actives lanes stay active */
1231 block_kind_uniform = 1 << 0,
1232 block_kind_top_level = 1 << 1,
1233 block_kind_loop_preheader = 1 << 2,
1234 block_kind_loop_header = 1 << 3,
1235 block_kind_loop_exit = 1 << 4,
1236 block_kind_continue = 1 << 5,
1237 block_kind_break = 1 << 6,
1238 block_kind_continue_or_break = 1 << 7,
1239 block_kind_discard = 1 << 8,
1240 block_kind_branch = 1 << 9,
1241 block_kind_merge = 1 << 10,
1242 block_kind_invert = 1 << 11,
1243 block_kind_uses_discard_if = 1 << 12,
1244 block_kind_needs_lowering = 1 << 13,
1245 block_kind_uses_demote = 1 << 14,
1246 block_kind_export_end = 1 << 15,
1247 };
1248
1249
1250 struct RegisterDemand {
1251 constexpr RegisterDemand() = default;
1252 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1253 : vgpr{v}, sgpr{s} {}
1254 int16_t vgpr = 0;
1255 int16_t sgpr = 0;
1256
1257 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1258 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1259 }
1260
1261 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1262 return vgpr > other.vgpr || sgpr > other.sgpr;
1263 }
1264
1265 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1266 if (t.type() == RegType::sgpr)
1267 return RegisterDemand( vgpr, sgpr + t.size() );
1268 else
1269 return RegisterDemand( vgpr + t.size(), sgpr );
1270 }
1271
1272 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1273 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1274 }
1275
1276 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1277 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1278 }
1279
1280 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1281 vgpr += other.vgpr;
1282 sgpr += other.sgpr;
1283 return *this;
1284 }
1285
1286 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1287 vgpr -= other.vgpr;
1288 sgpr -= other.sgpr;
1289 return *this;
1290 }
1291
1292 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1293 if (t.type() == RegType::sgpr)
1294 sgpr += t.size();
1295 else
1296 vgpr += t.size();
1297 return *this;
1298 }
1299
1300 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1301 if (t.type() == RegType::sgpr)
1302 sgpr -= t.size();
1303 else
1304 vgpr -= t.size();
1305 return *this;
1306 }
1307
1308 constexpr void update(const RegisterDemand other) noexcept {
1309 vgpr = std::max(vgpr, other.vgpr);
1310 sgpr = std::max(sgpr, other.sgpr);
1311 }
1312
1313 };
1314
1315 /* CFG */
1316 struct Block {
1317 float_mode fp_mode;
1318 unsigned index;
1319 unsigned offset = 0;
1320 std::vector<aco_ptr<Instruction>> instructions;
1321 std::vector<unsigned> logical_preds;
1322 std::vector<unsigned> linear_preds;
1323 std::vector<unsigned> logical_succs;
1324 std::vector<unsigned> linear_succs;
1325 RegisterDemand register_demand = RegisterDemand();
1326 uint16_t loop_nest_depth = 0;
1327 uint16_t kind = 0;
1328 int logical_idom = -1;
1329 int linear_idom = -1;
1330 Temp live_out_exec = Temp();
1331
1332 /* this information is needed for predecessors to blocks with phis when
1333 * moving out of ssa */
1334 bool scc_live_out = false;
1335 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1336
1337 Block(unsigned idx) : index(idx) {}
1338 Block() : index(0) {}
1339 };
1340
1341 using Stage = uint16_t;
1342
1343 /* software stages */
1344 static constexpr Stage sw_vs = 1 << 0;
1345 static constexpr Stage sw_gs = 1 << 1;
1346 static constexpr Stage sw_tcs = 1 << 2;
1347 static constexpr Stage sw_tes = 1 << 3;
1348 static constexpr Stage sw_fs = 1 << 4;
1349 static constexpr Stage sw_cs = 1 << 5;
1350 static constexpr Stage sw_gs_copy = 1 << 6;
1351 static constexpr Stage sw_mask = 0x7f;
1352
1353 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1354 static constexpr Stage hw_vs = 1 << 7;
1355 static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1356 static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
1357 static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
1358 static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1359 static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1360 static constexpr Stage hw_fs = 1 << 13;
1361 static constexpr Stage hw_cs = 1 << 14;
1362 static constexpr Stage hw_mask = 0xff << 7;
1363
1364 /* possible settings of Program::stage */
1365 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1366 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1367 static constexpr Stage compute_cs = sw_cs | hw_cs;
1368 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1369 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1370 /* GFX10/NGG */
1371 static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
1372 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
1373 static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
1374 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
1375 /* GFX9 (and GFX10 if NGG isn't used) */
1376 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1377 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1378 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1379 /* pre-GFX9 */
1380 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
1381 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1382 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1383 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
1384 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1385
1386 enum statistic {
1387 statistic_hash,
1388 statistic_instructions,
1389 statistic_copies,
1390 statistic_branches,
1391 statistic_cycles,
1392 statistic_vmem_clauses,
1393 statistic_smem_clauses,
1394 statistic_vmem_score,
1395 statistic_smem_score,
1396 statistic_sgpr_presched,
1397 statistic_vgpr_presched,
1398 num_statistics
1399 };
1400
1401 class Program final {
1402 public:
1403 float_mode next_fp_mode;
1404 std::vector<Block> blocks;
1405 RegisterDemand max_reg_demand = RegisterDemand();
1406 uint16_t num_waves = 0;
1407 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1408 ac_shader_config* config;
1409 struct radv_shader_info *info;
1410 enum chip_class chip_class;
1411 enum radeon_family family;
1412 unsigned wave_size;
1413 RegClass lane_mask;
1414 Stage stage; /* Stage */
1415 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1416 bool needs_wqm = false; /* there exists a p_wqm instruction */
1417 bool wb_smem_l1_on_end = false;
1418
1419 std::vector<uint8_t> constant_data;
1420 Temp private_segment_buffer;
1421 Temp scratch_offset;
1422
1423 uint16_t min_waves = 0;
1424 uint16_t lds_alloc_granule;
1425 uint32_t lds_limit; /* in bytes */
1426 bool has_16bank_lds;
1427 uint16_t vgpr_limit;
1428 uint16_t sgpr_limit;
1429 uint16_t physical_sgprs;
1430 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1431 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1432 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1433
1434 bool xnack_enabled = false;
1435 bool sram_ecc_enabled = false;
1436
1437 bool needs_vcc = false;
1438 bool needs_flat_scr = false;
1439
1440 bool collect_statistics = false;
1441 uint32_t statistics[num_statistics];
1442
1443 uint32_t allocateId()
1444 {
1445 assert(allocationID <= 16777215);
1446 return allocationID++;
1447 }
1448
1449 uint32_t peekAllocationId()
1450 {
1451 return allocationID;
1452 }
1453
1454 void setAllocationId(uint32_t id)
1455 {
1456 allocationID = id;
1457 }
1458
1459 Block* create_and_insert_block() {
1460 blocks.emplace_back(blocks.size());
1461 blocks.back().fp_mode = next_fp_mode;
1462 return &blocks.back();
1463 }
1464
1465 Block* insert_block(Block&& block) {
1466 block.index = blocks.size();
1467 block.fp_mode = next_fp_mode;
1468 blocks.emplace_back(std::move(block));
1469 return &blocks.back();
1470 }
1471
1472 private:
1473 uint32_t allocationID = 1;
1474 };
1475
1476 struct TempHash {
1477 std::size_t operator()(Temp t) const {
1478 return t.id();
1479 }
1480 };
1481 using TempSet = std::unordered_set<Temp, TempHash>;
1482
1483 struct live {
1484 /* live temps out per block */
1485 std::vector<TempSet> live_out;
1486 /* register demand (sgpr/vgpr) per instruction per block */
1487 std::vector<std::vector<RegisterDemand>> register_demand;
1488 };
1489
1490 void select_program(Program *program,
1491 unsigned shader_count,
1492 struct nir_shader *const *shaders,
1493 ac_shader_config* config,
1494 struct radv_shader_args *args);
1495 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1496 ac_shader_config* config,
1497 struct radv_shader_args *args);
1498
1499 void lower_wqm(Program* program, live& live_vars,
1500 const struct radv_nir_compiler_options *options);
1501 void lower_phis(Program* program);
1502 void calc_min_waves(Program* program);
1503 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1504 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1505 std::vector<uint16_t> dead_code_analysis(Program *program);
1506 void dominator_tree(Program* program);
1507 void insert_exec_mask(Program *program);
1508 void value_numbering(Program* program);
1509 void optimize(Program* program);
1510 void setup_reduce_temp(Program* program);
1511 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1512 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1513 void ssa_elimination(Program* program);
1514 void lower_to_hw_instr(Program* program);
1515 void schedule_program(Program* program, live& live_vars);
1516 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1517 void insert_wait_states(Program* program);
1518 void insert_NOPs(Program* program);
1519 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1520 void print_asm(Program *program, std::vector<uint32_t>& binary,
1521 unsigned exec_size, std::ostream& out);
1522 void validate(Program* program, FILE *output);
1523 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1524 #ifndef NDEBUG
1525 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1526 #else
1527 #define perfwarn(program, cond, msg, ...) do {} while(0)
1528 #endif
1529
1530 void collect_presched_stats(Program *program);
1531 void collect_preasm_stats(Program *program);
1532 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1533
1534 void aco_print_instr(const Instruction *instr, FILE *output);
1535 void aco_print_program(const Program *program, FILE *output);
1536
1537 /* utilities for dealing with register demand */
1538 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1539 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1540 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1541
1542 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1543 uint16_t get_extra_sgprs(Program *program);
1544
1545 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1546 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1547 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1548
1549 /* return number of addressable sgprs/vgprs for max_waves */
1550 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1551 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1552
1553 typedef struct {
1554 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1555 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1556 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1557 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1558 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1559 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1560 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1561 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1562 } Info;
1563
1564 extern const Info instr_info;
1565
1566 }
1567
1568 #endif /* ACO_IR_H */
1569