68d0b9bf4ceed9fce26c16e512fb6edff352818a
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
/* Global bitmask of the DEBUG_* flags below, enabling extra checks/warnings.
 * Set outside this header — presumably from a debug environment variable;
 * confirm in the driver code. */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
};
53
54 /**
55 * Representation of the instruction's microcode encoding format
56 * Note: Some Vector ALU Formats can be combined, such that:
57 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
58 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
59 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
60 *
61 * (*) The same is applicable for VOP1 and VOPC instructions.
62 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats.
    * These are single bits (>= 1 << 8) so they can be OR'ed together,
    * e.g. VOP2 | SDWA — see the comment above this enum and asVOP3()/asSDWA(). */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B share the VOP3 bit; they are distinct encodings
    * of the same format class. */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kinds of memory/synchronization an instruction
 * interacts with; used to constrain reordering across barriers. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* number of distinct barrier bits above (0x1 .. 0x40) */
   barrier_count = 7,
};
121
/* Float rounding modes; values match the 2-bit MODE-register encoding
 * (see float_mode below): ne = nearest-even, pi = +inf, ni = -inf,
 * tz = toward zero. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
128
/* Denormal handling; values match the 2-bit MODE-register encoding
 * (see float_mode below). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
135
/* Floating-point mode state for a block of instructions: the hardware MODE
 * register bits plus flags constraining which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      /* all four MODE fields viewed as one byte, for cheap comparison */
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE register bits must match exactly, and for
    * every restriction flag set in "other", the same flag must also be set
    * here (i.e. this mode's restrictions are a superset of other's). */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
168
169 constexpr Format asVOP3(Format format) {
170 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
171 };
172
173 constexpr Format asSDWA(Format format) {
174 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
175 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
176 }
177
/* Kind of register file a value lives in. linear_vgpr denotes VGPRs with a
 * "linear" (non-logical) live range — see RegClass::is_linear(). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
184
/* Register class: the size and type of a value, packed into one byte.
 * Encoding (derived from the RC values below):
 *   bits [4:0] — size (in dwords normally, in bytes if bit 7 is set)
 *   bit 5      — VGPR (otherwise SGPR)
 *   bit 6      — linear live range (WWM / VGPR spills)
 *   bit 7      — sub-dword: size field counts bytes instead of dwords
 */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   /* size is in dwords here; use get() for byte-granular construction */
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* all SGPR RC values are <= s16; anything above has the VGPR bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* size field counts bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* size in dwords, rounding sub-dword classes up */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* SGPRs are always linear; VGPRs only when the linear bit is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   /* Builds a register class from a byte count. SGPRs are rounded up to
    * whole dwords; VGPRs with a non-multiple-of-4 size become sub-dword. */
   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                             RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
245
/* transitional helper expressions: shorthand aliases so code can write e.g.
 * `v1` instead of `RegClass::v1`. Note not every RC value has an alias here
 * (e.g. s6 and the linear classes don't). */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
267
268 /**
269 * Temp Class
270 * Each temporary virtual register has a
271 * register class (i.e. size and type)
272 * and SSA id.
273 */
/* SSA temporary: a 24-bit SSA id plus an 8-bit RegClass, packed into 4 bytes.
 * Identity (ordering/equality) is by id only — the register class is ignored
 * in comparisons. */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   /* convenience forwards to the register class */
   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;
   uint32_t reg_class : 8;
};
295
296 /**
297 * PhysReg
298 * Represents the physical register for each
299 * Operand and Definition.
300 */
/* A physical register location with byte granularity: reg_b stores the
 * byte address, i.e. 4 * register-index + byte-offset-within-dword. */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r * 4) {}
   /* dword-granular register index */
   constexpr unsigned reg() const { return reg_b / 4u; }
   /* byte offset within that register */
   constexpr unsigned byte() const { return reg_b % 4u; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return !(*this == other); }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* returns a copy moved forward by the given number of bytes */
   constexpr PhysReg advance(unsigned bytes) const { PhysReg next = *this; next.reg_b += bytes; return next; }

   uint16_t reg_b = 0;
};
314
/* helper expressions for special registers; the numeric values are the
 * architectural register encodings these occupy in operand fields */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
326
327 /**
328 * Operand Class
329 * Initially, each Operand refers to either
330 * a temporary virtual register
331 * or to a constant value
332 * Temporary registers get mapped to physical register during RA
333 * Constant values are inlined into the instruction sequence.
334 */
class Operand final
{
public:
   /* Default: an undefined operand, fixed to register 128 (the encoding of
    * inline constant 0). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   /* Operand referring to the temporary r; an id of 0 produces an
    * undefined operand instead. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant operand. Values with a hardware inline-constant
    * encoding are fixed to the corresponding constant register:
    * 128+v for 0..64, 192-v (with unsigned wrap) for -16..-1, and
    * 240..247 for the special float constants. Anything else becomes a
    * literal (register 255), which is invalid for 64-bit constants. */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant operand. Only values with a 64-bit inline-constant
    * encoding are representable (small integers and the special doubles);
    * data_.i keeps the 32-bit equivalent, constantValue64() reconstructs
    * the full value from the fixed register. 64-bit literals are invalid. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand pinned to a physical register (with temporary id 0). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* constants have no register class; temporaries and undefs do */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
      else
         return data_.temp.bytes();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* PhysReg{unsigned(-1)} acts as a "not fixed" sentinel */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a constant without an inline encoding, i.e. fixed to register 255 */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Full constant value. For 64-bit constants, reconstructs the value from
    * the inline-constant register number (the reverse of Operand(uint64_t));
    * for 32-bit constants, optionally sign-extends data_.i to 64 bits. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* setting first-kill implies kill */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   /* Structural equality: same size, same fixed/kill status, and the same
    * underlying literal, inline constant, undef class or temporary. */
   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
645
646 /**
647 * Definition Class
648 * Definitions are the results of Instructions
649 * and refer to temporary virtual registers
650 * which are later mapped to physical registers
651 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0) {}
   /* definition of a fresh temporary with the given SSA index */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition pinned to a physical register (temporary id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 means "no temporary" (e.g. a purely physical definition) */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* RA hint: stores the preferred register in reg_ without marking the
    * definition fixed. Note hint and fixed assignment share reg_, so a
    * later setFixed() overwrites the hint (and vice versa). */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

   /* "precise" flag — presumably restricts value-changing FP optimizations
    * on this result; confirm against the passes that query isPrecise(). */
   constexpr void setPrecise(bool precise) noexcept
   {
      isPrecise_ = precise;
   }

   constexpr bool isPrecise() const noexcept
   {
      return isPrecise_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
         uint8_t isPrecise_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
766
767 struct Block;
768
/* Base of all IR instructions: opcode, encoding format, and spans over the
 * externally-allocated operand/definition arrays. The is*() helpers test
 * the format, exploiting that the VALU/DPP/SDWA Format values are single
 * bits that can be OR-combined. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch flags; each pass may use these for its own purposes
    * (not interpreted in this header) */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   /* scalar formats are plain enum values, so exact comparison is used */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory formats; FLAT/GLOBAL/SCRATCH are separate (below) */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   /* VOP3A and VOP3B share the same Format bit, so this is one bit test */
   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
836
/* Scalar op with a 16-bit immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
842
/* Scalar program-control op; block is a branch-target block index. */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
848
/* Scalar compare; carries no fields beyond the Instruction base. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
852
/* One-operand scalar op; carries no fields beyond the Instruction base. */
struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
856
/* Two-operand scalar op; carries no fields beyond the Instruction base. */
struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
860
861 /**
862 * Scalar Memory Format:
863 * For s_(buffer_)load_dword*:
864 * Operand(0): SBASE - SGPR-pair which provides base address
865 * Operand(1): Offset - immediate (un)signed offset or SGPR
866 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
867 * Operand(n-1): SOffset - SGPR offset (Vega only)
868 *
869 * Having no operands is also valid for instructions such as s_dcache_inv.
870 *
871 */
struct SMEM_instruction : public Instruction {
   /* which memory/barrier kinds this access interacts with (for scheduling) */
   barrier_interaction barrier;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1; /* whether the scheduler may move this access */
   bool disable_wqm : 1; /* require an exec mask without helper invocations */
   uint32_t padding: 19;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
882
/* One-operand vector op; carries no fields beyond the Instruction base. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
886
/* Two-operand vector op; carries no fields beyond the Instruction base. */
struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
890
/* Vector compare; carries no fields beyond the Instruction base. */
struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
894
/* VOP3A encoding: per-source abs/neg modifiers plus opsel/omod/clamp. */
struct VOP3A_instruction : public Instruction {
   bool abs[3]; /* per-source absolute-value modifier */
   bool neg[3]; /* per-source negate modifier */
   uint8_t opsel : 4; /* operand half selection */
   uint8_t omod : 2;  /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
904
/* VOP3P (packed) encoding: separate modifiers for the lo/hi halves. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
914
915 /**
916 * Data Parallel Primitives Format:
917 * This format can be used for VOP1, VOP2 or VOPC instructions.
918 * The swizzle applies to the src0 operand.
919 *
920 */
struct DPP_instruction : public Instruction {
   bool abs[2]; /* per-source absolute-value modifier */
   bool neg[2]; /* per-source negate modifier */
   uint16_t dpp_ctrl; /* hardware DPP control word (swizzle pattern) */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
931
/* Sub-dword selection encoding. The value is a small bitfield: the low bits
 * index the byte/word, sdwa_isword switches byte->word granularity,
 * sdwa_sext requests sign extension, and sdwa_isra marks a selection whose
 * position is decided by register allocation (only the size is fixed). */
enum sdwa_sel : uint8_t {
    /* masks */
    sdwa_wordnum = 0x1,
    sdwa_bytenum = 0x3,
    sdwa_asuint = 0x7 | 0x10,
    sdwa_rasize = 0x3,

    /* flags */
    sdwa_isword = 0x4,
    sdwa_sext = 0x8,
    sdwa_isra = 0x10,

    /* specific values */
    sdwa_ubyte0 = 0,
    sdwa_ubyte1 = 1,
    sdwa_ubyte2 = 2,
    sdwa_ubyte3 = 3,
    sdwa_uword0 = sdwa_isword | 0,
    sdwa_uword1 = sdwa_isword | 1,
    sdwa_udword = 6,

    sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
    sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
    sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
    sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
    sdwa_sword0 = sdwa_uword0 | sdwa_sext,
    sdwa_sword1 = sdwa_uword1 | sdwa_sext,
    sdwa_sdword = sdwa_udword | sdwa_sext,

    /* register-allocated */
    sdwa_ubyte = 1 | sdwa_isra,
    sdwa_uword = 2 | sdwa_isra,
    sdwa_sbyte = sdwa_ubyte | sdwa_sext,
    sdwa_sword = sdwa_uword | sdwa_sext,
};
967
968 /**
969 * Sub-Dword Addressing Format:
970 * This format can be used for VOP1, VOP2 or VOPC instructions.
971 *
972 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
973 * the definition doesn't have to be VCC on GFX9+.
974 *
975 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];   /* per-source sub-dword selection (sdwa_sel values) */
   uint8_t dst_sel;  /* destination sub-dword selection */
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
989
/* Vector parameter interpolation (VINTRP): selects attribute and component. */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
996
997 /**
998 * Local and Global Data Sharing instructions
999 * Operand(0): ADDR - VGPR which supplies the address.
1000 * Operand(1): DATA0 - First data VGPR.
1001 * Operand(2): DATA1 - Second data VGPR.
1002 * Operand(n-1): M0 - LDS size.
1003 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1004 *
1005 */
struct DS_instruction : public Instruction {
   int16_t offset0;
   int8_t offset1;
   bool gds; /* use GDS instead of LDS */
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1012
1013 /**
1014 * Vector Memory Untyped-buffer Instructions
1015 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1016 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1017 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1018 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1019 *
1020 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* whether the scheduler may move this access */
   uint8_t padding : 2;
   barrier_interaction barrier;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1037
1038 /**
1039 * Vector Memory Typed-buffer Instructions
1040 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1041 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1042 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1043 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1044 *
1045 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   barrier_interaction barrier;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* whether the scheduler may move this access */
   uint32_t padding : 25;
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1062
1063 /**
1064 * Vector Memory Image Instructions
1065 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1066 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1067 * or VDATA - Vector GPR for write data.
1068 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1069 * Definition(0): VDATA - Vector GPR for read result.
1070 *
1071 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* Force data to be un-normalized */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* whether the scheduler may move this access */
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1091
1092 /**
1093 * Flat/Scratch/Global Instructions
1094 * Operand(0): ADDR
1095 * Operand(1): SADDR
1096 * Operand(2) / Definition(0): DATA/VDST
1097 *
1098 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* NOTE(review): presumably the ISA LDS bit (return data to LDS instead of VGPRs) — confirm */
   bool nv : 1; /* NOTE(review): presumably the ISA NV (non-volatile) bit — confirm */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* NOTE(review): presumably lets the scheduler reorder this access past other memory ops — confirm */
   uint8_t padding : 1; /* pad the bitfield so the size check below holds */
   barrier_interaction barrier;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1112
/* EXP: export shader output (fields mirror the hardware EXP encoding). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* ISA "EN": per-component enable mask */
   uint8_t dest; /* ISA "TGT": export target — NOTE(review): confirm target numbering against the ISA docs */
   bool compressed : 1; /* ISA "COMPR": data is 16-bit compressed */
   bool done : 1; /* ISA "DONE": last export of this target */
   bool valid_mask : 1; /* ISA "VM" bit */
   uint32_t padding : 13; /* pad the bitfield so the size check below holds */
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1122
/* Generic pseudo instruction, lowered to real instructions later in the pipeline. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* NOTE(review): presumably set when lowering needs a temporary in scc — confirm in lower_to_hw_instr */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1129
/* Branch pseudo instruction; lowered to real jumps once block offsets are known. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1138
/* Barrier pseudo instruction; carries no fields beyond the base Instruction. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1142
/* Operations for subgroup reductions/scans; the numeric suffix is the
 * bit width of the values being reduced. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1158
1159 /**
1160 * Subgroup Reduction Instructions, everything except for the data to be
1161 * reduced and the result as inserted by setup_reduce_temp().
1162 * Operand(0): data to be reduced
1163 * Operand(1): reduce temporary
1164 * Operand(2): vector temporary
1165 * Definition(0): result
1166 * Definition(1): scalar temporary
1167 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1168 * Definition(3): scc clobber
1169 * Definition(4): vcc clobber
1170 *
1171 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op; /* which reduction to perform (see ReduceOp) */
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1177
/* Deleter for instructions: they are allocated with calloc() by
 * create_instruction(), so they must be released with free(), not delete. */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* Owning pointer type for instructions; releases with free() via the functor above. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1186
/**
 * Allocates a zero-initialized instruction of type T with its Operand and
 * Definition arrays placed in the same allocation, directly after T, and
 * points the instruction's spans at those trailing arrays.
 * The result must be released with free() (see instr_deleter_functor/aco_ptr).
 */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   /* single allocation: [ T | num_operands * Operand | num_definitions * Definition ] */
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* NOTE(review): calloc result is unchecked; OOM would crash on the first store below */
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span takes a uint16_t offset relative to the span member's own
    * address (see aco_util.h), so compute the distance from each span member
    * to the trailing array it should cover. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1204
1205 constexpr bool Instruction::usesModifiers() const noexcept
1206 {
1207 if (isDPP() || isSDWA())
1208 return true;
1209
1210 if (format == Format::VOP3P) {
1211 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1212 for (unsigned i = 0; i < operands.size(); i++) {
1213 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1214 return true;
1215 }
1216 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1217 } else if (isVOP3()) {
1218 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1219 for (unsigned i = 0; i < operands.size(); i++) {
1220 if (vop3->abs[i] || vop3->neg[i])
1221 return true;
1222 }
1223 return vop3->opsel || vop3->clamp || vop3->omod;
1224 }
1225 return false;
1226 }
1227
1228 constexpr bool is_phi(Instruction* instr)
1229 {
1230 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1231 }
1232
1233 static inline bool is_phi(aco_ptr<Instruction>& instr)
1234 {
1235 return is_phi(instr.get());
1236 }
1237
/* Returns the barrier_interaction flags describing what this instruction interacts with. */
barrier_interaction get_barrier_interaction(const Instruction* instr);
/* NOTE(review): presumably true when "instr" can be eliminated given the use counts in "uses" — confirm in the DCE pass. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);

/* Whether the opsel bit for operand/definition "idx" (selecting the "high" half) is usable on this chip. */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1245
/* Flags describing a basic block's role in the CFG; OR'd together into Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1266
1267
1268 struct RegisterDemand {
1269 constexpr RegisterDemand() = default;
1270 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1271 : vgpr{v}, sgpr{s} {}
1272 int16_t vgpr = 0;
1273 int16_t sgpr = 0;
1274
1275 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1276 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1277 }
1278
1279 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1280 return vgpr > other.vgpr || sgpr > other.sgpr;
1281 }
1282
1283 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1284 if (t.type() == RegType::sgpr)
1285 return RegisterDemand( vgpr, sgpr + t.size() );
1286 else
1287 return RegisterDemand( vgpr + t.size(), sgpr );
1288 }
1289
1290 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1291 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1292 }
1293
1294 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1295 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1296 }
1297
1298 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1299 vgpr += other.vgpr;
1300 sgpr += other.sgpr;
1301 return *this;
1302 }
1303
1304 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1305 vgpr -= other.vgpr;
1306 sgpr -= other.sgpr;
1307 return *this;
1308 }
1309
1310 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1311 if (t.type() == RegType::sgpr)
1312 sgpr += t.size();
1313 else
1314 vgpr += t.size();
1315 return *this;
1316 }
1317
1318 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1319 if (t.type() == RegType::sgpr)
1320 sgpr -= t.size();
1321 else
1322 vgpr -= t.size();
1323 return *this;
1324 }
1325
1326 constexpr void update(const RegisterDemand other) noexcept {
1327 vgpr = std::max(vgpr, other.vgpr);
1328 sgpr = std::max(sgpr, other.sgpr);
1329 }
1330
1331 };
1332
1333 /* CFG */
struct Block {
   float_mode fp_mode; /* floating-point mode the block's instructions run under (copied from Program::next_fp_mode) */
   unsigned index; /* position of this block within Program::blocks */
   unsigned offset = 0; /* NOTE(review): presumably the code offset assigned during emission — confirm */
   std::vector<aco_ptr<Instruction>> instructions;
   /* CFG edges (block indices), kept separately for the logical and linear CFGs */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* NOTE(review): presumably the peak demand over this block's instructions — confirm */
   uint16_t loop_nest_depth = 0; /* number of loops this block is nested inside */
   uint16_t kind = 0; /* OR of block_kind flags */
   int logical_idom = -1; /* immediate dominator in the logical CFG; -1 if not computed */
   int linear_idom = -1; /* immediate dominator in the linear CFG; -1 if not computed */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1358
/* Shader stage bitfield: the low 7 bits (sw_mask) identify which API-level
 * (software) stages are merged into this shader, the next 8 bits (hw_mask)
 * identify the hardware stage it executes as. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1403
/* Indices into Program::statistics, gathered when Program::collect_statistics is set. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1418
/* Whole-shader IR: the blocks of the CFG plus target information and limits
 * used throughout compilation. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to newly inserted blocks (see insert helpers below) */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* register class used for per-lane masks — NOTE(review): presumably depends on wave_size; confirm */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false; /* when set, the collect_*_stats passes fill statistics[] */
   uint32_t statistics[num_statistics];

   /* Hands out the next unique id. Ids start at 1; limited to 24 bits (see assert). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would return, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new block with the next index and the current fp_mode. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Moves an existing block into the program, re-indexing it and applying the current fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
1494
/* Temps hash by their id; ids are handed out sequentially (Program::allocateId),
 * so the id itself is a well-distributed hash key. */
struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
using TempSet = std::unordered_set<Temp, TempHash>;
1501
/* Result of liveness analysis (see live_var_analysis), indexed by block. */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1508
/* instruction selection: NIR -> ACO IR */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* individual compilation passes */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
/* NOTE(review): the release-mode macro below takes a leading "program"
 * argument that this declaration lacks — confirm call sites match. */
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of sgprs/vgprs that must be allocated to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1571
/* Per-opcode information tables, indexed by aco_opcode. */
typedef struct {
   /* hardware encodings per chip generation — NOTE(review): presumably a
    * negative value marks an opcode unsupported on that generation; confirm
    * against the generated opcode table. */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;
1587
1588 }
1589
1590 #endif /* ACO_IR_H */
1591