aco: allow reading/writing upper halves/bytes when possible
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
46 extern uint64_t debug_flags;
47
/* Bit flags for the global debug_flags bitmask (set from environment/driver). */
enum {
   DEBUG_VALIDATE = 0x1,    /* run IR validation between passes */
   DEBUG_VALIDATE_RA = 0x2, /* validate the result of register allocation */
   DEBUG_PERFWARN = 0x4,    /* emit performance warnings */
};
53
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats.
    * These use dedicated high bits (rather than sequential values) so that
    * combinable encodings can be OR'ed together, see the comment above. */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11, /* alias of VOP3: A/B variants share the same bit */
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kinds of memory a memory instruction interacts
 * with; used by the scheduler/insert_waitcnt to limit reordering. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* number of distinct barrier bits above */
   barrier_count = 7,
};
121
/* Float rounding modes; values match the hardware MODE register encoding. */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round toward +infinity */
   fp_round_ni = 2, /* round toward -infinity */
   fp_round_tz = 3, /* round toward zero (truncate) */
};
128
/* Denormal handling; values match the hardware MODE register encoding
 * (two bits: input and output denormal handling). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
135
136 struct float_mode {
137 /* matches encoding of the MODE register */
138 union {
139 struct {
140 fp_round round32:2;
141 fp_round round16_64:2;
142 unsigned denorm32:2;
143 unsigned denorm16_64:2;
144 };
145 uint8_t val = 0;
146 };
147 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
148 bool preserve_signed_zero_inf_nan32:1;
149 bool preserve_signed_zero_inf_nan16_64:1;
150 /* if false, optimizations which may remove denormal flushing can be done */
151 bool must_flush_denorms32:1;
152 bool must_flush_denorms16_64:1;
153 bool care_about_round32:1;
154 bool care_about_round16_64:1;
155
156 /* Returns true if instructions using the mode "other" can safely use the
157 * current one instead. */
158 bool canReplace(float_mode other) const noexcept {
159 return val == other.val &&
160 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
161 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
162 (must_flush_denorms32 || !other.must_flush_denorms32) &&
163 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
164 (care_about_round32 || !other.care_about_round32) &&
165 (care_about_round16_64 || !other.care_about_round16_64);
166 }
167 };
168
169 constexpr Format asVOP3(Format format) {
170 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
171 };
172
173 constexpr Format asSDWA(Format format) {
174 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
175 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
176 }
177
/* Kind of register file a register class lives in. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar general-purpose register */
   vgpr,        /* vector general-purpose register */
   linear_vgpr, /* VGPR with linear (non-logical) live range, e.g. for WWM */
};
184
185 struct RegClass {
186
187 enum RC : uint8_t {
188 s1 = 1,
189 s2 = 2,
190 s3 = 3,
191 s4 = 4,
192 s6 = 6,
193 s8 = 8,
194 s16 = 16,
195 v1 = s1 | (1 << 5),
196 v2 = s2 | (1 << 5),
197 v3 = s3 | (1 << 5),
198 v4 = s4 | (1 << 5),
199 v5 = 5 | (1 << 5),
200 v6 = 6 | (1 << 5),
201 v7 = 7 | (1 << 5),
202 v8 = 8 | (1 << 5),
203 /* byte-sized register class */
204 v1b = v1 | (1 << 7),
205 v2b = v2 | (1 << 7),
206 v3b = v3 | (1 << 7),
207 v4b = v4 | (1 << 7),
208 v6b = v6 | (1 << 7),
209 v8b = v8 | (1 << 7),
210 /* these are used for WWM and spills to vgpr */
211 v1_linear = v1 | (1 << 6),
212 v2_linear = v2 | (1 << 6),
213 };
214
215 RegClass() = default;
216 constexpr RegClass(RC rc)
217 : rc(rc) {}
218 constexpr RegClass(RegType type, unsigned size)
219 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
220
221 constexpr operator RC() const { return rc; }
222 explicit operator bool() = delete;
223
224 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
225 constexpr bool is_subdword() const { return rc & (1 << 7); }
226 constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
227 //TODO: use size() less in favor of bytes()
228 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
229 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
230 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
231 constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
232
233 static constexpr RegClass get(RegType type, unsigned bytes) {
234 if (type == RegType::sgpr) {
235 return RegClass(type, DIV_ROUND_UP(bytes, 4u));
236 } else {
237 return bytes % 4u ? RegClass(type, bytes).as_subdword() :
238 RegClass(type, bytes / 4u);
239 }
240 }
241
242 private:
243 RC rc;
244 };
245
246 /* transitional helper expressions */
247 static constexpr RegClass s1{RegClass::s1};
248 static constexpr RegClass s2{RegClass::s2};
249 static constexpr RegClass s3{RegClass::s3};
250 static constexpr RegClass s4{RegClass::s4};
251 static constexpr RegClass s8{RegClass::s8};
252 static constexpr RegClass s16{RegClass::s16};
253 static constexpr RegClass v1{RegClass::v1};
254 static constexpr RegClass v2{RegClass::v2};
255 static constexpr RegClass v3{RegClass::v3};
256 static constexpr RegClass v4{RegClass::v4};
257 static constexpr RegClass v5{RegClass::v5};
258 static constexpr RegClass v6{RegClass::v6};
259 static constexpr RegClass v7{RegClass::v7};
260 static constexpr RegClass v8{RegClass::v8};
261 static constexpr RegClass v1b{RegClass::v1b};
262 static constexpr RegClass v2b{RegClass::v2b};
263 static constexpr RegClass v3b{RegClass::v3b};
264 static constexpr RegClass v4b{RegClass::v4b};
265 static constexpr RegClass v6b{RegClass::v6b};
266 static constexpr RegClass v8b{RegClass::v8b};
267
268 /**
269 * Temp Class
270 * Each temporary virtual register has a
271 * register class (i.e. size and type)
272 * and SSA id.
273 */
274 struct Temp {
275 Temp() noexcept : id_(0), reg_class(0) {}
276 constexpr Temp(uint32_t id, RegClass cls) noexcept
277 : id_(id), reg_class(uint8_t(cls)) {}
278
279 constexpr uint32_t id() const noexcept { return id_; }
280 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
281
282 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
283 constexpr unsigned size() const noexcept { return regClass().size(); }
284 constexpr RegType type() const noexcept { return regClass().type(); }
285 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
286
287 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
288 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
289 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
290
291 private:
292 uint32_t id_: 24;
293 uint32_t reg_class : 8;
294 };
295
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * Stored with byte granularity: the two low bits of reg_b are the byte
 * offset within the dword register, the rest is the register index.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}

   /* dword-granular register index */
   constexpr unsigned reg() const { return reg_b >> 2; }
   /* byte offset (0-3) within the register, for sub-dword accesses */
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }

   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }

   /* returns a copy advanced by the given number of bytes */
   constexpr PhysReg advance(unsigned bytes) const
   {
      PhysReg next = *this;
      next.reg_b += bytes;
      return next;
   }

   uint16_t reg_b = 0;
};
314
/* helper expressions for special registers
 * (values are the scalar-operand encodings of these registers) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
326
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 *
 * Constants are "fixed" to the scalar-source encoding of the hardware inline
 * constant they represent: 128+v for integers 0..64, 193..208 for -1..-16,
 * 240..247 for the common float immediates, and 255 for literals (see the
 * constructors below).
 */
class Operand final
{
public:
   /* default-constructs an undefined operand fixed to encoding 128 (zero) */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   /* wraps a temporary; id 0 produces an undefined operand instead */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant: fixed to its inline-constant encoding when one exists,
    * otherwise to 255 (literal). 64-bit literals cannot be encoded. */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v}); /* unsigned wrap-around yields 193..208 */
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant: only values with a hardware inline-constant encoding
    * are representable; data_.i stores the equivalent 32-bit pattern so
    * constantValue() stays meaningful for the integer cases. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* undefined operand of the given register class */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* operand fixed to a physical register (no temporary, id 0) */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* constants have no register class; temps and undefs do */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* NOTE(review): reg is compared through its implicit unsigned conversion,
    * which returns reg() from 16-bit storage and therefore can never equal
    * unsigned(-1) — isFixed_ is effectively always set here; confirm intent. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* literal = constant without an inline encoding (fixed to 255) */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   /* low 32 bits of the constant */
   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Full 64-bit value: for 64-bit constants, decoded from the fixed inline
    * encoding; for 32-bit constants, the stored value, optionally
    * sign-extended. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128; /* encodings 128..192 -> 0..64 */
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193); /* 193..208 -> -1..-16 */

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000; /* 0.5 */
         case 241:
            return 0xBFE0000000000000; /* -0.5 */
         case 242:
            return 0x3FF0000000000000; /* 1.0 */
         case 243:
            return 0xBFF0000000000000; /* -1.0 */
         case 244:
            return 0x4000000000000000; /* 2.0 */
         case 245:
            return 0xC000000000000000; /* -2.0 */
         case 246:
            return 0x4010000000000000; /* 4.0 */
         case 247:
            return 0xC010000000000000; /* -4.0 */
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* setting first-kill implies kill */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   /* Structural equality: compares size, fixedness/register, then the
    * payload (literal value, constant encoding, reg class, or temp). */
   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
645
646 /**
647 * Definition Class
648 * Definitions are the results of Instructions
649 * and refer to temporary virtual registers
650 * which are later mapped to physical registers
651 */
652 class Definition final
653 {
654 public:
655 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
656 Definition(uint32_t index, RegClass type) noexcept
657 : temp(index, type) {}
658 explicit Definition(Temp tmp) noexcept
659 : temp(tmp) {}
660 Definition(PhysReg reg, RegClass type) noexcept
661 : temp(Temp(0, type))
662 {
663 setFixed(reg);
664 }
665 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
666 : temp(Temp(tmpId, type))
667 {
668 setFixed(reg);
669 }
670
671 constexpr bool isTemp() const noexcept
672 {
673 return tempId() > 0;
674 }
675
676 constexpr Temp getTemp() const noexcept
677 {
678 return temp;
679 }
680
681 constexpr uint32_t tempId() const noexcept
682 {
683 return temp.id();
684 }
685
686 constexpr void setTemp(Temp t) noexcept {
687 temp = t;
688 }
689
690 constexpr RegClass regClass() const noexcept
691 {
692 return temp.regClass();
693 }
694
695 constexpr unsigned bytes() const noexcept
696 {
697 return temp.bytes();
698 }
699
700 constexpr unsigned size() const noexcept
701 {
702 return temp.size();
703 }
704
705 constexpr bool isFixed() const noexcept
706 {
707 return isFixed_;
708 }
709
710 constexpr PhysReg physReg() const noexcept
711 {
712 return reg_;
713 }
714
715 constexpr void setFixed(PhysReg reg) noexcept
716 {
717 isFixed_ = 1;
718 reg_ = reg;
719 }
720
721 constexpr void setHint(PhysReg reg) noexcept
722 {
723 hasHint_ = 1;
724 reg_ = reg;
725 }
726
727 constexpr bool hasHint() const noexcept
728 {
729 return hasHint_;
730 }
731
732 constexpr void setKill(bool flag) noexcept
733 {
734 isKill_ = flag;
735 }
736
737 constexpr bool isKill() const noexcept
738 {
739 return isKill_;
740 }
741
742 private:
743 Temp temp = Temp(0, s1);
744 PhysReg reg_;
745 union {
746 struct {
747 uint8_t isFixed_:1;
748 uint8_t hasHint_:1;
749 uint8_t isKill_:1;
750 };
751 /* can't initialize bit-fields in c++11, so work around using a union */
752 uint8_t control_ = 0;
753 };
754 };
755
756 struct Block;
757
758 struct Instruction {
759 aco_opcode opcode;
760 Format format;
761 uint32_t pass_flags;
762
763 aco::span<Operand> operands;
764 aco::span<Definition> definitions;
765
766 constexpr bool isVALU() const noexcept
767 {
768 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
769 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
770 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
771 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
772 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
773 || format == Format::VOP3P;
774 }
775
776 constexpr bool isSALU() const noexcept
777 {
778 return format == Format::SOP1 ||
779 format == Format::SOP2 ||
780 format == Format::SOPC ||
781 format == Format::SOPK ||
782 format == Format::SOPP;
783 }
784
785 constexpr bool isVMEM() const noexcept
786 {
787 return format == Format::MTBUF ||
788 format == Format::MUBUF ||
789 format == Format::MIMG;
790 }
791
792 constexpr bool isDPP() const noexcept
793 {
794 return (uint16_t) format & (uint16_t) Format::DPP;
795 }
796
797 constexpr bool isVOP3() const noexcept
798 {
799 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
800 ((uint16_t) format & (uint16_t) Format::VOP3B);
801 }
802
803 constexpr bool isSDWA() const noexcept
804 {
805 return (uint16_t) format & (uint16_t) Format::SDWA;
806 }
807
808 constexpr bool isFlatOrGlobal() const noexcept
809 {
810 return format == Format::FLAT || format == Format::GLOBAL;
811 }
812
813 constexpr bool usesModifiers() const noexcept;
814
815 constexpr bool reads_exec() const noexcept
816 {
817 for (const Operand& op : operands) {
818 if (op.isFixed() && op.physReg() == exec)
819 return true;
820 }
821 return false;
822 }
823 };
824 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
825
/* Scalar ALU with a 16-bit immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit immediate (presumably the SIMM16 encoding field — confirm against ISA docs) */
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
831
/* Scalar program-control instruction (branches, waitcnt, sendmsg, ...). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* branch target block index; meaning for non-branches not shown here */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
837
/* SOPC/SOP1/SOP2 carry no fields beyond the base Instruction. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
849
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   barrier_interaction barrier; /* memory kinds this access interacts with (scheduling) */
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1; /* scheduler may move this relative to other memory ops */
   bool disable_wqm : 1; /* require an exec mask without helper invocations */
   uint32_t padding: 19;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
871
/* Plain VOP1/VOP2/VOPC encodings carry no extra fields; modifiers live in
 * the VOP3A/DPP/SDWA variants. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
883
/* VOP3A encoding: three-source VALU with input/output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3]; /* per-source absolute-value modifier */
   bool neg[3]; /* per-source negation modifier */
   uint8_t opsel : 4; /* per-operand half selection (16-bit operands) */
   uint8_t omod : 2; /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
893
/* VOP3P encoding: packed-math VALU operating on 16-bit lo/hi halves. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3]; /* negation of the low halves, per source */
   bool neg_hi[3]; /* negation of the high halves, per source */
   uint8_t opsel_lo : 3; /* half selection for the low result, per source */
   uint8_t opsel_hi : 3; /* half selection for the high result, per source */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
903
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2]; /* per-source absolute-value modifier */
   bool neg[2]; /* per-source negation modifier */
   uint16_t dpp_ctrl; /* swizzle control pattern for src0 */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1; /* behavior for reads from invalid/disabled lanes */
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
920
/* SDWA source/destination selection values.
 * Bit layout: low bits select the byte/word index, sdwa_isword (0x4) marks a
 * word selection, sdwa_sext (0x8) requests sign extension and sdwa_isra
 * (0x10) marks a register-allocated (size-only) selection. */
enum sdwa_sel : uint8_t {
    /* masks */
    sdwa_wordnum = 0x1,  /* word index within the dword */
    sdwa_bytenum = 0x3,  /* byte index within the dword */
    sdwa_asuint = 0x7 | 0x10, /* selection without the sign-extension bit */
    sdwa_rasize = 0x3,   /* size field of register-allocated selections */

    /* flags */
    sdwa_isword = 0x4,
    sdwa_sext = 0x8,
    sdwa_isra = 0x10,

    /* specific values */
    sdwa_ubyte0 = 0,
    sdwa_ubyte1 = 1,
    sdwa_ubyte2 = 2,
    sdwa_ubyte3 = 3,
    sdwa_uword0 = sdwa_isword | 0,
    sdwa_uword1 = sdwa_isword | 1,
    sdwa_udword = 6,

    sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
    sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
    sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
    sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
    sdwa_sword0 = sdwa_uword0 | sdwa_sext,
    sdwa_sword1 = sdwa_uword1 | sdwa_sext,
    sdwa_sdword = sdwa_udword | sdwa_sext,

    /* register-allocated: byte/word offset is taken from the assigned
     * physical register rather than encoded here */
    sdwa_ubyte = 1 | sdwa_isra,
    sdwa_uword = 2 | sdwa_isra,
    sdwa_sbyte = sdwa_ubyte | sdwa_sext,
    sdwa_sword = sdwa_uword | sdwa_sext,
};
956
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];   /* per-source selection (sdwa_sel values) */
   uint8_t dst_sel;  /* destination selection (sdwa_sel value) */
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1; /* keep the unselected destination bits instead of zero/sign-filling */
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
978
/* VINTRP: vector parameter interpolation. */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* attribute index to interpolate */
   uint8_t component; /* component (x/y/z/w) of the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
985
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0; /* first immediate byte offset */
   int8_t offset1;  /* second immediate byte offset (two-address forms) */
   bool gds;        /* operate on GDS instead of LDS */
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1001
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* scheduler may move this relative to other memory ops */
   uint8_t padding : 2;
   barrier_interaction barrier; /* memory kinds this access interacts with */
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1026
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   barrier_interaction barrier; /* memory kinds this access interacts with */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* scheduler may move this relative to other memory ops */
   uint32_t padding : 25;
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1051
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable — NOTE(review): original comment duplicated unrm's
                  * "Force address to be un-normalized" text; confirm against ISA docs */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* scheduler may move this relative to other memory ops */
   uint8_t padding : 1;
   barrier_interaction barrier; /* memory kinds this access interacts with */
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1080
1081 /**
1082 * Flat/Scratch/Global Instructions
1083 * Operand(0): ADDR
1084 * Operand(1): SADDR
1085 * Operand(2) / Definition(0): DATA/VDST
1086 *
1087 */
1088 struct FLAT_instruction : public Instruction {
1089 uint16_t offset; /* Vega/Navi only */
1090 bool slc : 1; /* system level coherent */
1091 bool glc : 1; /* globally coherent */
1092 bool dlc : 1; /* NAVI: device level coherent */
1093 bool lds : 1;
1094 bool nv : 1;
1095 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1096 bool can_reorder : 1;
1097 uint8_t padding : 1;
1098 barrier_interaction barrier;
1099 };
1100 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1101
/* Export of pixel/vertex results (EXP encoding). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* per-component write enable (ISA EN field — TODO confirm) */
   uint8_t dest; /* export target (ISA TGT field: MRT/Z/position/parameter — TODO confirm) */
   bool compressed : 1; /* operands carry packed 16-bit data (ISA COMPR bit — TODO confirm) */
   bool done : 1; /* marks the final export (ISA DONE bit — TODO confirm) */
   bool valid_mask : 1;
   uint32_t padding : 13; /* explicit padding so the size stays fixed (checked below) */
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1111
/* Generic pseudo instruction; lowered to hardware instructions late
 * (see lower_to_hw_instr()). */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* NOTE(review): presumably whether scc holds a live value the lowering must preserve — confirm in lower_to_hw_instr */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1118
/* Branch pseudo instruction, placed at the end of a block. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1127
/* Barrier pseudo instruction; carries no fields beyond the base Instruction. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1131
/* Operations a subgroup reduction can perform, one enumerator per
 * operation and bit width (e.g. iadd8 = 8-bit integer add,
 * fmax64 = 64-bit float max). */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1147
1148 /**
1149 * Subgroup Reduction Instructions, everything except for the data to be
1150 * reduced and the result as inserted by setup_reduce_temp().
1151 * Operand(0): data to be reduced
1152 * Operand(1): reduce temporary
1153 * Operand(2): vector temporary
1154 * Definition(0): result
1155 * Definition(1): scalar temporary
1156 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1157 * Definition(3): scc clobber
1158 * Definition(4): vcc clobber
1159 *
1160 */
1161 struct Pseudo_reduction_instruction : public Instruction {
1162 ReduceOp reduce_op;
1163 uint16_t cluster_size; // must be 0 for scans
1164 };
1165 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1166
/* Deleter for instructions: they are created with a single calloc() in
 * create_instruction(), so ownership ends with a plain free(). */
struct instr_deleter_functor {
   void operator()(void* ptr) {
      free(ptr);
   }
};

/* Owning pointer for instructions — a unique_ptr that frees (rather than
 * deletes) the underlying raw allocation. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1175
/* Allocates and zero-initializes an instruction of type T in a single
 * calloc() block, with storage for its operands and definitions placed
 * directly after the struct. The allocation is released as a whole by
 * instr_deleter_functor (via aco_ptr). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span is constructed from a 16-bit offset relative to the span
    * member's own address, not from a pointer: operands live right after
    * the struct, definitions right after the operands. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1193
1194 constexpr bool Instruction::usesModifiers() const noexcept
1195 {
1196 if (isDPP() || isSDWA())
1197 return true;
1198
1199 if (format == Format::VOP3P) {
1200 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1201 for (unsigned i = 0; i < operands.size(); i++) {
1202 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1203 return true;
1204 }
1205 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1206 } else if (isVOP3()) {
1207 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1208 for (unsigned i = 0; i < operands.size(); i++) {
1209 if (vop3->abs[i] || vop3->neg[i])
1210 return true;
1211 }
1212 return vop3->opsel || vop3->clamp || vop3->omod;
1213 }
1214 return false;
1215 }
1216
1217 constexpr bool is_phi(Instruction* instr)
1218 {
1219 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1220 }
1221
/* Convenience overload of is_phi() for owning pointers. */
static inline bool is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}
1226
/* Returns which barrier classes the given instruction interacts with. */
barrier_interaction get_barrier_interaction(const Instruction* instr);
/* NOTE(review): presumably true when all of instr's results are unused
 * (per the use counts) and it has no needed side effects — confirm in
 * dead_code_analysis users. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);

/* SDWA/opsel legality queries for sub-dword access on a given chip. */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1234
/* Flags describing a basic block's role; OR'd together in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1255
1256
1257 struct RegisterDemand {
1258 constexpr RegisterDemand() = default;
1259 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1260 : vgpr{v}, sgpr{s} {}
1261 int16_t vgpr = 0;
1262 int16_t sgpr = 0;
1263
1264 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1265 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1266 }
1267
1268 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1269 return vgpr > other.vgpr || sgpr > other.sgpr;
1270 }
1271
1272 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1273 if (t.type() == RegType::sgpr)
1274 return RegisterDemand( vgpr, sgpr + t.size() );
1275 else
1276 return RegisterDemand( vgpr + t.size(), sgpr );
1277 }
1278
1279 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1280 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1281 }
1282
1283 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1284 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1285 }
1286
1287 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1288 vgpr += other.vgpr;
1289 sgpr += other.sgpr;
1290 return *this;
1291 }
1292
1293 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1294 vgpr -= other.vgpr;
1295 sgpr -= other.sgpr;
1296 return *this;
1297 }
1298
1299 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1300 if (t.type() == RegType::sgpr)
1301 sgpr += t.size();
1302 else
1303 vgpr += t.size();
1304 return *this;
1305 }
1306
1307 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1308 if (t.type() == RegType::sgpr)
1309 sgpr -= t.size();
1310 else
1311 vgpr -= t.size();
1312 return *this;
1313 }
1314
1315 constexpr void update(const RegisterDemand other) noexcept {
1316 vgpr = std::max(vgpr, other.vgpr);
1317 sgpr = std::max(sgpr, other.sgpr);
1318 }
1319
1320 };
1321
/* CFG */
struct Block {
   float_mode fp_mode; /* float behavior the block's code was compiled for (copied from Program::next_fp_mode) */
   unsigned index; /* position of this block in Program::blocks */
   unsigned offset = 0; /* code offset — presumably filled in during assembly; TODO confirm units */
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessor/successor block indices in the logical and linear CFGs
    * (NOTE(review): presumably divergence-aware vs. wave-level — confirm) */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* register pressure — presumably from live_var_analysis; TODO confirm */
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   int logical_idom = -1; /* immediate dominator index; -1 while unset */
   int linear_idom = -1; /* immediate dominator index; -1 while unset */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1347
/* A shader stage: the OR of the software (API) stages merged into one
 * binary and the hardware stage it executes as (see the combinations
 * under "possible settings of Program::stage" below). */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1392
/* Shader statistics gathered when Program::collect_statistics is set;
 * enumerators index Program::statistics. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1407
/* A whole shader program: the CFG plus target information, resource
 * limits, and state shared between compilation passes. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to newly inserted blocks */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* register class of an exec-mask-sized value — presumably s1/s2 depending on wave_size; TODO confirm */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics]; /* indexed by enum statistic */

   /* Hands out the next temporary id. Ids must fit into 24 bits
    * (16777215 == 0xffffff). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would return, without
    * consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Resets the id counter (NOTE(review): presumably used when temps are
    * renumbered — confirm callers). */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new empty block with the next index and the current
    * next_fp_mode. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Appends an existing block, overwriting its index and fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* starts at 1, so id 0 is never handed out */
};
1482
/* Hash functor for Temp: ids are handed out sequentially by
 * Program::allocateId(), so the id itself serves as the hash. */
struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
/* Set of temporaries, e.g. for live-out sets. */
using TempSet = std::unordered_set<Temp, TempHash>;
1489
/* Result of liveness analysis (produced by live_var_analysis()). */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1496
/* Instruction selection: translate the NIR shader(s) into the program's CFG. */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* Compilation passes, roughly in pipeline order. */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
/* IR validation; validate_ra additionally checks the register assignment. */
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
/* NOTE(review): this macro takes a leading "program" argument that the
 * debug declaration above does not, so a call site can only match one of
 * the two arities. One of the two signatures looks stale — confirm
 * against the callers (NDEBUG builds would fail to compile otherwise). */
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

/* Shader statistics collection (see Program::collect_statistics). */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1540
/* Debug printing. */
void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1559
/* Static per-opcode information; every table is indexed by aco_opcode.
 * Hardware encodings differ between generations, hence the separate
 * per-generation opcode tables (NOTE(review): presumably a negative
 * entry marks an opcode unavailable on that generation — confirm
 * against the generated tables). */
typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* Defined in a separate translation unit. */
extern const Info instr_info;
1572
1573 }
1574
1575 #endif /* ACO_IR_H */
1576