aco: Increase barrier_count to 7 to include barrier_barrier.
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
46 extern uint64_t debug_flags;
47
/* Bits tested against the global `debug_flags` bitmask to enable extra
 * validation passes and performance warnings during compilation. */
enum {
   DEBUG_VALIDATE = 0x1,     /* validate the IR after each pass */
   DEBUG_VALIDATE_RA = 0x2,  /* validate the result of register allocation */
   DEBUG_PERFWARN = 0x4,     /* emit performance warnings */
};
53
54 /**
55 * Representation of the instruction's microcode encoding format
56 * Note: Some Vector ALU Formats can be combined, such that:
57 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
58 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
59 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
60 *
61 * (*) The same is applicable for VOP1 and VOPC instructions.
62 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats.
    * These use distinct high bits so that VOP1/VOP2/VOPC can be combined
    * with VOP3A, DPP or SDWA via bitwise-or (see the comment above and
    * asVOP3()/asSDWA()). */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B share an encoding bit; they are distinguished by opcode. */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kinds of memory operations an instruction
 * interacts with; used by the scheduler/insert_waitcnt to limit reordering. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* number of barrier bits defined above (bits 0..6) */
   barrier_count = 7,
};
121
/* Float rounding modes; values match the hardware MODE register encoding. */
enum fp_round {
   fp_round_ne = 0,  /* round to nearest even */
   fp_round_pi = 1,  /* round toward +infinity */
   fp_round_ni = 2,  /* round toward -infinity */
   fp_round_tz = 3,  /* round toward zero */
};
128
/* Denormal handling modes; values match the hardware MODE register encoding. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
135
/* Per-program (or per-instruction-range) floating point mode. The low byte
 * mirrors the hardware MODE register; the extra bools record constraints the
 * optimizer must respect even though they are not encoded in MODE. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must be identical, and the current
    * mode must be at least as restrictive in every optimization constraint. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
168
169 constexpr Format asVOP3(Format format) {
170 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
171 };
172
/* Returns the given format with the SDWA bit set. Only VOP1/VOP2/VOPC can
 * carry sub-dword addressing, hence the assert. */
constexpr Format asSDWA(Format format) {
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
}
177
/* Kind of register file a temporary lives in. */
enum class RegType {
   none = 0,
   sgpr,         /* scalar GPR */
   vgpr,         /* vector GPR */
   linear_vgpr,  /* VGPR with linear (non-logical) live range, e.g. for WWM */
};
184
/* Register class: encodes size and type of a temporary in a single byte.
 * Encoding of RC: bits 0-4 = size (in dwords, or in bytes if bit 7 is set),
 * bit 5 = vgpr, bit 6 = linear, bit 7 = sub-dword. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* sgprs are always <= RC::s16; everything above has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* size field counts bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* sgprs always have linear live ranges; vgprs only if the linear bit is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

private:
   RC rc;
};
236
/* transitional helper expressions: shorthand constants so code can write
 * e.g. `v1` instead of `RegClass(RegClass::v1)` */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
258
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 * Packed into 32 bits: 24-bit id + 8-bit register class.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* comparison is by SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;
   uint32_t reg_class : 8;
};
286
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * Stored with byte granularity: reg_b = register index * 4 + byte offset,
 * so sub-dword locations can be addressed.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   /* implicit conversion yields the dword register index, not reg_b */
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }

   uint16_t reg_b = 0;
};
304
/* helper expressions for special registers (hardware SGPR encodings) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
316
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 *
 * Constants use the hardware inline-constant encoding where possible
 * (PhysReg 128..254); PhysReg 255 marks a literal that must be emitted
 * as an extra dword. PhysReg 128 with isUndef_ marks an undefined operand.
 */
class Operand final
{
public:
   /* default: undefined operand */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         /* id 0 is not a real temporary: treat as undefined */
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant; maps to the hardware inline-constant register when the
    * value has one (small ints, +/-0.5/1/2/4), otherwise to literal (255) */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant; only values with a hardware inline encoding are valid,
    * since 64-bit literals cannot be emitted */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* undefined operand of a given register class */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* fixed physical register (no temporary), e.g. exec or vcc */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a constant that has no inline encoding and must be emitted as a dword */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Reconstructs the 64-bit value of a constant. For 64-bit constants the
    * value is decoded from the inline-constant register; for 32-bit constants
    * the stored dword is optionally sign-extended. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
630
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* fixed-register definition without a temporary (id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 marks "no temporary" (see the fixed-register constructors) */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* a hint shares reg_ with the fixed register: setting one overwrites
    * the other's register value */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
740
741 class Block;
742
/* Base class of all IR instructions. Concrete encodings derive from this and
 * add format-specific fields; operands/definitions are spans into trailing
 * storage owned by the allocation. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch bits whose meaning is pass-specific */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* true for any VALU encoding, including VOP1/2/C promoted to VOP3 and
    * VOP3P (the VOP1/VOP2/VOPC/VOP3 format bits are combinable flags) */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory only; FLAT/GLOBAL/SCRATCH are tested separately */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16);
810
/* Scalar ALU with a 16-bit immediate (SOPK encoding). */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4);
816
/* Scalar program control (SOPP encoding); `block` holds a branch-target
 * block index where applicable. */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8);
822
/* Scalar compare (SOPC encoding); no extra fields. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0);
826
/* Scalar one-operand ALU (SOP1 encoding); no extra fields. */
struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0);
830
/* Scalar two-operand ALU (SOP2 encoding); no extra fields. */
struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0);
834
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   barrier_interaction barrier;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1;
   bool disable_wqm : 1;
   uint32_t padding: 19;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4);
856
/* Vector one-operand ALU (VOP1 encoding); no extra fields. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0);
860
/* Vector two-operand ALU (VOP2 encoding); no extra fields. */
struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0);
864
/* Vector compare (VOPC encoding); no extra fields. */
struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0);
868
/* Three-address VALU encoding with source/destination modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];      /* per-source absolute-value modifier */
   bool neg[3];      /* per-source negate modifier */
   uint8_t opsel : 4;
   uint8_t omod : 2; /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8);
878
/* Packed-math VALU encoding (VOP3P) with per-half modifiers. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8);
888
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;        /* hardware DPP control word (swizzle selection) */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8);
905
/* Sub-dword addressing selectors. The low bits select the byte/word, the
 * flag bits describe width, signedness and register-allocated selection;
 * combine masks and flags as documented below. */
enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
941
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];   /* per-source sdwa_sel */
   uint8_t dst_sel;
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8);
963
/* Vector parameter interpolation (VINTRP encoding). */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4);
970
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0;
   int8_t offset1;
   bool gds;     /* use GDS instead of LDS */
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4);
986
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 2;
   barrier_interaction barrier;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4);
1011
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   barrier_interaction barrier;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint32_t padding : 25;
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8);
1036
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* Force data to be un-normalized */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4);
1065
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4);
1086
/* Export instruction (EXP encoding): writes lane data to an export target. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* per-component write-enable mask */
   uint8_t dest; /* export target — presumably MRT/position/parameter index, TODO confirm */
   bool compressed : 1; /* data is 16-bit compressed (two components per VGPR) — TODO confirm */
   bool done : 1; /* last export of this type from the wave */
   bool valid_mask : 1;
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4);
1096
/* Generic pseudo instruction: no hardware encoding; presumably lowered by
 * lower_to_hw_instr() before assembly — TODO confirm. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* whether lowering needs/clobbers SCC as a temporary — TODO confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4);
1103
/* Pseudo branch: records CFG targets by block index until lowered to real jumps. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8);
1112
/* Barrier pseudo instruction. Carries no extra fields; the specific barrier
 * semantics are presumably implied by the opcode — TODO confirm. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0);
1116
/* Subgroup reduction operations. Each operation comes in a 32-bit and a
 * 64-bit variant (prefix: i = signed int, u = unsigned int, f = float). */
enum ReduceOp : uint16_t {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction: presumably reuses this machinery for wave64 bpermute on GFX10 — TODO confirm */
};
1133
1134 /**
1135 * Subgroup Reduction Instructions, everything except for the data to be
1136 * reduced and the result as inserted by setup_reduce_temp().
1137 * Operand(0): data to be reduced
1138 * Operand(1): reduce temporary
1139 * Operand(2): vector temporary
1140 * Definition(0): result
1141 * Definition(1): scalar temporary
1142 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1143 * Definition(3): scc clobber
1144 * Definition(4): vcc clobber
1145 *
1146 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op; /* which reduction to perform (see ReduceOp) */
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4);
1152
/* Deleter for instructions: create_instruction() below obtains storage with
 * calloc(), so the owning smart pointer must release it with free() instead
 * of delete. */
struct instr_deleter_functor {
   void operator()(void* ptr) {
      free(ptr);
   }
};

/* Owning pointer for calloc-allocated instruction objects. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1161
/* Allocates and zero-initializes an instruction of type T together with its
 * operand and definition arrays in one contiguous calloc'd buffer:
 *   [ T | num_operands * Operand | num_definitions * Definition ]
 * The result must be freed with free() (see instr_deleter_functor / aco_ptr),
 * so T must not require a destructor call. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size); /* zero-init: all flags/padding start cleared */
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* The spans store 16-bit offsets relative to the span members themselves
    * (presumably the aco::span convention — see aco_util.h). Operands start
    * right after T; definitions start right after the operands. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1179
1180 constexpr bool Instruction::usesModifiers() const noexcept
1181 {
1182 if (isDPP() || isSDWA())
1183 return true;
1184
1185 if (format == Format::VOP3P) {
1186 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1187 for (unsigned i = 0; i < operands.size(); i++) {
1188 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1189 return true;
1190 }
1191 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1192 } else if (isVOP3()) {
1193 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1194 for (unsigned i = 0; i < operands.size(); i++) {
1195 if (vop3->abs[i] || vop3->neg[i])
1196 return true;
1197 }
1198 return vop3->opsel || vop3->clamp || vop3->omod;
1199 }
1200 return false;
1201 }
1202
1203 constexpr bool is_phi(Instruction* instr)
1204 {
1205 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1206 }
1207
1208 static inline bool is_phi(aco_ptr<Instruction>& instr)
1209 {
1210 return is_phi(instr.get());
1211 }
1212
1213 barrier_interaction get_barrier_interaction(Instruction* instr);
1214
1215 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1216
/* Flags describing a basic block's role in the CFG.
 * Stored as a bitmask in Block::kind (uint16_t — all 16 bits are used). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1237
1238
1239 struct RegisterDemand {
1240 constexpr RegisterDemand() = default;
1241 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1242 : vgpr{v}, sgpr{s} {}
1243 int16_t vgpr = 0;
1244 int16_t sgpr = 0;
1245
1246 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1247 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1248 }
1249
1250 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1251 return vgpr > other.vgpr || sgpr > other.sgpr;
1252 }
1253
1254 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1255 if (t.type() == RegType::sgpr)
1256 return RegisterDemand( vgpr, sgpr + t.size() );
1257 else
1258 return RegisterDemand( vgpr + t.size(), sgpr );
1259 }
1260
1261 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1262 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1263 }
1264
1265 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1266 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1267 }
1268
1269 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1270 vgpr += other.vgpr;
1271 sgpr += other.sgpr;
1272 return *this;
1273 }
1274
1275 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1276 vgpr -= other.vgpr;
1277 sgpr -= other.sgpr;
1278 return *this;
1279 }
1280
1281 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1282 if (t.type() == RegType::sgpr)
1283 sgpr += t.size();
1284 else
1285 vgpr += t.size();
1286 return *this;
1287 }
1288
1289 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1290 if (t.type() == RegType::sgpr)
1291 sgpr -= t.size();
1292 else
1293 vgpr -= t.size();
1294 return *this;
1295 }
1296
1297 constexpr void update(const RegisterDemand other) noexcept {
1298 vgpr = std::max(vgpr, other.vgpr);
1299 sgpr = std::max(sgpr, other.sgpr);
1300 }
1301
1302 };
1303
1304 /* CFG */
struct Block {
   float_mode fp_mode; /* copied from Program::next_fp_mode when the block is created */
   unsigned index; /* position of this block in Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* Two CFGs are maintained: presumably "logical" follows source-level
    * control flow while "linear" includes lowering-introduced blocks
    * (e.g. invert/merge blocks) — TODO confirm. */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* presumably the peak demand within this block — TODO confirm */
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   int logical_idom = -1; /* immediate dominator (logical CFG); -1 if not computed */
   int linear_idom = -1; /* immediate dominator (linear CFG); -1 if not computed */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1329
/* A Stage is a bitmask combining exactly one hardware stage with the software
 * stage(s) that run on it; merged shaders OR several sw bits together (see
 * the combinations below). */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f; /* covers all 7 software-stage bits */

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7; /* covers all 8 hardware-stage bits */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1374
/* Indices into Program::statistics; only filled in when
 * Program::collect_statistics is set (see collect_*_stats below). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1389
/* The top-level compilation unit: CFG, target description and compile state. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to blocks created after this is set */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* presumably the register class holding one bit per lane (depends on wave_size) — TODO confirm */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false; /* enables the statistics[] counters below */
   uint32_t statistics[num_statistics]; /* indexed by enum statistic */

   /* Returns a fresh SSA id. Ids are limited to 24 bits. */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215); /* 2^24 - 1 */
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would hand out, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new empty block (index = current block count) and returns it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Appends an existing block, re-assigning its index and fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* 0 is never handed out (reserved for "no id"? — TODO confirm) */
};
1463
1464 struct TempHash {
1465 std::size_t operator()(Temp t) const {
1466 return t.id();
1467 }
1468 };
1469 using TempSet = std::unordered_set<Temp, TempHash>;
1470
/* Result of liveness analysis (see live_var_analysis below). */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1477
1478 void select_program(Program *program,
1479 unsigned shader_count,
1480 struct nir_shader *const *shaders,
1481 ac_shader_config* config,
1482 struct radv_shader_args *args);
1483 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1484 ac_shader_config* config,
1485 struct radv_shader_args *args);
1486
1487 void lower_wqm(Program* program, live& live_vars,
1488 const struct radv_nir_compiler_options *options);
1489 void lower_phis(Program* program);
1490 void calc_min_waves(Program* program);
1491 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1492 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1493 std::vector<uint16_t> dead_code_analysis(Program *program);
1494 void dominator_tree(Program* program);
1495 void insert_exec_mask(Program *program);
1496 void value_numbering(Program* program);
1497 void optimize(Program* program);
1498 void setup_reduce_temp(Program* program);
1499 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1500 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1501 void ssa_elimination(Program* program);
1502 void lower_to_hw_instr(Program* program);
1503 void schedule_program(Program* program, live& live_vars);
1504 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1505 void insert_wait_states(Program* program);
1506 void insert_NOPs(Program* program);
1507 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1508 void print_asm(Program *program, std::vector<uint32_t>& binary,
1509 unsigned exec_size, std::ostream& out);
1510 void validate(Program* program, FILE *output);
1511 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1512 #ifndef NDEBUG
1513 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1514 #else
1515 #define perfwarn(program, cond, msg, ...) do {} while(0)
1516 #endif
1517
1518 void collect_presched_stats(Program *program);
1519 void collect_preasm_stats(Program *program);
1520 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1521
1522 void aco_print_instr(Instruction *instr, FILE *output);
1523 void aco_print_program(Program *program, FILE *output);
1524
1525 /* utilities for dealing with register demand */
1526 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1527 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1528 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1529
1530 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1531 uint16_t get_extra_sgprs(Program *program);
1532
1533 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1534 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1535 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1536
1537 /* return number of addressable sgprs/vgprs for max_waves */
1538 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1539 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1540
/* Static per-opcode information tables, indexed by aco_opcode. */
typedef struct {
   /* hardware encoding per chip generation; presumably a negative value marks
    * an opcode unsupported on that generation — TODO confirm */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;
1551
1552 extern const Info instr_info;
1553
1554 }
1555
1556 #endif /* ACO_IR_H */
1557