1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
47 enum {
48 DEBUG_VALIDATE = 0x1,
49 DEBUG_VALIDATE_RA = 0x2,
50 DEBUG_PERFWARN = 0x4,
51 };
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
62 enum class Format : std::uint16_t {
63 /* Pseudo Instruction Format */
64 PSEUDO = 0,
65 /* Scalar ALU & Control Formats */
66 SOP1 = 1,
67 SOP2 = 2,
68 SOPK = 3,
69 SOPP = 4,
70 SOPC = 5,
71 /* Scalar Memory Format */
72 SMEM = 6,
73 /* LDS/GDS Format */
74 DS = 8,
75 /* Vector Memory Buffer Formats */
76 MTBUF = 9,
77 MUBUF = 10,
78 /* Vector Memory Image Format */
79 MIMG = 11,
80 /* Export Format */
81 EXP = 12,
82 /* Flat Formats */
83 FLAT = 13,
84 GLOBAL = 14,
85 SCRATCH = 15,
86
87 PSEUDO_BRANCH = 16,
88 PSEUDO_BARRIER = 17,
89 PSEUDO_REDUCTION = 18,
90
91 /* Vector ALU Formats */
92 VOP1 = 1 << 8,
93 VOP2 = 1 << 9,
94 VOPC = 1 << 10,
95 VOP3 = 1 << 11,
96 VOP3A = 1 << 11,
97 VOP3B = 1 << 11,
98 VOP3P = 1 << 12,
99 /* Vector Parameter Interpolation Format */
100 VINTRP = 1 << 13,
101 DPP = 1 << 14,
102 SDWA = 1 << 15,
103 };
104
105 enum barrier_interaction : uint8_t {
106 barrier_none = 0,
107 barrier_buffer = 0x1,
108 barrier_image = 0x2,
109 barrier_atomic = 0x4,
110 barrier_shared = 0x8,
111 /* used for geometry shaders to ensure vertex data writes finish before the
112 * GS_DONE s_sendmsg. */
113 barrier_gs_data = 0x10,
114 /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
115 barrier_gs_sendmsg = 0x20,
116 /* used by barriers. created by s_barrier */
117 barrier_barrier = 0x40,
118 barrier_count = 7,
119 };
120
121 enum fp_round {
122 fp_round_ne = 0,
123 fp_round_pi = 1,
124 fp_round_ni = 2,
125 fp_round_tz = 3,
126 };
127
128 enum fp_denorm {
129 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
130 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
131 fp_denorm_flush = 0x0,
132 fp_denorm_keep = 0x3,
133 };
134
135 struct float_mode {
136 /* matches encoding of the MODE register */
137 union {
138 struct {
139 fp_round round32:2;
140 fp_round round16_64:2;
141 unsigned denorm32:2;
142 unsigned denorm16_64:2;
143 };
144 uint8_t val = 0;
145 };
146 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
147 bool preserve_signed_zero_inf_nan32:1;
148 bool preserve_signed_zero_inf_nan16_64:1;
149 /* if false, optimizations which may remove denormal flushing can be done */
150 bool must_flush_denorms32:1;
151 bool must_flush_denorms16_64:1;
152 bool care_about_round32:1;
153 bool care_about_round16_64:1;
154
155 /* Returns true if instructions using the mode "other" can safely use the
156 * current one instead. */
157 bool canReplace(float_mode other) const noexcept {
158 return val == other.val &&
159 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
160 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
161 (must_flush_denorms32 || !other.must_flush_denorms32) &&
162 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
163 (care_about_round32 || !other.care_about_round32) &&
164 (care_about_round16_64 || !other.care_about_round16_64);
165 }
166 };
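/* Example (illustrative): canReplace() is asymmetric. With identical MODE
 * register bits, a mode that additionally preserves signed zero/inf/nan
 * (or denorm flushing, or rounding) can replace one that does not require
 * it, but not the other way around:
 *
 *   // assuming a.val == b.val, a.preserve_signed_zero_inf_nan32 == true,
 *   // b.preserve_signed_zero_inf_nan32 == false, all other flags equal:
 *   a.canReplace(b); // true  - b's instructions are still correct under a
 *   b.canReplace(a); // false - a's instructions need the stricter mode
 */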
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
171
172 constexpr Format asSDWA(Format format) {
173 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
174 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
175 }
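/* Example (illustrative): a VOP2 instruction that needs VOP3-only modifiers
 * (abs/neg/omod/clamp) keeps its VOP2 bit and gains the VOP3A bit, so both
 * format tests still succeed on the combined value:
 *
 *   Format f = asVOP3(Format::VOP2);
 *   // (uint16_t)f & (uint16_t)Format::VOP2  -> non-zero
 *   // (uint16_t)f & (uint16_t)Format::VOP3A -> non-zero
 */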
176
177 enum class RegType {
178 none = 0,
179 sgpr,
180 vgpr,
181 linear_vgpr,
182 };
183
184 struct RegClass {
185
186 enum RC : uint8_t {
187 s1 = 1,
188 s2 = 2,
189 s3 = 3,
190 s4 = 4,
191 s6 = 6,
192 s8 = 8,
193 s16 = 16,
194 v1 = s1 | (1 << 5),
195 v2 = s2 | (1 << 5),
196 v3 = s3 | (1 << 5),
197 v4 = s4 | (1 << 5),
198 v5 = 5 | (1 << 5),
199 v6 = 6 | (1 << 5),
200 v7 = 7 | (1 << 5),
201 v8 = 8 | (1 << 5),
202 /* byte-sized register class */
203 v1b = v1 | (1 << 7),
204 v2b = v2 | (1 << 7),
205 v3b = v3 | (1 << 7),
206 v4b = v4 | (1 << 7),
207 v6b = v6 | (1 << 7),
208 v8b = v8 | (1 << 7),
209 /* these are used for WWM and spills to vgpr */
210 v1_linear = v1 | (1 << 6),
211 v2_linear = v2 | (1 << 6),
212 };
213
214 RegClass() = default;
215 constexpr RegClass(RC rc)
216 : rc(rc) {}
217 constexpr RegClass(RegType type, unsigned size)
218 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
219
220 constexpr operator RC() const { return rc; }
221 explicit operator bool() = delete;
222
223 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
224 constexpr bool is_subdword() const { return rc & (1 << 7); }
225 constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
226 //TODO: use size() less in favor of bytes()
227 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
228 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
229 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
230
231 private:
232 RC rc;
233 };
234
235 /* transitional helper expressions */
236 static constexpr RegClass s1{RegClass::s1};
237 static constexpr RegClass s2{RegClass::s2};
238 static constexpr RegClass s3{RegClass::s3};
239 static constexpr RegClass s4{RegClass::s4};
240 static constexpr RegClass s8{RegClass::s8};
241 static constexpr RegClass s16{RegClass::s16};
242 static constexpr RegClass v1{RegClass::v1};
243 static constexpr RegClass v2{RegClass::v2};
244 static constexpr RegClass v3{RegClass::v3};
245 static constexpr RegClass v4{RegClass::v4};
246 static constexpr RegClass v5{RegClass::v5};
247 static constexpr RegClass v6{RegClass::v6};
248 static constexpr RegClass v7{RegClass::v7};
249 static constexpr RegClass v8{RegClass::v8};
250 static constexpr RegClass v1b{RegClass::v1b};
251 static constexpr RegClass v2b{RegClass::v2b};
252 static constexpr RegClass v3b{RegClass::v3b};
253 static constexpr RegClass v4b{RegClass::v4b};
254 static constexpr RegClass v6b{RegClass::v6b};
255 static constexpr RegClass v8b{RegClass::v8b};
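/* Example (illustrative): the RC encoding stores the size in the low 5 bits
 * (in dwords, or in bytes for sub-dword classes), the vgpr flag in bit 5,
 * the linear flag in bit 6 and the sub-dword flag in bit 7:
 *
 *   v2.type() == RegType::vgpr;  v2.bytes() == 8;   v2.size() == 2;
 *   v2b.is_subdword();           v2b.bytes() == 2;  v2b.size() == 1;
 *   s2.is_linear();              // all SGPR classes are linear
 *   v1.as_linear() == RegClass::v1_linear;
 */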
256
257 /**
258 * Temp Class
259 * Each temporary virtual register has a
260 * register class (i.e. size and type)
261 * and SSA id.
262 */
263 struct Temp {
264 Temp() = default;
265 constexpr Temp(uint32_t id, RegClass cls) noexcept
266 : id_(id), reg_class(cls) {}
267
268 constexpr uint32_t id() const noexcept { return id_; }
269 constexpr RegClass regClass() const noexcept { return reg_class; }
270
271 constexpr unsigned bytes() const noexcept { return reg_class.bytes(); }
272 constexpr unsigned size() const noexcept { return reg_class.size(); }
273 constexpr RegType type() const noexcept { return reg_class.type(); }
274 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
275
276 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
277 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
278 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
279
280 private:
281 uint32_t id_:24;
282 RegClass reg_class;
283 };
284
285 /**
286 * PhysReg
287 * Represents the physical register for each
288 * Operand and Definition.
289 */
290 struct PhysReg {
291 constexpr PhysReg() = default;
292 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
293 constexpr unsigned reg() const { return reg_b >> 2; }
294 constexpr unsigned byte() const { return reg_b & 0x3; }
295 constexpr operator unsigned() const { return reg(); }
296 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
297 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
298 constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
299
300 uint16_t reg_b = 0;
301 };
302
303 /* helper expressions for special registers */
304 static constexpr PhysReg m0{124};
305 static constexpr PhysReg vcc{106};
306 static constexpr PhysReg vcc_hi{107};
307 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
308 static constexpr PhysReg exec{126};
309 static constexpr PhysReg exec_lo{126};
310 static constexpr PhysReg exec_hi{127};
311 static constexpr PhysReg vccz{251};
312 static constexpr PhysReg execz{252};
313 static constexpr PhysReg scc{253};
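/* Example (illustrative): PhysReg stores a byte address (reg_b) so that
 * sub-dword operands can point into the middle of a register:
 *
 *   PhysReg r{5};                  // register 5, byte 0
 *   r.reg() == 5 && r.byte() == 0;
 *   exec_lo.reg() == 126 && exec_hi.reg() == 127;
 */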
314
315 /**
316 * Operand Class
317 * Initially, each Operand refers to either
318 * a temporary virtual register
319 * or to a constant value.
320 * Temporary registers get mapped to physical registers during RA;
321 * constant values are inlined into the instruction sequence.
322 */
323 class Operand final
324 {
325 public:
326 constexpr Operand()
327 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
328 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
329 isLateKill_(false) {}
330
331 explicit Operand(Temp r) noexcept
332 {
333 data_.temp = r;
334 if (r.id()) {
335 isTemp_ = true;
336 } else {
337 isUndef_ = true;
338 setFixed(PhysReg{128});
339 }
340 };
341 explicit Operand(uint32_t v, bool is64bit = false) noexcept
342 {
343 data_.i = v;
344 isConstant_ = true;
345 is64BitConst_ = is64bit;
346 if (v <= 64)
347 setFixed(PhysReg{128 + v});
348 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
349 setFixed(PhysReg{192 - v});
350 else if (v == 0x3f000000) /* 0.5 */
351 setFixed(PhysReg{240});
352 else if (v == 0xbf000000) /* -0.5 */
353 setFixed(PhysReg{241});
354 else if (v == 0x3f800000) /* 1.0 */
355 setFixed(PhysReg{242});
356 else if (v == 0xbf800000) /* -1.0 */
357 setFixed(PhysReg{243});
358 else if (v == 0x40000000) /* 2.0 */
359 setFixed(PhysReg{244});
360 else if (v == 0xc0000000) /* -2.0 */
361 setFixed(PhysReg{245});
362 else if (v == 0x40800000) /* 4.0 */
363 setFixed(PhysReg{246});
364 else if (v == 0xc0800000) /* -4.0 */
365 setFixed(PhysReg{247});
366 else { /* Literal Constant */
367 assert(!is64bit && "attempt to create a 64-bit literal constant");
368 setFixed(PhysReg{255});
369 }
370 };
371 explicit Operand(uint64_t v) noexcept
372 {
373 isConstant_ = true;
374 is64BitConst_ = true;
375 if (v <= 64) {
376 data_.i = (uint32_t) v;
377 setFixed(PhysReg{128 + (uint32_t) v});
378 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
379 data_.i = (uint32_t) v;
380 setFixed(PhysReg{192 - (uint32_t) v});
381 } else if (v == 0x3FE0000000000000) { /* 0.5 */
382 data_.i = 0x3f000000;
383 setFixed(PhysReg{240});
384 } else if (v == 0xBFE0000000000000) { /* -0.5 */
385 data_.i = 0xbf000000;
386 setFixed(PhysReg{241});
387 } else if (v == 0x3FF0000000000000) { /* 1.0 */
388 data_.i = 0x3f800000;
389 setFixed(PhysReg{242});
390 } else if (v == 0xBFF0000000000000) { /* -1.0 */
391 data_.i = 0xbf800000;
392 setFixed(PhysReg{243});
393 } else if (v == 0x4000000000000000) { /* 2.0 */
394 data_.i = 0x40000000;
395 setFixed(PhysReg{244});
396 } else if (v == 0xC000000000000000) { /* -2.0 */
397 data_.i = 0xc0000000;
398 setFixed(PhysReg{245});
399 } else if (v == 0x4010000000000000) { /* 4.0 */
400 data_.i = 0x40800000;
401 setFixed(PhysReg{246});
402 } else if (v == 0xC010000000000000) { /* -4.0 */
403 data_.i = 0xc0800000;
404 setFixed(PhysReg{247});
405 } else { /* Literal Constant: we don't know if it is a long or double.*/
406 isConstant_ = 0;
407 assert(false && "attempt to create a 64-bit literal constant");
408 }
409 };
410 explicit Operand(RegClass type) noexcept
411 {
412 isUndef_ = true;
413 data_.temp = Temp(0, type);
414 setFixed(PhysReg{128});
415 };
416 explicit Operand(PhysReg reg, RegClass type) noexcept
417 {
418 data_.temp = Temp(0, type);
419 setFixed(reg);
420 }
421
422 constexpr bool isTemp() const noexcept
423 {
424 return isTemp_;
425 }
426
427 constexpr void setTemp(Temp t) noexcept {
428 assert(!isConstant_);
429 isTemp_ = true;
430 data_.temp = t;
431 }
432
433 constexpr Temp getTemp() const noexcept
434 {
435 return data_.temp;
436 }
437
438 constexpr uint32_t tempId() const noexcept
439 {
440 return data_.temp.id();
441 }
442
443 constexpr bool hasRegClass() const noexcept
444 {
445 return isTemp() || isUndefined();
446 }
447
448 constexpr RegClass regClass() const noexcept
449 {
450 return data_.temp.regClass();
451 }
452
453 constexpr unsigned bytes() const noexcept
454 {
455 if (isConstant())
456 return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
457 else
458 return data_.temp.bytes();
459 }
460
461 constexpr unsigned size() const noexcept
462 {
463 if (isConstant())
464 return is64BitConst_ ? 2 : 1;
465 else
466 return data_.temp.size();
467 }
468
469 constexpr bool isFixed() const noexcept
470 {
471 return isFixed_;
472 }
473
474 constexpr PhysReg physReg() const noexcept
475 {
476 return reg_;
477 }
478
479 constexpr void setFixed(PhysReg reg) noexcept
480 {
481 isFixed_ = reg != unsigned(-1);
482 reg_ = reg;
483 }
484
485 constexpr bool isConstant() const noexcept
486 {
487 return isConstant_;
488 }
489
490 constexpr bool isLiteral() const noexcept
491 {
492 return isConstant() && reg_ == 255;
493 }
494
495 constexpr bool isUndefined() const noexcept
496 {
497 return isUndef_;
498 }
499
500 constexpr uint32_t constantValue() const noexcept
501 {
502 return data_.i;
503 }
504
505 constexpr bool constantEquals(uint32_t cmp) const noexcept
506 {
507 return isConstant() && constantValue() == cmp;
508 }
509
510 constexpr uint64_t constantValue64(bool signext=false) const noexcept
511 {
512 if (is64BitConst_) {
513 if (reg_ <= 192)
514 return reg_ - 128;
515 else if (reg_ <= 208)
516 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
517
518 switch (reg_) {
519 case 240:
520 return 0x3FE0000000000000;
521 case 241:
522 return 0xBFE0000000000000;
523 case 242:
524 return 0x3FF0000000000000;
525 case 243:
526 return 0xBFF0000000000000;
527 case 244:
528 return 0x4000000000000000;
529 case 245:
530 return 0xC000000000000000;
531 case 246:
532 return 0x4010000000000000;
533 case 247:
534 return 0xC010000000000000;
535 }
536 }
537 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
538 }
539
540 /* Indicates that the killed operand's live range intersects with the
541 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
542 * not set by liveness analysis. */
543 constexpr void setLateKill(bool flag) noexcept
544 {
545 isLateKill_ = flag;
546 }
547
548 constexpr bool isLateKill() const noexcept
549 {
550 return isLateKill_;
551 }
552
553 constexpr void setKill(bool flag) noexcept
554 {
555 isKill_ = flag;
556 if (!flag)
557 setFirstKill(false);
558 }
559
560 constexpr bool isKill() const noexcept
561 {
562 return isKill_ || isFirstKill();
563 }
564
565 constexpr void setFirstKill(bool flag) noexcept
566 {
567 isFirstKill_ = flag;
568 if (flag)
569 setKill(flag);
570 }
571
572 /* When there are multiple operands killing the same temporary,
573 * isFirstKill() only returns true for the first one. */
574 constexpr bool isFirstKill() const noexcept
575 {
576 return isFirstKill_;
577 }
578
579 constexpr bool isKillBeforeDef() const noexcept
580 {
581 return isKill() && !isLateKill();
582 }
583
584 constexpr bool isFirstKillBeforeDef() const noexcept
585 {
586 return isFirstKill() && !isLateKill();
587 }
588
589 constexpr bool operator == (Operand other) const noexcept
590 {
591 if (other.size() != size())
592 return false;
593 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
594 return false;
595 if (isFixed() && other.isFixed() && physReg() != other.physReg())
596 return false;
597 if (isLiteral())
598 return other.isLiteral() && other.constantValue() == constantValue();
599 else if (isConstant())
600 return other.isConstant() && other.physReg() == physReg();
601 else if (isUndefined())
602 return other.isUndefined() && other.regClass() == regClass();
603 else
604 return other.isTemp() && other.getTemp() == getTemp();
605 }
606 private:
607 union {
608 uint32_t i;
609 float f;
610 Temp temp = Temp(0, s1);
611 } data_;
612 PhysReg reg_;
613 union {
614 struct {
615 uint8_t isTemp_:1;
616 uint8_t isFixed_:1;
617 uint8_t isConstant_:1;
618 uint8_t isKill_:1;
619 uint8_t isUndef_:1;
620 uint8_t isFirstKill_:1;
621 uint8_t is64BitConst_:1;
622 uint8_t isLateKill_:1;
623 };
624 /* can't initialize bit-fields in c++11, so work around using a union */
625 uint8_t control_ = 0;
626 };
627 };
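/* Examples (illustrative) of how Operands end up encoded:
 *
 *   Operand a(Temp(17, v1));   // temporary: isTemp(), not fixed yet
 *   Operand b(4u);             // inline constant, fixed to PhysReg{128 + 4}
 *   Operand c(0x3f800000u);    // 1.0f, fixed to inline constant 242
 *   Operand d(0x12345678u);    // no inline encoding: literal, PhysReg{255}
 *   Operand e(v1);             // undefined operand of class v1
 *
 *   b.isConstant() && !b.isLiteral();
 *   d.isLiteral();
 *   e.isUndefined() && e.regClass() == v1;
 */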
628
629 /**
630 * Definition Class
631 * Definitions are the results of Instructions
632 * and refer to temporary virtual registers
633 * which are later mapped to physical registers
634 */
635 class Definition final
636 {
637 public:
638 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
639 Definition(uint32_t index, RegClass type) noexcept
640 : temp(index, type) {}
641 explicit Definition(Temp tmp) noexcept
642 : temp(tmp) {}
643 Definition(PhysReg reg, RegClass type) noexcept
644 : temp(Temp(0, type))
645 {
646 setFixed(reg);
647 }
648 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
649 : temp(Temp(tmpId, type))
650 {
651 setFixed(reg);
652 }
653
654 constexpr bool isTemp() const noexcept
655 {
656 return tempId() > 0;
657 }
658
659 constexpr Temp getTemp() const noexcept
660 {
661 return temp;
662 }
663
664 constexpr uint32_t tempId() const noexcept
665 {
666 return temp.id();
667 }
668
669 constexpr void setTemp(Temp t) noexcept {
670 temp = t;
671 }
672
673 constexpr RegClass regClass() const noexcept
674 {
675 return temp.regClass();
676 }
677
678 constexpr unsigned bytes() const noexcept
679 {
680 return temp.bytes();
681 }
682
683 constexpr unsigned size() const noexcept
684 {
685 return temp.size();
686 }
687
688 constexpr bool isFixed() const noexcept
689 {
690 return isFixed_;
691 }
692
693 constexpr PhysReg physReg() const noexcept
694 {
695 return reg_;
696 }
697
698 constexpr void setFixed(PhysReg reg) noexcept
699 {
700 isFixed_ = 1;
701 reg_ = reg;
702 }
703
704 constexpr void setHint(PhysReg reg) noexcept
705 {
706 hasHint_ = 1;
707 reg_ = reg;
708 }
709
710 constexpr bool hasHint() const noexcept
711 {
712 return hasHint_;
713 }
714
715 constexpr void setKill(bool flag) noexcept
716 {
717 isKill_ = flag;
718 }
719
720 constexpr bool isKill() const noexcept
721 {
722 return isKill_;
723 }
724
725 private:
726 Temp temp = Temp(0, s1);
727 PhysReg reg_;
728 union {
729 struct {
730 uint8_t isFixed_:1;
731 uint8_t hasHint_:1;
732 uint8_t isKill_:1;
733 };
734 /* can't initialize bit-fields in c++11, so work around using a union */
735 uint8_t control_ = 0;
736 };
737 };
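/* Example (illustrative): a Definition usually holds a fresh temporary and
 * gets fixed to a physical register by RA, but it can also be pre-colored:
 *
 *   Definition def(Temp(42, s2));   // SSA result, not fixed yet
 *   Definition fixed(vcc, s2);      // result forced into VCC
 *   fixed.isFixed() && fixed.physReg() == vcc;
 */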
738
739 class Block;
740
741 struct Instruction {
742 aco_opcode opcode;
743 Format format;
744 uint32_t pass_flags;
745
746 aco::span<Operand> operands;
747 aco::span<Definition> definitions;
748
749 constexpr bool isVALU() const noexcept
750 {
751 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
752 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
753 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
754 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
755 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
756 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
757 }
758
759 constexpr bool isSALU() const noexcept
760 {
761 return format == Format::SOP1 ||
762 format == Format::SOP2 ||
763 format == Format::SOPC ||
764 format == Format::SOPK ||
765 format == Format::SOPP;
766 }
767
768 constexpr bool isVMEM() const noexcept
769 {
770 return format == Format::MTBUF ||
771 format == Format::MUBUF ||
772 format == Format::MIMG;
773 }
774
775 constexpr bool isDPP() const noexcept
776 {
777 return (uint16_t) format & (uint16_t) Format::DPP;
778 }
779
780 constexpr bool isVOP3() const noexcept
781 {
782 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
783 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
784 format == Format::VOP3P;
785 }
786
787 constexpr bool isSDWA() const noexcept
788 {
789 return (uint16_t) format & (uint16_t) Format::SDWA;
790 }
791
792 constexpr bool isFlatOrGlobal() const noexcept
793 {
794 return format == Format::FLAT || format == Format::GLOBAL;
795 }
796
797 constexpr bool usesModifiers() const noexcept;
798
799 constexpr bool reads_exec() const noexcept
800 {
801 for (const Operand& op : operands) {
802 if (op.isFixed() && op.physReg() == exec)
803 return true;
804 }
805 return false;
806 }
807 };
808
809 struct SOPK_instruction : public Instruction {
810 uint16_t imm;
811 };
812
813 struct SOPP_instruction : public Instruction {
814 uint32_t imm;
815 int block;
816 };
817
818 struct SOPC_instruction : public Instruction {
819 };
820
821 struct SOP1_instruction : public Instruction {
822 };
823
824 struct SOP2_instruction : public Instruction {
825 };
826
827 /**
828 * Scalar Memory Format:
829 * For s_(buffer_)load_dword*:
830 * Operand(0): SBASE - SGPR-pair which provides base address
831 * Operand(1): Offset - immediate (un)signed offset or SGPR
832 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
833 * Operand(n-1): SOffset - SGPR offset (Vega only)
834 *
835 * Having no operands is also valid for instructions such as s_dcache_inv.
836 *
837 */
838 struct SMEM_instruction : public Instruction {
839 bool glc : 1; /* VI+: globally coherent */
840 bool dlc : 1; /* NAVI: device level coherent */
841 bool nv : 1; /* VEGA only: Non-volatile */
842 bool can_reorder : 1;
843 bool disable_wqm : 1;
844 barrier_interaction barrier;
845 };
846
847 struct VOP1_instruction : public Instruction {
848 };
849
850 struct VOP2_instruction : public Instruction {
851 };
852
853 struct VOPC_instruction : public Instruction {
854 };
855
856 struct VOP3A_instruction : public Instruction {
857 bool abs[3];
858 bool neg[3];
859 uint8_t opsel : 4;
860 uint8_t omod : 2;
861 bool clamp : 1;
862 };
863
864 /**
865 * Data Parallel Primitives Format:
866 * This format can be used for VOP1, VOP2 or VOPC instructions.
867 * The swizzle applies to the src0 operand.
868 *
869 */
870 struct DPP_instruction : public Instruction {
871 bool abs[2];
872 bool neg[2];
873 uint16_t dpp_ctrl;
874 uint8_t row_mask : 4;
875 uint8_t bank_mask : 4;
876 bool bound_ctrl : 1;
877 };
878
879 enum sdwa_sel : uint8_t {
880 /* masks */
881 sdwa_wordnum = 0x1,
882 sdwa_bytenum = 0x3,
883 sdwa_asuint = 0x7 | 0x10,
884 sdwa_rasize = 0x3,
885
886 /* flags */
887 sdwa_isword = 0x4,
888 sdwa_sext = 0x8,
889 sdwa_isra = 0x10,
890
891 /* specific values */
892 sdwa_ubyte0 = 0,
893 sdwa_ubyte1 = 1,
894 sdwa_ubyte2 = 2,
895 sdwa_ubyte3 = 3,
896 sdwa_uword0 = sdwa_isword | 0,
897 sdwa_uword1 = sdwa_isword | 1,
898 sdwa_udword = 6,
899
900 sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
901 sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
902 sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
903 sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
904 sdwa_sword0 = sdwa_uword0 | sdwa_sext,
905 sdwa_sword1 = sdwa_uword1 | sdwa_sext,
906 sdwa_sdword = sdwa_udword | sdwa_sext,
907
908 /* register-allocated */
909 sdwa_ubyte = 1 | sdwa_isra,
910 sdwa_uword = 2 | sdwa_isra,
911 sdwa_sbyte = sdwa_ubyte | sdwa_sext,
912 sdwa_sword = sdwa_uword | sdwa_sext,
913 };
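/* Example (illustrative): decoding an sdwa_sel value with the masks above:
 *
 *   sdwa_sel sel = sdwa_uword1;            // sdwa_isword | 1
 *   bool is_word = sel & sdwa_isword;      // true
 *   unsigned word = sel & sdwa_wordnum;    // 1 -> selects bits [31:16]
 *   bool sext = sdwa_sword1 & sdwa_sext;   // signed variants set the sext bit
 */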
914
915 /**
916 * Sub-Dword Addressing Format:
917 * This format can be used for VOP1, VOP2 or VOPC instructions.
918 *
919 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
920 * the definition doesn't have to be VCC on GFX9+.
921 *
922 */
923 struct SDWA_instruction : public Instruction {
924 /* these destination modifiers aren't available with VOPC except for
925 * clamp on GFX8 */
926 unsigned dst_sel:8;
927 bool dst_preserve:1;
928 bool clamp:1;
929 unsigned omod:2; /* GFX9+ */
930
931 unsigned sel[2];
932 bool neg[2];
933 bool abs[2];
934 };
935
936 struct Interp_instruction : public Instruction {
937 uint8_t attribute;
938 uint8_t component;
939 };
940
941 /**
942 * Local and Global Data Sharing instructions
943 * Operand(0): ADDR - VGPR which supplies the address.
944 * Operand(1): DATA0 - First data VGPR.
945 * Operand(2): DATA1 - Second data VGPR.
946 * Operand(n-1): M0 - LDS size.
947 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
948 *
949 */
950 struct DS_instruction : public Instruction {
951 int16_t offset0;
952 int8_t offset1;
953 bool gds;
954 };
955
956 /**
957 * Vector Memory Untyped-buffer Instructions
958 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
959 * Operand(1): VADDR - Address source. Can carry an index and/or offset
960 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
961 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
962 *
963 */
964 struct MUBUF_instruction : public Instruction {
965 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
966 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
967 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
968 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
969 bool glc : 1; /* globally coherent */
970 bool dlc : 1; /* NAVI: device level coherent */
971 bool slc : 1; /* system level coherent */
972 bool tfe : 1; /* texture fail enable */
973 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
974 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
975 bool can_reorder : 1;
976 barrier_interaction barrier;
977 };
978
979 /**
980 * Vector Memory Typed-buffer Instructions
981 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
982 * Operand(1): VADDR - Address source. Can carry an index and/or offset
983 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
984 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
985 *
986 */
987 struct MTBUF_instruction : public Instruction {
988 uint16_t offset; /* Unsigned byte offset - 12 bit */
989 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
990 uint8_t nfmt : 3; /* Numeric format of data in memory */
991 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
992 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
993 bool glc : 1; /* globally coherent */
994 bool dlc : 1; /* NAVI: device level coherent */
995 bool slc : 1; /* system level coherent */
996 bool tfe : 1; /* texture fail enable */
997 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
998 bool can_reorder : 1;
999 barrier_interaction barrier;
1000 };
1001
1002 /**
1003 * Vector Memory Image Instructions
1004 * Operand(0): SRSRC - Scalar GPR that specifies the resource constant.
1005 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1006 * or VDATA - Vector GPR for write data.
1007 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1008 * Definition(0): VDATA - Vector GPR for read result.
1009 *
1010 */
1011 struct MIMG_instruction : public Instruction {
1012 uint8_t dmask; /* Data VGPR enable mask */
1013 uint8_t dim : 3; /* NAVI: dimensionality */
1014 bool unrm : 1; /* Force address to be un-normalized */
1015 bool dlc : 1; /* NAVI: device level coherent */
1016 bool glc : 1; /* globally coherent */
1017 bool slc : 1; /* system level coherent */
1018 bool tfe : 1; /* texture fail enable */
1019 bool da : 1; /* declare an array */
1020 bool lwe : 1; /* LOD warning enable */
1021 bool r128 : 1; /* NAVI: Texture resource size */
1022 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1023 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1024 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1025 bool can_reorder : 1;
1026 barrier_interaction barrier;
1027 };
1028
1029 /**
1030 * Flat/Scratch/Global Instructions
1031 * Operand(0): ADDR
1032 * Operand(1): SADDR
1033 * Operand(2) / Definition(0): DATA/VDST
1034 *
1035 */
1036 struct FLAT_instruction : public Instruction {
1037 uint16_t offset; /* Vega/Navi only */
1038 bool slc : 1; /* system level coherent */
1039 bool glc : 1; /* globally coherent */
1040 bool dlc : 1; /* NAVI: device level coherent */
1041 bool lds : 1;
1042 bool nv : 1;
1043 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1044 bool can_reorder : 1;
1045 barrier_interaction barrier;
1046 };
1047
1048 struct Export_instruction : public Instruction {
1049 uint8_t enabled_mask;
1050 uint8_t dest;
1051 bool compressed : 1;
1052 bool done : 1;
1053 bool valid_mask : 1;
1054 };
1055
1056 struct Pseudo_instruction : public Instruction {
1057 bool tmp_in_scc;
1058 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1059 };
1060
1061 struct Pseudo_branch_instruction : public Instruction {
1062 /* target[0] is the block index of the branch target.
1063 * For conditional branches, target[1] contains the fall-through alternative.
1064 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1065 */
1066 uint32_t target[2];
1067 };
1068
1069 struct Pseudo_barrier_instruction : public Instruction {
1070 };
1071
1072 enum ReduceOp {
1073 iadd32, iadd64,
1074 imul32, imul64,
1075 fadd32, fadd64,
1076 fmul32, fmul64,
1077 imin32, imin64,
1078 imax32, imax64,
1079 umin32, umin64,
1080 umax32, umax64,
1081 fmin32, fmin64,
1082 fmax32, fmax64,
1083 iand32, iand64,
1084 ior32, ior64,
1085 ixor32, ixor64,
1086 gfx10_wave64_bpermute
1087 };
1088
1089 /**
1090 * Subgroup Reduction Instructions; everything except for the data to be
1091 * reduced and the result is inserted by setup_reduce_temp().
1092 * Operand(0): data to be reduced
1093 * Operand(1): reduce temporary
1094 * Operand(2): vector temporary
1095 * Definition(0): result
1096 * Definition(1): scalar temporary
1097 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1098 * Definition(3): scc clobber
1099 * Definition(4): vcc clobber
1100 *
1101 */
1102 struct Pseudo_reduction_instruction : public Instruction {
1103 ReduceOp reduce_op;
1104 unsigned cluster_size; // must be 0 for scans
1105 };
1106
1107 struct instr_deleter_functor {
1108 void operator()(void* p) {
1109 free(p);
1110 }
1111 };
1112
1113 template<typename T>
1114 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1115
1116 template<typename T>
1117 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1118 {
1119 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1120 char *data = (char*) calloc(1, size);
1121 T* inst = (T*) data;
1122
1123 inst->opcode = opcode;
1124 inst->format = format;
1125
1126 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1127 inst->operands = aco::span<Operand>(operands_offset, num_operands);
1128 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1129 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1130
1131 return inst;
1132 }
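/* Example (illustrative): the usual allocation pattern. The operand and
 * definition spans live in the same calloc'd block, directly behind the
 * instruction struct. Opcode and temps below are placeholders; any
 * instruction is built the same way:
 *
 *   aco_ptr<VOP2_instruction> add{create_instruction<VOP2_instruction>(
 *         aco_opcode::v_add_f32, Format::VOP2, 2, 1)};
 *   add->operands[0] = Operand(src0);
 *   add->operands[1] = Operand(src1);
 *   add->definitions[0] = Definition(dst);
 */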
1133
1134 constexpr bool Instruction::usesModifiers() const noexcept
1135 {
1136 if (isDPP() || isSDWA())
1137 return true;
1138 if (!isVOP3())
1139 return false;
1140 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1141 for (unsigned i = 0; i < operands.size(); i++) {
1142 if (vop3->abs[i] || vop3->neg[i])
1143 return true;
1144 }
1145 return vop3->opsel || vop3->clamp || vop3->omod;
1146 }
1147
1148 constexpr bool is_phi(Instruction* instr)
1149 {
1150 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1151 }
1152
1153 static inline bool is_phi(aco_ptr<Instruction>& instr)
1154 {
1155 return is_phi(instr.get());
1156 }
1157
1158 barrier_interaction get_barrier_interaction(Instruction* instr);
1159
1160 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1161
1162 enum block_kind {
1163 /* uniform indicates that when leaving this block,
1164 * all active lanes stay active */
1165 block_kind_uniform = 1 << 0,
1166 block_kind_top_level = 1 << 1,
1167 block_kind_loop_preheader = 1 << 2,
1168 block_kind_loop_header = 1 << 3,
1169 block_kind_loop_exit = 1 << 4,
1170 block_kind_continue = 1 << 5,
1171 block_kind_break = 1 << 6,
1172 block_kind_continue_or_break = 1 << 7,
1173 block_kind_discard = 1 << 8,
1174 block_kind_branch = 1 << 9,
1175 block_kind_merge = 1 << 10,
1176 block_kind_invert = 1 << 11,
1177 block_kind_uses_discard_if = 1 << 12,
1178 block_kind_needs_lowering = 1 << 13,
1179 block_kind_uses_demote = 1 << 14,
1180 block_kind_export_end = 1 << 15,
1181 };
1182
1183
1184 struct RegisterDemand {
1185 constexpr RegisterDemand() = default;
1186 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1187 : vgpr{v}, sgpr{s} {}
1188 int16_t vgpr = 0;
1189 int16_t sgpr = 0;
1190
1191 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1192 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1193 }
1194
1195 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1196 return vgpr > other.vgpr || sgpr > other.sgpr;
1197 }
1198
1199 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1200 if (t.type() == RegType::sgpr)
1201 return RegisterDemand( vgpr, sgpr + t.size() );
1202 else
1203 return RegisterDemand( vgpr + t.size(), sgpr );
1204 }
1205
1206 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1207 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1208 }
1209
1210 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1211 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1212 }
1213
1214 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1215 vgpr += other.vgpr;
1216 sgpr += other.sgpr;
1217 return *this;
1218 }
1219
1220 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1221 vgpr -= other.vgpr;
1222 sgpr -= other.sgpr;
1223 return *this;
1224 }
1225
1226 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1227 if (t.type() == RegType::sgpr)
1228 sgpr += t.size();
1229 else
1230 vgpr += t.size();
1231 return *this;
1232 }
1233
1234 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1235 if (t.type() == RegType::sgpr)
1236 sgpr -= t.size();
1237 else
1238 vgpr -= t.size();
1239 return *this;
1240 }
1241
1242 constexpr void update(const RegisterDemand other) noexcept {
1243 vgpr = std::max(vgpr, other.vgpr);
1244 sgpr = std::max(sgpr, other.sgpr);
1245 }
1246
1247 };
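/* Example (illustrative): demand is tracked separately for SGPRs and VGPRs,
 * in units of whole registers (Temp::size(), not bytes):
 *
 *   RegisterDemand d(2, 4);           // 2 VGPRs, 4 SGPRs
 *   d += Temp(1, v2);                 // vgpr -> 4
 *   d -= Temp(2, s1);                 // sgpr -> 3
 *   d.update(RegisterDemand(8, 1));   // component-wise max -> {8, 3}
 *   d.exceeds(RegisterDemand(8, 3));  // false: neither component is larger
 */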
1248
1249 /* CFG */
1250 struct Block {
1251 float_mode fp_mode;
1252 unsigned index;
1253 unsigned offset = 0;
1254 std::vector<aco_ptr<Instruction>> instructions;
1255 std::vector<unsigned> logical_preds;
1256 std::vector<unsigned> linear_preds;
1257 std::vector<unsigned> logical_succs;
1258 std::vector<unsigned> linear_succs;
1259 RegisterDemand register_demand = RegisterDemand();
1260 uint16_t loop_nest_depth = 0;
1261 uint16_t kind = 0;
1262 int logical_idom = -1;
1263 int linear_idom = -1;
1264 Temp live_out_exec = Temp();
1265
1266 /* this information is needed for predecessors to blocks with phis when
1267 * moving out of ssa */
1268 bool scc_live_out = false;
1269 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1270
1271 Block(unsigned idx) : index(idx) {}
1272 Block() : index(0) {}
1273 };
1274
1275 using Stage = uint16_t;
1276
1277 /* software stages */
1278 static constexpr Stage sw_vs = 1 << 0;
1279 static constexpr Stage sw_gs = 1 << 1;
1280 static constexpr Stage sw_tcs = 1 << 2;
1281 static constexpr Stage sw_tes = 1 << 3;
1282 static constexpr Stage sw_fs = 1 << 4;
1283 static constexpr Stage sw_cs = 1 << 5;
1284 static constexpr Stage sw_gs_copy = 1 << 6;
1285 static constexpr Stage sw_mask = 0x7f;
1286
1287 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1288 static constexpr Stage hw_vs = 1 << 7;
1289 static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1290 static constexpr Stage hw_gs = 1 << 9;
1291 static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1292 static constexpr Stage hw_hs = 1 << 11;
1293 static constexpr Stage hw_fs = 1 << 12;
1294 static constexpr Stage hw_cs = 1 << 13;
1295 static constexpr Stage hw_mask = 0x7f << 7;
1296
1297 /* possible settings of Program::stage */
1298 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1299 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1300 static constexpr Stage compute_cs = sw_cs | hw_cs;
1301 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1302 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1303 /* GFX10/NGG */
1304 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1305 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1306 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1307 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1308 /* GFX9 (and GFX10 if NGG isn't used) */
1309 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1310 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1311 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1312 /* pre-GFX9 */
1313 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1314 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1315 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1316 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1317 static constexpr Stage geometry_gs = sw_gs | hw_gs;
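/* Example (illustrative): Program::stage is tested by masking. The sw_* bits
 * say which API stages were merged into the shader, the hw_* bit says which
 * hardware stage it runs on:
 *
 *   Stage stage = vertex_geometry_gs;   // merged VS+GS (GFX9+, non-NGG)
 *   bool has_gs = stage & sw_gs;        // true
 *   bool on_gs_hw = stage & hw_gs;      // true
 *   Stage hw_only = stage & hw_mask;    // == hw_gs
 */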
1318
1319 enum statistic {
1320 statistic_hash,
1321 statistic_instructions,
1322 statistic_copies,
1323 statistic_branches,
1324 statistic_cycles,
1325 statistic_vmem_clauses,
1326 statistic_smem_clauses,
1327 statistic_vmem_score,
1328 statistic_smem_score,
1329 statistic_sgpr_presched,
1330 statistic_vgpr_presched,
1331 num_statistics
1332 };
1333
1334 class Program final {
1335 public:
1336 float_mode next_fp_mode;
1337 std::vector<Block> blocks;
1338 RegisterDemand max_reg_demand = RegisterDemand();
1339 uint16_t num_waves = 0;
1340 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1341 ac_shader_config* config;
1342 struct radv_shader_info *info;
1343 enum chip_class chip_class;
1344 enum radeon_family family;
1345 unsigned wave_size;
1346 RegClass lane_mask;
1347 Stage stage; /* Stage */
1348 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1349 bool needs_wqm = false; /* there exists a p_wqm instruction */
1350 bool wb_smem_l1_on_end = false;
1351
1352 std::vector<uint8_t> constant_data;
1353 Temp private_segment_buffer;
1354 Temp scratch_offset;
1355
1356 uint16_t min_waves = 0;
1357 uint16_t lds_alloc_granule;
1358 uint32_t lds_limit; /* in bytes */
1359 bool has_16bank_lds;
1360 uint16_t vgpr_limit;
1361 uint16_t sgpr_limit;
1362 uint16_t physical_sgprs;
1363 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1364 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1365 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1366
1367 bool xnack_enabled = false;
1368
1369 bool needs_vcc = false;
1370 bool needs_flat_scr = false;
1371
1372 bool collect_statistics = false;
1373 uint32_t statistics[num_statistics];
1374
1375 uint32_t allocateId()
1376 {
1377 assert(allocationID <= 16777215);
1378 return allocationID++;
1379 }
1380
1381 uint32_t peekAllocationId()
1382 {
1383 return allocationID;
1384 }
1385
1386 void setAllocationId(uint32_t id)
1387 {
1388 allocationID = id;
1389 }
1390
1391 Block* create_and_insert_block() {
1392 blocks.emplace_back(blocks.size());
1393 blocks.back().fp_mode = next_fp_mode;
1394 return &blocks.back();
1395 }
1396
1397 Block* insert_block(Block&& block) {
1398 block.index = blocks.size();
1399 block.fp_mode = next_fp_mode;
1400 blocks.emplace_back(std::move(block));
1401 return &blocks.back();
1402 }
1403
1404 private:
1405 uint32_t allocationID = 1;
1406 };
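/* Example (illustrative): typical construction of the CFG and of fresh
 * temporaries during instruction selection ("program" is a Program*):
 *
 *   Block *block = program->create_and_insert_block();
 *   block->kind |= block_kind_top_level;
 *   Temp tmp(program->allocateId(), v1);   // new SSA id of class v1
 */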
1407
1408 struct live {
1409 /* live temps out per block */
1410 std::vector<std::set<Temp>> live_out;
1411 /* register demand (sgpr/vgpr) per instruction per block */
1412 std::vector<std::vector<RegisterDemand>> register_demand;
1413 };
1414
1415 void select_program(Program *program,
1416 unsigned shader_count,
1417 struct nir_shader *const *shaders,
1418 ac_shader_config* config,
1419 struct radv_shader_args *args);
1420 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1421 ac_shader_config* config,
1422 struct radv_shader_args *args);
1423
1424 void lower_wqm(Program* program, live& live_vars,
1425 const struct radv_nir_compiler_options *options);
1426 void lower_bool_phis(Program* program);
1427 void calc_min_waves(Program* program);
1428 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1429 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1430 std::vector<uint16_t> dead_code_analysis(Program *program);
1431 void dominator_tree(Program* program);
1432 void insert_exec_mask(Program *program);
1433 void value_numbering(Program* program);
1434 void optimize(Program* program);
1435 void setup_reduce_temp(Program* program);
1436 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1437 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1438 void ssa_elimination(Program* program);
1439 void lower_to_hw_instr(Program* program);
1440 void schedule_program(Program* program, live& live_vars);
1441 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1442 void insert_wait_states(Program* program);
1443 void insert_NOPs(Program* program);
1444 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1445 void print_asm(Program *program, std::vector<uint32_t>& binary,
1446 unsigned exec_size, std::ostream& out);
1447 void validate(Program* program, FILE *output);
1448 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1449 #ifndef NDEBUG
1450 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1451 #else
1452 #define perfwarn(cond, msg, ...) do {} while(0)
1453 #endif
1454
1455 void collect_presched_stats(Program *program);
1456 void collect_preasm_stats(Program *program);
1457 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1458
1459 void aco_print_instr(Instruction *instr, FILE *output);
1460 void aco_print_program(Program *program, FILE *output);
1461
1462 /* utilities for dealing with register demand */
1463 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1464 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1465 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1466
1467 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1468 uint16_t get_extra_sgprs(Program *program);
1469
1470 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1471 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1472 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1473
1474 /* return number of addressable sgprs/vgprs for max_waves */
1475 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1476 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1477
1478 typedef struct {
1479 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1480 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1481 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1482 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1483 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1484 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1485 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1486 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1487 } Info;
1488
1489 extern const Info instr_info;
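/* Example (illustrative): per-opcode properties are looked up by casting the
 * opcode to an index:
 *
 *   unsigned idx = static_cast<unsigned>(op);   // op is an aco_opcode
 *   const char *name = instr_info.name[idx];
 *   Format fmt = instr_info.format[idx];
 *   bool has_input_mods = instr_info.can_use_input_modifiers[idx];
 */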
1490
1491 }
1492
1493 #endif /* ACO_IR_H */
1494