aco: add sub-dword regclasses
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
47 enum {
48 DEBUG_VALIDATE = 0x1,
49 DEBUG_VALIDATE_RA = 0x2,
50 DEBUG_PERFWARN = 0x4,
51 };
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
62 enum class Format : std::uint16_t {
63 /* Pseudo Instruction Format */
64 PSEUDO = 0,
65 /* Scalar ALU & Control Formats */
66 SOP1 = 1,
67 SOP2 = 2,
68 SOPK = 3,
69 SOPP = 4,
70 SOPC = 5,
71 /* Scalar Memory Format */
72 SMEM = 6,
73 /* LDS/GDS Format */
74 DS = 8,
75 /* Vector Memory Buffer Formats */
76 MTBUF = 9,
77 MUBUF = 10,
78 /* Vector Memory Image Format */
79 MIMG = 11,
80 /* Export Format */
81 EXP = 12,
82 /* Flat Formats */
83 FLAT = 13,
84 GLOBAL = 14,
85 SCRATCH = 15,
86
87 PSEUDO_BRANCH = 16,
88 PSEUDO_BARRIER = 17,
89 PSEUDO_REDUCTION = 18,
90
91 /* Vector ALU Formats */
92 VOP1 = 1 << 8,
93 VOP2 = 1 << 9,
94 VOPC = 1 << 10,
95 VOP3 = 1 << 11,
96 VOP3A = 1 << 11,
97 VOP3B = 1 << 11,
98 VOP3P = 1 << 12,
99 /* Vector Parameter Interpolation Format */
100 VINTRP = 1 << 13,
101 DPP = 1 << 14,
102 SDWA = 1 << 15,
103 };
104
105 enum barrier_interaction : uint8_t {
106 barrier_none = 0,
107 barrier_buffer = 0x1,
108 barrier_image = 0x2,
109 barrier_atomic = 0x4,
110 barrier_shared = 0x8,
111 /* used for geometry shaders to ensure vertex data writes are finished before the
112 * GS_DONE s_sendmsg. */
113 barrier_gs_data = 0x10,
114 /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
115 barrier_gs_sendmsg = 0x20,
116 /* used by barriers. created by s_barrier */
117 barrier_barrier = 0x40,
118 barrier_count = 7, /* number of barrier bits above */
119 };
120
121 enum fp_round {
122 fp_round_ne = 0,
123 fp_round_pi = 1,
124 fp_round_ni = 2,
125 fp_round_tz = 3,
126 };
127
128 enum fp_denorm {
129 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
130 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
131 fp_denorm_flush = 0x0,
132 fp_denorm_keep = 0x3,
133 };
134
135 struct float_mode {
136 /* matches encoding of the MODE register */
137 union {
138 struct {
139 fp_round round32:2;
140 fp_round round16_64:2;
141 unsigned denorm32:2;
142 unsigned denorm16_64:2;
143 };
144 uint8_t val = 0;
145 };
146 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
147 bool preserve_signed_zero_inf_nan32:1;
148 bool preserve_signed_zero_inf_nan16_64:1;
149 /* if false, optimizations which may remove denormal flushing can be done */
150 bool must_flush_denorms32:1;
151 bool must_flush_denorms16_64:1;
152 bool care_about_round32:1;
153 bool care_about_round16_64:1;
154
155 /* Returns true if instructions using the mode "other" can safely use the
156 * current one instead. */
157 bool canReplace(float_mode other) const noexcept {
158 return val == other.val &&
159 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
160 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
161 (must_flush_denorms32 || !other.must_flush_denorms32) &&
162 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
163 (care_about_round32 || !other.care_about_round32) &&
164 (care_about_round16_64 || !other.care_about_round16_64);
165 }
166 };
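/* Illustrative sketch of canReplace(): a stricter mode (one that preserves or
 * cares about more) can stand in for a looser one, but not the other way
 * around. Assumes both modes use the same MODE register value:
 *
 *   float_mode strict = {};
 *   strict.care_about_round32 = true;
 *   float_mode relaxed = {};
 *   assert(strict.canReplace(relaxed));    // stricter mode may replace looser one
 *   assert(!relaxed.canReplace(strict));
 */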
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
171
172 constexpr Format asSDWA(Format format) {
173 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
174 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
175 }
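/* Illustrative sketch: the combined formats described above are plain bit-ors,
 * e.g. promoting a VOP2 instruction to the VOP3A encoding keeps the VOP2 bit:
 *
 *   Format f = asVOP3(Format::VOP2);
 *   // both the VOP2 and the VOP3 bits of f are now set, so for an instruction
 *   // with this format, Instruction::isVALU() and Instruction::isVOP3()
 *   // (defined further down) both return true.
 */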
176
177 enum class RegType {
178 none = 0,
179 sgpr,
180 vgpr,
181 linear_vgpr,
182 };
183
184 struct RegClass {
185
186 enum RC : uint8_t {
187 s1 = 1,
188 s2 = 2,
189 s3 = 3,
190 s4 = 4,
191 s6 = 6,
192 s8 = 8,
193 s16 = 16,
194 v1 = s1 | (1 << 5),
195 v2 = s2 | (1 << 5),
196 v3 = s3 | (1 << 5),
197 v4 = s4 | (1 << 5),
198 v5 = 5 | (1 << 5),
199 v6 = 6 | (1 << 5),
200 v7 = 7 | (1 << 5),
201 v8 = 8 | (1 << 5),
202 /* byte-sized register classes */
203 v1b = v1 | (1 << 7),
204 v2b = v2 | (1 << 7),
205 v3b = v3 | (1 << 7),
206 v4b = v4 | (1 << 7),
207 v6b = v6 | (1 << 7),
208 v8b = v8 | (1 << 7),
209 /* these are used for WWM and spills to vgpr */
210 v1_linear = v1 | (1 << 6),
211 v2_linear = v2 | (1 << 6),
212 };
213
214 RegClass() = default;
215 constexpr RegClass(RC rc)
216 : rc(rc) {}
217 constexpr RegClass(RegType type, unsigned size)
218 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
219
220 constexpr operator RC() const { return rc; }
221 explicit operator bool() = delete;
222
223 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
224 constexpr bool is_subdword() const { return rc & (1 << 7); }
225 constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
226 //TODO: use size() less in favor of bytes()
227 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
228 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
229 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
230
231 private:
232 RC rc;
233 };
234
235 /* transitional helper expressions */
236 static constexpr RegClass s1{RegClass::s1};
237 static constexpr RegClass s2{RegClass::s2};
238 static constexpr RegClass s3{RegClass::s3};
239 static constexpr RegClass s4{RegClass::s4};
240 static constexpr RegClass s8{RegClass::s8};
241 static constexpr RegClass s16{RegClass::s16};
242 static constexpr RegClass v1{RegClass::v1};
243 static constexpr RegClass v2{RegClass::v2};
244 static constexpr RegClass v3{RegClass::v3};
245 static constexpr RegClass v4{RegClass::v4};
246 static constexpr RegClass v5{RegClass::v5};
247 static constexpr RegClass v6{RegClass::v6};
248 static constexpr RegClass v7{RegClass::v7};
249 static constexpr RegClass v8{RegClass::v8};
250 static constexpr RegClass v1b{RegClass::v1b};
251 static constexpr RegClass v2b{RegClass::v2b};
252 static constexpr RegClass v3b{RegClass::v3b};
253 static constexpr RegClass v4b{RegClass::v4b};
254 static constexpr RegClass v6b{RegClass::v6b};
255 static constexpr RegClass v8b{RegClass::v8b};
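/* Illustrative sketch of the sub-dword register classes added above: bit 7
 * marks a sub-dword class and the low bits then count bytes instead of dwords,
 * e.g. with the helpers defined here:
 *
 *   v2b.is_subdword() == true;   v2b.bytes() == 2;   v2b.size() == 1;
 *   v2.is_subdword() == false;   v2.bytes() == 8;    v2.size() == 2;
 *   RegClass(RegType::vgpr, 2) == v2;   // the (type, size) constructor is dword-based
 */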
256
257 /**
258 * Temp Class
259 * Each temporary virtual register has a
260 * register class (i.e. size and type)
261 * and SSA id.
262 */
263 struct Temp {
264 Temp() = default;
265 constexpr Temp(uint32_t id, RegClass cls) noexcept
266 : id_(id), reg_class(cls) {}
267
268 constexpr uint32_t id() const noexcept { return id_; }
269 constexpr RegClass regClass() const noexcept { return reg_class; }
270
271 constexpr unsigned bytes() const noexcept { return reg_class.bytes(); }
272 constexpr unsigned size() const noexcept { return reg_class.size(); }
273 constexpr RegType type() const noexcept { return reg_class.type(); }
274 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
275
276 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
277 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
278 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
279
280 private:
281 uint32_t id_:24;
282 RegClass reg_class;
283 };
284
285 /**
286 * PhysReg
287 * Represents the physical register for each
288 * Operand and Definition.
289 */
290 struct PhysReg {
291 constexpr PhysReg() = default;
292 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
293 constexpr unsigned reg() const { return reg_b >> 2; }
294 constexpr unsigned byte() const { return reg_b & 0x3; }
295 constexpr operator unsigned() const { return reg(); }
296 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
297 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
298 constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
299
300 uint16_t reg_b = 0;
301 };
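/* Illustrative sketch: reg_b is a byte address, so a PhysReg can point at a
 * sub-dword location. Assuming the usual encoding where VGPRs start at 256:
 *
 *   PhysReg r{256 + 3};     // v3, byte 0
 *   r.reg_b += 2;           // upper half of v3
 *   // r.reg() == 259 and r.byte() == 2
 */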
302
303 /* helper expressions for special registers */
304 static constexpr PhysReg m0{124};
305 static constexpr PhysReg vcc{106};
306 static constexpr PhysReg vcc_hi{107};
307 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
308 static constexpr PhysReg exec{126};
309 static constexpr PhysReg exec_lo{126};
310 static constexpr PhysReg exec_hi{127};
311 static constexpr PhysReg vccz{251};
312 static constexpr PhysReg execz{252};
313 static constexpr PhysReg scc{253};
314
315 /**
316 * Operand Class
317 * Initially, each Operand refers to either
318 * a temporary virtual register
319 * a temporary virtual register
320 * or to a constant value.
321 * Temporary registers get mapped to physical registers during RA;
322 */
323 class Operand final
324 {
325 public:
326 constexpr Operand()
327 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
328 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
329 isLateKill_(false) {}
330
331 explicit Operand(Temp r) noexcept
332 {
333 data_.temp = r;
334 if (r.id()) {
335 isTemp_ = true;
336 } else {
337 isUndef_ = true;
338 setFixed(PhysReg{128});
339 }
340 };
341 explicit Operand(uint32_t v, bool is64bit = false) noexcept
342 {
343 data_.i = v;
344 isConstant_ = true;
345 is64BitConst_ = is64bit;
346 if (v <= 64)
347 setFixed(PhysReg{128 + v});
348 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
349 setFixed(PhysReg{192 - v});
350 else if (v == 0x3f000000) /* 0.5 */
351 setFixed(PhysReg{240});
352 else if (v == 0xbf000000) /* -0.5 */
353 setFixed(PhysReg{241});
354 else if (v == 0x3f800000) /* 1.0 */
355 setFixed(PhysReg{242});
356 else if (v == 0xbf800000) /* -1.0 */
357 setFixed(PhysReg{243});
358 else if (v == 0x40000000) /* 2.0 */
359 setFixed(PhysReg{244});
360 else if (v == 0xc0000000) /* -2.0 */
361 setFixed(PhysReg{245});
362 else if (v == 0x40800000) /* 4.0 */
363 setFixed(PhysReg{246});
364 else if (v == 0xc0800000) /* -4.0 */
365 setFixed(PhysReg{247});
366 else { /* Literal Constant */
367 assert(!is64bit && "attempt to create a 64-bit literal constant");
368 setFixed(PhysReg{255});
369 }
370 };
371 explicit Operand(uint64_t v) noexcept
372 {
373 isConstant_ = true;
374 is64BitConst_ = true;
375 if (v <= 64) {
376 data_.i = (uint32_t) v;
377 setFixed(PhysReg{128 + (uint32_t) v});
378 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
379 data_.i = (uint32_t) v;
380 setFixed(PhysReg{192 - (uint32_t) v});
381 } else if (v == 0x3FE0000000000000) { /* 0.5 */
382 data_.i = 0x3f000000;
383 setFixed(PhysReg{240});
384 } else if (v == 0xBFE0000000000000) { /* -0.5 */
385 data_.i = 0xbf000000;
386 setFixed(PhysReg{241});
387 } else if (v == 0x3FF0000000000000) { /* 1.0 */
388 data_.i = 0x3f800000;
389 setFixed(PhysReg{242});
390 } else if (v == 0xBFF0000000000000) { /* -1.0 */
391 data_.i = 0xbf800000;
392 setFixed(PhysReg{243});
393 } else if (v == 0x4000000000000000) { /* 2.0 */
394 data_.i = 0x40000000;
395 setFixed(PhysReg{244});
396 } else if (v == 0xC000000000000000) { /* -2.0 */
397 data_.i = 0xc0000000;
398 setFixed(PhysReg{245});
399 } else if (v == 0x4010000000000000) { /* 4.0 */
400 data_.i = 0x40800000;
401 setFixed(PhysReg{246});
402 } else if (v == 0xC010000000000000) { /* -4.0 */
403 data_.i = 0xc0800000;
404 setFixed(PhysReg{247});
405 } else { /* Literal Constant: we don't know if it is a long or double. */
406 isConstant_ = 0;
407 assert(false && "attempt to create a 64-bit literal constant");
408 }
409 };
410 explicit Operand(RegClass type) noexcept
411 {
412 isUndef_ = true;
413 data_.temp = Temp(0, type);
414 setFixed(PhysReg{128});
415 };
416 explicit Operand(PhysReg reg, RegClass type) noexcept
417 {
418 data_.temp = Temp(0, type);
419 setFixed(reg);
420 }
421
422 constexpr bool isTemp() const noexcept
423 {
424 return isTemp_;
425 }
426
427 constexpr void setTemp(Temp t) noexcept {
428 assert(!isConstant_);
429 isTemp_ = true;
430 data_.temp = t;
431 }
432
433 constexpr Temp getTemp() const noexcept
434 {
435 return data_.temp;
436 }
437
438 constexpr uint32_t tempId() const noexcept
439 {
440 return data_.temp.id();
441 }
442
443 constexpr bool hasRegClass() const noexcept
444 {
445 return isTemp() || isUndefined();
446 }
447
448 constexpr RegClass regClass() const noexcept
449 {
450 return data_.temp.regClass();
451 }
452
453 constexpr unsigned bytes() const noexcept
454 {
455 if (isConstant())
456 return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
457 else
458 return data_.temp.bytes();
459 }
460
461 constexpr unsigned size() const noexcept
462 {
463 if (isConstant())
464 return is64BitConst_ ? 2 : 1;
465 else
466 return data_.temp.size();
467 }
468
469 constexpr bool isFixed() const noexcept
470 {
471 return isFixed_;
472 }
473
474 constexpr PhysReg physReg() const noexcept
475 {
476 return reg_;
477 }
478
479 constexpr void setFixed(PhysReg reg) noexcept
480 {
481 isFixed_ = reg != unsigned(-1);
482 reg_ = reg;
483 }
484
485 constexpr bool isConstant() const noexcept
486 {
487 return isConstant_;
488 }
489
490 constexpr bool isLiteral() const noexcept
491 {
492 return isConstant() && reg_ == 255;
493 }
494
495 constexpr bool isUndefined() const noexcept
496 {
497 return isUndef_;
498 }
499
500 constexpr uint32_t constantValue() const noexcept
501 {
502 return data_.i;
503 }
504
505 constexpr bool constantEquals(uint32_t cmp) const noexcept
506 {
507 return isConstant() && constantValue() == cmp;
508 }
509
510 constexpr uint64_t constantValue64(bool signext=false) const noexcept
511 {
512 if (is64BitConst_) {
513 if (reg_ <= 192)
514 return reg_ - 128;
515 else if (reg_ <= 208)
516 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
517
518 switch (reg_) {
519 case 240:
520 return 0x3FE0000000000000;
521 case 241:
522 return 0xBFE0000000000000;
523 case 242:
524 return 0x3FF0000000000000;
525 case 243:
526 return 0xBFF0000000000000;
527 case 244:
528 return 0x4000000000000000;
529 case 245:
530 return 0xC000000000000000;
531 case 246:
532 return 0x4010000000000000;
533 case 247:
534 return 0xC010000000000000;
535 }
536 }
537 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
538 }
539
540 /* Indicates that the killed operand's live range intersects with the
541 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
542 * not set by liveness analysis. */
543 constexpr void setLateKill(bool flag) noexcept
544 {
545 isLateKill_ = flag;
546 }
547
548 constexpr bool isLateKill() const noexcept
549 {
550 return isLateKill_;
551 }
552
553 constexpr void setKill(bool flag) noexcept
554 {
555 isKill_ = flag;
556 if (!flag)
557 setFirstKill(false);
558 }
559
560 constexpr bool isKill() const noexcept
561 {
562 return isKill_ || isFirstKill();
563 }
564
565 constexpr void setFirstKill(bool flag) noexcept
566 {
567 isFirstKill_ = flag;
568 if (flag)
569 setKill(flag);
570 }
571
572 /* When there are multiple operands killing the same temporary,
573 * isFirstKill() only returns true for the first one. */
574 constexpr bool isFirstKill() const noexcept
575 {
576 return isFirstKill_;
577 }
578
579 constexpr bool isKillBeforeDef() const noexcept
580 {
581 return isKill() && !isLateKill();
582 }
583
584 constexpr bool isFirstKillBeforeDef() const noexcept
585 {
586 return isFirstKill() && !isLateKill();
587 }
588
589 constexpr bool operator == (Operand other) const noexcept
590 {
591 if (other.size() != size())
592 return false;
593 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
594 return false;
595 if (isFixed() && other.isFixed() && physReg() != other.physReg())
596 return false;
597 if (isLiteral())
598 return other.isLiteral() && other.constantValue() == constantValue();
599 else if (isConstant())
600 return other.isConstant() && other.physReg() == physReg();
601 else if (isUndefined())
602 return other.isUndefined() && other.regClass() == regClass();
603 else
604 return other.isTemp() && other.getTemp() == getTemp();
605 }
606 private:
607 union {
608 uint32_t i;
609 float f;
610 Temp temp = Temp(0, s1);
611 } data_;
612 PhysReg reg_;
613 union {
614 struct {
615 uint8_t isTemp_:1;
616 uint8_t isFixed_:1;
617 uint8_t isConstant_:1;
618 uint8_t isKill_:1;
619 uint8_t isUndef_:1;
620 uint8_t isFirstKill_:1;
621 uint8_t is64BitConst_:1;
622 uint8_t isLateKill_:1;
623 };
624 /* can't initialize bit-fields in c++11, so work around using a union */
625 uint8_t control_ = 0;
626 };
627 };
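/* Illustrative sketch of Operand construction (tmp_id is a hypothetical SSA id):
 *
 *   Operand a(Temp(tmp_id, v1));     // temporary operand of class v1
 *   Operand b((uint32_t) 17);        // inline constant, fixed to PhysReg{128 + 17}
 *   Operand c((uint32_t) 1234);      // too large to inline: literal, isLiteral() == true
 *   Operand d(v2b);                  // undefined sub-dword operand of class v2b
 */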
628
629 /**
630 * Definition Class
631 * Definitions are the results of Instructions
632 * and refer to temporary virtual registers
633 * which are later mapped to physical registers
634 */
635 class Definition final
636 {
637 public:
638 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
639 Definition(uint32_t index, RegClass type) noexcept
640 : temp(index, type) {}
641 explicit Definition(Temp tmp) noexcept
642 : temp(tmp) {}
643 Definition(PhysReg reg, RegClass type) noexcept
644 : temp(Temp(0, type))
645 {
646 setFixed(reg);
647 }
648 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
649 : temp(Temp(tmpId, type))
650 {
651 setFixed(reg);
652 }
653
654 constexpr bool isTemp() const noexcept
655 {
656 return tempId() > 0;
657 }
658
659 constexpr Temp getTemp() const noexcept
660 {
661 return temp;
662 }
663
664 constexpr uint32_t tempId() const noexcept
665 {
666 return temp.id();
667 }
668
669 constexpr void setTemp(Temp t) noexcept {
670 temp = t;
671 }
672
673 constexpr RegClass regClass() const noexcept
674 {
675 return temp.regClass();
676 }
677
678 constexpr unsigned bytes() const noexcept
679 {
680 return temp.bytes();
681 }
682
683 constexpr unsigned size() const noexcept
684 {
685 return temp.size();
686 }
687
688 constexpr bool isFixed() const noexcept
689 {
690 return isFixed_;
691 }
692
693 constexpr PhysReg physReg() const noexcept
694 {
695 return reg_;
696 }
697
698 constexpr void setFixed(PhysReg reg) noexcept
699 {
700 isFixed_ = 1;
701 reg_ = reg;
702 }
703
704 constexpr void setHint(PhysReg reg) noexcept
705 {
706 hasHint_ = 1;
707 reg_ = reg;
708 }
709
710 constexpr bool hasHint() const noexcept
711 {
712 return hasHint_;
713 }
714
715 constexpr void setKill(bool flag) noexcept
716 {
717 isKill_ = flag;
718 }
719
720 constexpr bool isKill() const noexcept
721 {
722 return isKill_;
723 }
724
725 private:
726 Temp temp = Temp(0, s1);
727 PhysReg reg_;
728 union {
729 struct {
730 uint8_t isFixed_:1;
731 uint8_t hasHint_:1;
732 uint8_t isKill_:1;
733 };
734 /* can't initialize bit-fields in c++11, so work around using a union */
735 uint8_t control_ = 0;
736 };
737 };
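/* Illustrative sketch of Definition construction (new_id is a hypothetical id,
 * e.g. obtained from Program::allocateId() further down in this header):
 *
 *   Definition dst(new_id, v1);      // new v1 temporary
 *   Definition carry(vcc, s2);       // result fixed to a physical register
 */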
738
739 class Block;
740
741 struct Instruction {
742 aco_opcode opcode;
743 Format format;
744 uint32_t pass_flags;
745
746 aco::span<Operand> operands;
747 aco::span<Definition> definitions;
748
749 constexpr bool isVALU() const noexcept
750 {
751 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
752 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
753 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
754 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
755 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
756 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
757 }
758
759 constexpr bool isSALU() const noexcept
760 {
761 return format == Format::SOP1 ||
762 format == Format::SOP2 ||
763 format == Format::SOPC ||
764 format == Format::SOPK ||
765 format == Format::SOPP;
766 }
767
768 constexpr bool isVMEM() const noexcept
769 {
770 return format == Format::MTBUF ||
771 format == Format::MUBUF ||
772 format == Format::MIMG;
773 }
774
775 constexpr bool isDPP() const noexcept
776 {
777 return (uint16_t) format & (uint16_t) Format::DPP;
778 }
779
780 constexpr bool isVOP3() const noexcept
781 {
782 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
783 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
784 format == Format::VOP3P;
785 }
786
787 constexpr bool isSDWA() const noexcept
788 {
789 return (uint16_t) format & (uint16_t) Format::SDWA;
790 }
791
792 constexpr bool isFlatOrGlobal() const noexcept
793 {
794 return format == Format::FLAT || format == Format::GLOBAL;
795 }
796
797 constexpr bool usesModifiers() const noexcept;
798
799 constexpr bool reads_exec() const noexcept
800 {
801 for (const Operand& op : operands) {
802 if (op.isFixed() && op.physReg() == exec)
803 return true;
804 }
805 return false;
806 }
807 };
808
809 struct SOPK_instruction : public Instruction {
810 uint16_t imm;
811 };
812
813 struct SOPP_instruction : public Instruction {
814 uint32_t imm;
815 int block;
816 };
817
818 struct SOPC_instruction : public Instruction {
819 };
820
821 struct SOP1_instruction : public Instruction {
822 };
823
824 struct SOP2_instruction : public Instruction {
825 };
826
827 /**
828 * Scalar Memory Format:
829 * For s_(buffer_)load_dword*:
830 * Operand(0): SBASE - SGPR-pair which provides base address
831 * Operand(1): Offset - immediate (un)signed offset or SGPR
832 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
833 * Operand(n-1): SOffset - SGPR offset (Vega only)
834 *
835 * Having no operands is also valid for instructions such as s_dcache_inv.
836 *
837 */
838 struct SMEM_instruction : public Instruction {
839 bool glc : 1; /* VI+: globally coherent */
840 bool dlc : 1; /* NAVI: device level coherent */
841 bool nv : 1; /* VEGA only: Non-volatile */
842 bool can_reorder : 1;
843 bool disable_wqm : 1;
844 barrier_interaction barrier;
845 };
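/* Illustrative sketch of the operand layout documented above, using
 * create_instruction<> from further down in this header (base and dst are
 * hypothetical s2 temporaries):
 *
 *   aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(
 *         aco_opcode::s_load_dwordx2, Format::SMEM, 2, 1)};
 *   load->operands[0] = Operand(base);           // SBASE
 *   load->operands[1] = Operand((uint32_t) 0);   // immediate offset
 *   load->definitions[0] = Definition(dst);      // SDATA
 */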
846
847 struct VOP1_instruction : public Instruction {
848 };
849
850 struct VOP2_instruction : public Instruction {
851 };
852
853 struct VOPC_instruction : public Instruction {
854 };
855
856 struct VOP3A_instruction : public Instruction {
857 bool abs[3];
858 bool neg[3];
859 uint8_t opsel : 4;
860 uint8_t omod : 2;
861 bool clamp : 1;
862 };
863
864 /**
865 * Data Parallel Primitives Format:
866 * This format can be used for VOP1, VOP2 or VOPC instructions.
867 * The swizzle applies to the src0 operand.
868 *
869 */
870 struct DPP_instruction : public Instruction {
871 bool abs[2];
872 bool neg[2];
873 uint16_t dpp_ctrl;
874 uint8_t row_mask : 4;
875 uint8_t bank_mask : 4;
876 bool bound_ctrl : 1;
877 };
878
879 enum sdwa_sel : uint8_t {
880 /* masks */
881 sdwa_wordnum = 0x1,
882 sdwa_bytenum = 0x3,
883 sdwa_asuint = 0x7,
884
885 /* flags */
886 sdwa_isword = 0x4,
887 sdwa_sext = 0x8,
888
889 /* specific values */
890 sdwa_ubyte0 = 0,
891 sdwa_ubyte1 = 1,
892 sdwa_ubyte2 = 2,
893 sdwa_ubyte3 = 3,
894 sdwa_uword0 = sdwa_isword | 0,
895 sdwa_uword1 = sdwa_isword | 1,
896 sdwa_udword = 6,
897
898 sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
899 sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
900 sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
901 sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
902 sdwa_sword0 = sdwa_uword0 | sdwa_sext,
903 sdwa_sword1 = sdwa_uword1 | sdwa_sext,
904 sdwa_sdword = sdwa_udword | sdwa_sext,
905 };
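/* Illustrative sketch of the sel encoding: each value combines an index with
 * the isword/sext flags, e.g.
 *
 *   sdwa_sword1 == (sdwa_isword | 1 | sdwa_sext)   // sign-extend word 1 (bits 16..31)
 *   (sdwa_sword1 & sdwa_wordnum) == 1              // word index
 *   (sdwa_sword1 & sdwa_sext) != 0                 // sign-extension requested
 */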
906
907 /**
908 * Sub-Dword Addressing Format:
909 * This format can be used for VOP1, VOP2 or VOPC instructions.
910 *
911 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
912 * the definition doesn't have to be VCC on GFX9+.
913 *
914 */
915 struct SDWA_instruction : public Instruction {
916 /* these destination modifiers aren't available with VOPC except for
917 * clamp on GFX8 */
918 unsigned dst_sel:4;
919 bool dst_preserve:1;
920 bool clamp:1;
921 unsigned omod:2; /* GFX9+ */
922
923 unsigned sel[2];
924 bool neg[2];
925 bool abs[2];
926 };
927
928 struct Interp_instruction : public Instruction {
929 uint8_t attribute;
930 uint8_t component;
931 };
932
933 /**
934 * Local and Global Data Sharing instructions
935 * Operand(0): ADDR - VGPR which supplies the address.
936 * Operand(1): DATA0 - First data VGPR.
937 * Operand(2): DATA1 - Second data VGPR.
938 * Operand(n-1): M0 - LDS size.
939 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
940 *
941 */
942 struct DS_instruction : public Instruction {
943 int16_t offset0;
944 int8_t offset1;
945 bool gds;
946 };
947
948 /**
949 * Vector Memory Untyped-buffer Instructions
950 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
951 * Operand(1): VADDR - Address source. Can carry an index and/or offset
952 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
953 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
954 *
955 */
956 struct MUBUF_instruction : public Instruction {
957 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
958 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
959 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
960 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
961 bool glc : 1; /* globally coherent */
962 bool dlc : 1; /* NAVI: device level coherent */
963 bool slc : 1; /* system level coherent */
964 bool tfe : 1; /* texture fail enable */
965 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
966 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
967 bool can_reorder : 1;
968 barrier_interaction barrier;
969 };
970
971 /**
972 * Vector Memory Typed-buffer Instructions
973 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
974 * Operand(1): VADDR - Address source. Can carry an index and/or offset
975 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
976 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
977 *
978 */
979 struct MTBUF_instruction : public Instruction {
980 uint16_t offset; /* Unsigned byte offset - 12 bit */
981 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
982 uint8_t nfmt : 3; /* Numeric format of data in memory */
983 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
984 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
985 bool glc : 1; /* globally coherent */
986 bool dlc : 1; /* NAVI: device level coherent */
987 bool slc : 1; /* system level coherent */
988 bool tfe : 1; /* texture fail enable */
989 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
990 bool can_reorder : 1;
991 barrier_interaction barrier;
992 };
993
994 /**
995 * Vector Memory Image Instructions
996 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
997 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
998 * or VDATA - Vector GPR for write data.
999 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1000 * Definition(0): VDATA - Vector GPR for read result.
1001 *
1002 */
1003 struct MIMG_instruction : public Instruction {
1004 uint8_t dmask; /* Data VGPR enable mask */
1005 uint8_t dim : 3; /* NAVI: dimensionality */
1006 bool unrm : 1; /* Force address to be un-normalized */
1007 bool dlc : 1; /* NAVI: device level coherent */
1008 bool glc : 1; /* globally coherent */
1009 bool slc : 1; /* system level coherent */
1010 bool tfe : 1; /* texture fail enable */
1011 bool da : 1; /* declare an array */
1012 bool lwe : 1; /* LOD warning enable */
1013 bool r128 : 1; /* NAVI: Texture resource size (128-bit instead of 256-bit descriptor) */
1014 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1015 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1016 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1017 bool can_reorder : 1;
1018 barrier_interaction barrier;
1019 };
1020
1021 /**
1022 * Flat/Scratch/Global Instructions
1023 * Operand(0): ADDR
1024 * Operand(1): SADDR
1025 * Operand(2) / Definition(0): DATA/VDST
1026 *
1027 */
1028 struct FLAT_instruction : public Instruction {
1029 uint16_t offset; /* Vega/Navi only */
1030 bool slc : 1; /* system level coherent */
1031 bool glc : 1; /* globally coherent */
1032 bool dlc : 1; /* NAVI: device level coherent */
1033 bool lds : 1;
1034 bool nv : 1;
1035 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1036 bool can_reorder : 1;
1037 barrier_interaction barrier;
1038 };
1039
1040 struct Export_instruction : public Instruction {
1041 uint8_t enabled_mask;
1042 uint8_t dest;
1043 bool compressed : 1;
1044 bool done : 1;
1045 bool valid_mask : 1;
1046 };
1047
1048 struct Pseudo_instruction : public Instruction {
1049 bool tmp_in_scc;
1050 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1051 };
1052
1053 struct Pseudo_branch_instruction : public Instruction {
1054 /* target[0] is the block index of the branch target.
1055 * For conditional branches, target[1] contains the fall-through alternative.
1056 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1057 */
1058 uint32_t target[2];
1059 };
1060
1061 struct Pseudo_barrier_instruction : public Instruction {
1062 };
1063
1064 enum ReduceOp {
1065 iadd32, iadd64,
1066 imul32, imul64,
1067 fadd32, fadd64,
1068 fmul32, fmul64,
1069 imin32, imin64,
1070 imax32, imax64,
1071 umin32, umin64,
1072 umax32, umax64,
1073 fmin32, fmin64,
1074 fmax32, fmax64,
1075 iand32, iand64,
1076 ior32, ior64,
1077 ixor32, ixor64,
1078 gfx10_wave64_bpermute
1079 };
1080
1081 /**
1082 * Subgroup Reduction Instructions, everything except for the data to be
1083 * reduced and the result is inserted by setup_reduce_temp().
1084 * Operand(0): data to be reduced
1085 * Operand(1): reduce temporary
1086 * Operand(2): vector temporary
1087 * Definition(0): result
1088 * Definition(1): scalar temporary
1089 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1090 * Definition(3): scc clobber
1091 * Definition(4): vcc clobber
1092 *
1093 */
1094 struct Pseudo_reduction_instruction : public Instruction {
1095 ReduceOp reduce_op;
1096 unsigned cluster_size; // must be 0 for scans
1097 };
1098
1099 struct instr_deleter_functor {
1100 void operator()(void* p) {
1101 free(p);
1102 }
1103 };
1104
1105 template<typename T>
1106 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1107
1108 template<typename T>
1109 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1110 {
1111 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1112 char *data = (char*) calloc(1, size);
1113 T* inst = (T*) data;
1114
1115 inst->opcode = opcode;
1116 inst->format = format;
1117
1118 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1119 inst->operands = aco::span<Operand>(operands_offset, num_operands);
1120 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1121 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1122
1123 return inst;
1124 }
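/* Illustrative sketch of create_instruction<>: the operand and definition
 * spans live in the same calloc'd block right after the instruction struct
 * (a and b are hypothetical s1 temporaries, dst_id a fresh SSA id):
 *
 *   aco_ptr<SOP2_instruction> add{create_instruction<SOP2_instruction>(
 *         aco_opcode::s_add_u32, Format::SOP2, 2, 2)};
 *   add->operands[0] = Operand(a);
 *   add->operands[1] = Operand(b);
 *   add->definitions[0] = Definition(dst_id, s1);   // result
 *   add->definitions[1] = Definition(scc, s1);      // scc clobber
 */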
1125
1126 constexpr bool Instruction::usesModifiers() const noexcept
1127 {
1128 if (isDPP() || isSDWA())
1129 return true;
1130 if (!isVOP3())
1131 return false;
1132 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1133 for (unsigned i = 0; i < operands.size(); i++) {
1134 if (vop3->abs[i] || vop3->neg[i])
1135 return true;
1136 }
1137 return vop3->opsel || vop3->clamp || vop3->omod;
1138 }
1139
1140 constexpr bool is_phi(Instruction* instr)
1141 {
1142 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1143 }
1144
1145 static inline bool is_phi(aco_ptr<Instruction>& instr)
1146 {
1147 return is_phi(instr.get());
1148 }
1149
1150 barrier_interaction get_barrier_interaction(Instruction* instr);
1151
1152 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1153
1154 enum block_kind {
1155 /* uniform indicates that when leaving this block,
1156 * all active lanes stay active */
1157 block_kind_uniform = 1 << 0,
1158 block_kind_top_level = 1 << 1,
1159 block_kind_loop_preheader = 1 << 2,
1160 block_kind_loop_header = 1 << 3,
1161 block_kind_loop_exit = 1 << 4,
1162 block_kind_continue = 1 << 5,
1163 block_kind_break = 1 << 6,
1164 block_kind_continue_or_break = 1 << 7,
1165 block_kind_discard = 1 << 8,
1166 block_kind_branch = 1 << 9,
1167 block_kind_merge = 1 << 10,
1168 block_kind_invert = 1 << 11,
1169 block_kind_uses_discard_if = 1 << 12,
1170 block_kind_needs_lowering = 1 << 13,
1171 block_kind_uses_demote = 1 << 14,
1172 block_kind_export_end = 1 << 15,
1173 };
1174
1175
1176 struct RegisterDemand {
1177 constexpr RegisterDemand() = default;
1178 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1179 : vgpr{v}, sgpr{s} {}
1180 int16_t vgpr = 0;
1181 int16_t sgpr = 0;
1182
1183 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1184 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1185 }
1186
1187 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1188 return vgpr > other.vgpr || sgpr > other.sgpr;
1189 }
1190
1191 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1192 if (t.type() == RegType::sgpr)
1193 return RegisterDemand( vgpr, sgpr + t.size() );
1194 else
1195 return RegisterDemand( vgpr + t.size(), sgpr );
1196 }
1197
1198 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1199 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1200 }
1201
1202 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1203 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1204 }
1205
1206 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1207 vgpr += other.vgpr;
1208 sgpr += other.sgpr;
1209 return *this;
1210 }
1211
1212 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1213 vgpr -= other.vgpr;
1214 sgpr -= other.sgpr;
1215 return *this;
1216 }
1217
1218 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1219 if (t.type() == RegType::sgpr)
1220 sgpr += t.size();
1221 else
1222 vgpr += t.size();
1223 return *this;
1224 }
1225
1226 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1227 if (t.type() == RegType::sgpr)
1228 sgpr -= t.size();
1229 else
1230 vgpr -= t.size();
1231 return *this;
1232 }
1233
1234 constexpr void update(const RegisterDemand other) noexcept {
1235 vgpr = std::max(vgpr, other.vgpr);
1236 sgpr = std::max(sgpr, other.sgpr);
1237 }
1238
1239 };
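/* Illustrative sketch of RegisterDemand arithmetic (tmp_id is hypothetical):
 *
 *   RegisterDemand d(4, 8);             // 4 VGPRs, 8 SGPRs
 *   d += Temp(tmp_id, v2);              // now {6, 8}
 *   d.update(RegisterDemand(10, 2));    // component-wise max -> {10, 8}
 *   bool too_much = d.exceeds(RegisterDemand(256, 104));
 */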
1240
1241 /* CFG */
1242 struct Block {
1243 float_mode fp_mode;
1244 unsigned index;
1245 unsigned offset = 0;
1246 std::vector<aco_ptr<Instruction>> instructions;
1247 std::vector<unsigned> logical_preds;
1248 std::vector<unsigned> linear_preds;
1249 std::vector<unsigned> logical_succs;
1250 std::vector<unsigned> linear_succs;
1251 RegisterDemand register_demand = RegisterDemand();
1252 uint16_t loop_nest_depth = 0;
1253 uint16_t kind = 0;
1254 int logical_idom = -1;
1255 int linear_idom = -1;
1256 Temp live_out_exec = Temp();
1257
1258 /* this information is needed for predecessors to blocks with phis when
1259 * moving out of ssa */
1260 bool scc_live_out = false;
1261 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out is true */
1262
1263 Block(unsigned idx) : index(idx) {}
1264 Block() : index(0) {}
1265 };
1266
1267 using Stage = uint16_t;
1268
1269 /* software stages */
1270 static constexpr Stage sw_vs = 1 << 0;
1271 static constexpr Stage sw_gs = 1 << 1;
1272 static constexpr Stage sw_tcs = 1 << 2;
1273 static constexpr Stage sw_tes = 1 << 3;
1274 static constexpr Stage sw_fs = 1 << 4;
1275 static constexpr Stage sw_cs = 1 << 5;
1276 static constexpr Stage sw_gs_copy = 1 << 6;
1277 static constexpr Stage sw_mask = 0x7f;
1278
1279 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1280 static constexpr Stage hw_vs = 1 << 7;
1281 static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1282 static constexpr Stage hw_gs = 1 << 9;
1283 static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1284 static constexpr Stage hw_hs = 1 << 11;
1285 static constexpr Stage hw_fs = 1 << 12;
1286 static constexpr Stage hw_cs = 1 << 13;
1287 static constexpr Stage hw_mask = 0x7f << 7;
1288
1289 /* possible settings of Program::stage */
1290 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1291 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1292 static constexpr Stage compute_cs = sw_cs | hw_cs;
1293 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1294 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1295 /* GFX10/NGG */
1296 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1297 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1298 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1299 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1300 /* GFX9 (and GFX10 if NGG isn't used) */
1301 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1302 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1303 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1304 /* pre-GFX9 */
1305 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1306 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1307 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1308 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1309 static constexpr Stage geometry_gs = sw_gs | hw_gs;
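/* Illustrative sketch: a Stage value combines one hardware stage with the
 * software stage(s) merged into it, so both parts can be tested with masks
 * (program is a hypothetical Program*, declared below):
 *
 *   bool merged_vs_gs = (program->stage & sw_vs) && (program->stage & hw_gs);
 *   bool runs_on_gs_hw = (program->stage & hw_mask) == hw_gs;
 *   // e.g. both are true for vertex_geometry_gs
 */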
1310
1311 enum statistic {
1312 statistic_hash,
1313 statistic_instructions,
1314 statistic_copies,
1315 statistic_branches,
1316 statistic_cycles,
1317 statistic_vmem_clauses,
1318 statistic_smem_clauses,
1319 statistic_vmem_score,
1320 statistic_smem_score,
1321 statistic_sgpr_presched,
1322 statistic_vgpr_presched,
1323 num_statistics
1324 };
1325
1326 class Program final {
1327 public:
1328 float_mode next_fp_mode;
1329 std::vector<Block> blocks;
1330 RegisterDemand max_reg_demand = RegisterDemand();
1331 uint16_t num_waves = 0;
1332 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1333 ac_shader_config* config;
1334 struct radv_shader_info *info;
1335 enum chip_class chip_class;
1336 enum radeon_family family;
1337 unsigned wave_size;
1338 RegClass lane_mask;
1339 Stage stage; /* Stage */
1340 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1341 bool needs_wqm = false; /* there exists a p_wqm instruction */
1342 bool wb_smem_l1_on_end = false;
1343
1344 std::vector<uint8_t> constant_data;
1345 Temp private_segment_buffer;
1346 Temp scratch_offset;
1347
1348 uint16_t min_waves = 0;
1349 uint16_t lds_alloc_granule;
1350 uint32_t lds_limit; /* in bytes */
1351 bool has_16bank_lds;
1352 uint16_t vgpr_limit;
1353 uint16_t sgpr_limit;
1354 uint16_t physical_sgprs;
1355 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1356 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1357 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1358
1359 bool xnack_enabled = false;
1360
1361 bool needs_vcc = false;
1362 bool needs_flat_scr = false;
1363
1364 bool collect_statistics = false;
1365 uint32_t statistics[num_statistics];
1366
1367 uint32_t allocateId()
1368 {
1369 assert(allocationID <= 16777215);
1370 return allocationID++;
1371 }
1372
1373 uint32_t peekAllocationId()
1374 {
1375 return allocationID;
1376 }
1377
1378 void setAllocationId(uint32_t id)
1379 {
1380 allocationID = id;
1381 }
1382
1383 Block* create_and_insert_block() {
1384 blocks.emplace_back(blocks.size());
1385 blocks.back().fp_mode = next_fp_mode;
1386 return &blocks.back();
1387 }
1388
1389 Block* insert_block(Block&& block) {
1390 block.index = blocks.size();
1391 block.fp_mode = next_fp_mode;
1392 blocks.emplace_back(std::move(block));
1393 return &blocks.back();
1394 }
1395
1396 private:
1397 uint32_t allocationID = 1;
1398 };
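/* Illustrative sketch of typical Program usage when building the IR
 * (instr is a hypothetical aco_ptr<Instruction>):
 *
 *   Block *block = program->create_and_insert_block();   // inherits next_fp_mode
 *   Temp tmp(program->allocateId(), v1);                 // fresh SSA temporary
 *   block->instructions.emplace_back(std::move(instr));
 */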
1399
1400 struct live {
1401 /* live temps out per block */
1402 std::vector<std::set<Temp>> live_out;
1403 /* register demand (sgpr/vgpr) per instruction per block */
1404 std::vector<std::vector<RegisterDemand>> register_demand;
1405 };
1406
1407 void select_program(Program *program,
1408 unsigned shader_count,
1409 struct nir_shader *const *shaders,
1410 ac_shader_config* config,
1411 struct radv_shader_args *args);
1412 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1413 ac_shader_config* config,
1414 struct radv_shader_args *args);
1415
1416 void lower_wqm(Program* program, live& live_vars,
1417 const struct radv_nir_compiler_options *options);
1418 void lower_bool_phis(Program* program);
1419 void calc_min_waves(Program* program);
1420 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1421 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1422 std::vector<uint16_t> dead_code_analysis(Program *program);
1423 void dominator_tree(Program* program);
1424 void insert_exec_mask(Program *program);
1425 void value_numbering(Program* program);
1426 void optimize(Program* program);
1427 void setup_reduce_temp(Program* program);
1428 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1429 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1430 void ssa_elimination(Program* program);
1431 void lower_to_hw_instr(Program* program);
1432 void schedule_program(Program* program, live& live_vars);
1433 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1434 void insert_wait_states(Program* program);
1435 void insert_NOPs(Program* program);
1436 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1437 void print_asm(Program *program, std::vector<uint32_t>& binary,
1438 unsigned exec_size, std::ostream& out);
1439 void validate(Program* program, FILE *output);
1440 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1441 #ifndef NDEBUG
1442 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1443 #else
1444 #define perfwarn(program, cond, msg, ...) do {} while(0)
1445 #endif
1446
1447 void collect_presched_stats(Program *program);
1448 void collect_preasm_stats(Program *program);
1449 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1450
1451 void aco_print_instr(Instruction *instr, FILE *output);
1452 void aco_print_program(Program *program, FILE *output);
1453
1454 /* utilities for dealing with register demand */
1455 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1456 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1457 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1458
1459 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1460 uint16_t get_extra_sgprs(Program *program);
1461
1462 /* get the number of sgprs/vgprs that need to be allocated to address a number of sgprs/vgprs */
1463 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1464 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1465
1466 /* return number of addressable sgprs/vgprs for max_waves */
1467 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1468 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1469
1470 typedef struct {
1471 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1472 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1473 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1474 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1475 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1476 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1477 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1478 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1479 } Info;
1480
1481 extern const Info instr_info;
1482
1483 }
1484
1485 #endif /* ACO_IR_H */
1486