aco: guarantee that Temp fits in 4 bytes
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
/* Global bitmask of the DEBUG_* flags below, controlling extra checks/output. */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,    /* validate the IR between passes */
   DEBUG_VALIDATE_RA = 0x2, /* validate the register assignment */
   DEBUG_PERFWARN = 0x4,    /* emit performance warnings */
};
52
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats: single bits (>= 1 << 8) so they can be OR'd with a
    * base VOP1/VOP2/VOPC format, see asVOP3()/asSDWA() below. */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11, /* alias of VOP3: the A/B variants share one encoding bit */
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
104
/* Classes of memory access an instruction interacts with; used by the
 * scheduler/insert_waitcnt passes to decide what may be reordered. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* number of distinct barrier bits above. Must be 7 so that loops iterating
    * bit indices [0, barrier_count) include barrier_barrier (== 1 << 6);
    * the previous value of 6 silently skipped it. */
   barrier_count = 7,
};
120
/* Rounding modes as encoded in the hardware MODE register (2 bits each). */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero */
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE register bits must match exactly, and every
    * restriction requested by "other" must also be requested here. */
   bool canReplace(float_mode other) const noexcept {
      if (val != other.val)
         return false;
      /* each flag is an implication: other requires it -> we require it */
      return (!other.preserve_signed_zero_inf_nan32 || preserve_signed_zero_inf_nan32) &&
             (!other.preserve_signed_zero_inf_nan16_64 || preserve_signed_zero_inf_nan16_64) &&
             (!other.must_flush_denorms32 || must_flush_denorms32) &&
             (!other.must_flush_denorms16_64 || must_flush_denorms16_64) &&
             (!other.care_about_round32 || care_about_round32) &&
             (!other.care_about_round16_64 || care_about_round16_64);
   }
};
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
171
172 constexpr Format asSDWA(Format format) {
173 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
174 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
175 }
176
enum class RegType {
   none = 0,
   sgpr,        /* scalar general-purpose register */
   vgpr,        /* vector general-purpose register */
   linear_vgpr, /* vgpr with a linear live range (used for WWM/spills,
                 * see RegClass::v1_linear/v2_linear) */
};
183
struct RegClass {

   /* A register class is encoded in one byte:
    *   bits 0-4: size (in dwords, or in bytes when bit 7 is set)
    *   bit 5:    vgpr (otherwise sgpr)
    *   bit 6:    linear live range (sgprs are implicitly linear)
    *   bit 7:    sub-dword class (size field counts bytes) */
   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   /* Builds a dword-sized class from type + size-in-dwords. */
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* All sgpr classes compare <= s16; everything above has the vgpr bit set. */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* Size in bytes: the low 5 bits count bytes for sub-dword classes,
    * dwords otherwise. */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* Size in dwords, rounding sub-dword classes up. */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

private:
   RC rc;
};
235
236 /* transitional helper expressions */
237 static constexpr RegClass s1{RegClass::s1};
238 static constexpr RegClass s2{RegClass::s2};
239 static constexpr RegClass s3{RegClass::s3};
240 static constexpr RegClass s4{RegClass::s4};
241 static constexpr RegClass s8{RegClass::s8};
242 static constexpr RegClass s16{RegClass::s16};
243 static constexpr RegClass v1{RegClass::v1};
244 static constexpr RegClass v2{RegClass::v2};
245 static constexpr RegClass v3{RegClass::v3};
246 static constexpr RegClass v4{RegClass::v4};
247 static constexpr RegClass v5{RegClass::v5};
248 static constexpr RegClass v6{RegClass::v6};
249 static constexpr RegClass v7{RegClass::v7};
250 static constexpr RegClass v8{RegClass::v8};
251 static constexpr RegClass v1b{RegClass::v1b};
252 static constexpr RegClass v2b{RegClass::v2b};
253 static constexpr RegClass v3b{RegClass::v3b};
254 static constexpr RegClass v4b{RegClass::v4b};
255 static constexpr RegClass v6b{RegClass::v6b};
256 static constexpr RegClass v8b{RegClass::v8b};
257
258 /**
259 * Temp Class
260 * Each temporary virtual register has a
261 * register class (i.e. size and type)
262 * and SSA id.
263 */
264 struct Temp {
265 Temp() noexcept : id_(0), reg_class(0) {}
266 constexpr Temp(uint32_t id, RegClass cls) noexcept
267 : id_(id), reg_class(uint8_t(cls)) {}
268
269 constexpr uint32_t id() const noexcept { return id_; }
270 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
271
272 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
273 constexpr unsigned size() const noexcept { return regClass().size(); }
274 constexpr RegType type() const noexcept { return regClass().type(); }
275 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
276
277 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
278 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
279 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
280
281 private:
282 uint32_t id_: 24;
283 uint32_t reg_class : 8;
284 };
285
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition. Stored with byte granularity:
 * reg_b == register index * 4 + byte offset.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r * 4) {}
   /* dword-aligned register index */
   constexpr unsigned reg() const { return reg_b / 4; }
   /* byte offset within the register (0..3) */
   constexpr unsigned byte() const { return reg_b % 4; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }

   uint16_t reg_b = 0;
};
303
/* helper expressions for special registers (scalar source/dest encodings) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};       /* low half of the VCC pair */
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};      /* same encoding as exec_lo */
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
315
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final
{
public:
   /* Creates an undefined operand, fixed to PhysReg 128 (the encoding of
    * inline constant 0). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   /* Operand referencing the temporary 'r'. An SSA id of 0 produces an
    * undefined operand of r's register class instead. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant. Values representable as hardware inline constants are
    * fixed to the matching scalar-source encoding:
    *   128..192 -> integers 0..64
    *   193..208 -> integers -1..-16 (192 - v relies on unsigned wraparound)
    *   240..247 -> +/-0.5, +/-1.0, +/-2.0, +/-4.0
    * everything else becomes a literal (PhysReg 255), which is not valid
    * for 64-bit constants. */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant; only values with a hardware inline-constant encoding
    * are supported (64-bit literals do not exist). data_.i is set to the
    * 32-bit equivalent so constantValue() remains meaningful. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand carrying only a register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand fixed to a physical register; the id-0 temp carries the class. */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* True for temps and undefined operands; constants have no register class. */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* Fixes the operand to a physical register; reg == -1 clears the flag. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* A constant without an inline encoding (emitted as a trailing dword). */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   /* Low 32 bits of the constant (see the constructors above). */
   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Full 64-bit value: 64-bit inline constants are decoded from the fixed
    * register encoding; 32-bit constants are zero- or sign-extended. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   /* Marks the operand as the last use of its temporary; clearing the flag
    * also clears isFirstKill(). */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* setFirstKill() implies setKill(). */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   /* Structural equality: compares size, fixed-ness, kill-before-def, and
    * then the constant/undef/temp identity as appropriate. */
   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
629
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   /* Definition of a fresh temporary with SSA id 'index'. */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* Definition fixed to a physical register; the id-0 temp carries the class. */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* An id of 0 means "no temporary" (fixed-register-only definition). */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Note: the hint shares reg_ with setFixed(), so a later setFixed()
    * overwrites the hinted register (hasHint_ stays set). */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   /* Set by liveness analysis when the definition is unused afterwards. */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
739
740 class Block;
741
/* Base class of all IR instructions. Operand/definition arrays live in
 * trailing storage of the same allocation (see create_instruction()). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* per-pass scratch data — meaning depends on the pass */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Vector ALU: any of the VOP* encoding bits is set (VOP3A/VOP3B alias). */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   /* Scalar ALU: exact format match (these formats are not bit flags). */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* Buffer/image memory; FLAT/GLOBAL/SCRATCH are tested separately. */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   /* Defined after VOP3A_instruction below. */
   constexpr bool usesModifiers() const noexcept;

   /* True if any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
809
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit inline immediate (SIMM16 field) */
};

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* branch-target block index — TODO confirm value for non-branch SOPP */
};

/* The remaining scalar formats carry no extra encoding fields. */
struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
827
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   bool disable_wqm : 1; /* require an exec mask without helper invocations */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
847
/* Plain VOP1/VOP2/VOPC encodings carry no extra fields; modifiers require
 * the VOP3/DPP/SDWA variants below. */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};
856
struct VOP3A_instruction : public Instruction {
   bool abs[3];       /* per-source absolute-value modifier */
   bool neg[3];       /* per-source negate modifier */
   uint8_t opsel : 4; /* 16-bit operand half selection — GFX9+ */
   uint8_t omod : 2;  /* output modifier (*2, *4, /2) — confirm encoding against emitter */
   bool clamp : 1;    /* clamp result to [0, 1] */
};
864
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];            /* per-source absolute-value modifier */
   bool neg[2];            /* per-source negate modifier */
   uint16_t dpp_ctrl;      /* swizzle/shift control field */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;    /* behavior for reads of invalid/inactive lanes */
};
879
/* SDWA source/destination selection values: which byte(s)/word of a dword
 * an operand reads or a result writes, plus sign-extension. */
enum sdwa_sel : uint8_t {
    /* masks */
    sdwa_wordnum = 0x1,
    sdwa_bytenum = 0x3,
    sdwa_asuint = 0x7 | 0x10,
    sdwa_rasize = 0x3,

    /* flags */
    sdwa_isword = 0x4,
    sdwa_sext = 0x8,
    sdwa_isra = 0x10,

    /* specific values */
    sdwa_ubyte0 = 0,
    sdwa_ubyte1 = 1,
    sdwa_ubyte2 = 2,
    sdwa_ubyte3 = 3,
    sdwa_uword0 = sdwa_isword | 0,
    sdwa_uword1 = sdwa_isword | 1,
    sdwa_udword = 6,

    sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
    sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
    sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
    sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
    sdwa_sword0 = sdwa_uword0 | sdwa_sext,
    sdwa_sword1 = sdwa_uword1 | sdwa_sext,
    sdwa_sdword = sdwa_udword | sdwa_sext,

    /* register-allocated: the selection (byte/word position) is decided
     * during RA; only the access size is encoded here */
    sdwa_ubyte = 1 | sdwa_isra,
    sdwa_uword = 2 | sdwa_isra,
    sdwa_sbyte = sdwa_ubyte | sdwa_sext,
    sdwa_sword = sdwa_uword | sdwa_sext,
};
915
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   unsigned dst_sel:8;   /* destination selection (sdwa_sel value) */
   bool dst_preserve:1;  /* keep the unselected destination bits intact */
   bool clamp:1;
   unsigned omod:2; /* GFX9+ */

   unsigned sel[2]; /* per-source selection — presumably sdwa_sel values, confirm at uses */
   bool neg[2];
   bool abs[2];
};
936
/* VINTRP (parameter interpolation) encoding fields. */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* attribute index to interpolate */
   uint8_t component; /* component (x/y/z/w) of the attribute */
};
941
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   /* NOTE(review): DS offsets are unsigned in the ISA (16-bit offset0,
    * 8-bit offset1); the signed types here look suspicious — confirm no
    * negative values are ever stored before changing them. */
   int16_t offset0;
   int8_t offset1;
   bool gds; /* operate on GDS instead of LDS */
};
956
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
979
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
1002
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable (was mislabeled as "Force data to be un-normalized") */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
1029
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* return read-data to LDS instead of VGPRs */
   bool nv : 1;  /* non-volatile */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
1048
/* EXP encoding: writes shader outputs (color/position/parameter targets). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask;  /* which components are written */
   uint8_t dest;          /* export target — see the EXP target encoding in the ISA */
   bool compressed : 1;   /* operands contain packed 16-bit data */
   bool done : 1;         /* this is the shader's final export of this kind */
   bool valid_mask : 1;
};
1056
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;      /* scc holds a live value across this instruction — TODO confirm against lowering */
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
1061
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
1069
/* Barrier pseudo instruction; carries no extra fields beyond the format. */
struct Pseudo_barrier_instruction : public Instruction {
};
1072
/* Reduction operations, each in a 32-bit and a 64-bit variant. */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction: helper op for bpermute on GFX10 wave64 — see its lowering */
};
1089
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
1107
/* Instructions are allocated with calloc() (see create_instruction), so the
 * matching deleter releases the raw storage with free() — no destructor runs. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};

/* Owning smart pointer for calloc()-allocated instruction objects. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1116
/**
 * Allocates a zero-initialized instruction of type T together with its
 * trailing Operand and Definition arrays in one calloc'd block (released
 * with free() via instr_deleter_functor).
 *
 * The operands/definitions spans store 16-bit offsets relative to the
 * address of the span member itself, so the whole allocation must be small
 * enough for both offsets to fit in a uint16_t.
 */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   /* operands are laid out directly after T, definitions after the operands */
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size); /* NOTE(review): allocation failure is not checked */
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* byte distance from the span member to the start of its array */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1134
/* Returns true if this instruction uses any VALU modifier: DPP, SDWA, or a
 * VOP3 input modifier (abs/neg) or output modifier (opsel/clamp/omod). */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   /* only VOP3A carries per-operand abs/neg and the opsel/clamp/omod bits */
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->neg[i])
         return true;
   }
   return vop3->opsel || vop3->clamp || vop3->omod;
}
1148
1149 constexpr bool is_phi(Instruction* instr)
1150 {
1151 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1152 }
1153
1154 static inline bool is_phi(aco_ptr<Instruction>& instr)
1155 {
1156 return is_phi(instr.get());
1157 }
1158
1159 barrier_interaction get_barrier_interaction(Instruction* instr);
1160
1161 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1162
/* Bit flags describing the role of a block in the CFG; OR'd into Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1183
1184
1185 struct RegisterDemand {
1186 constexpr RegisterDemand() = default;
1187 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1188 : vgpr{v}, sgpr{s} {}
1189 int16_t vgpr = 0;
1190 int16_t sgpr = 0;
1191
1192 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1193 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1194 }
1195
1196 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1197 return vgpr > other.vgpr || sgpr > other.sgpr;
1198 }
1199
1200 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1201 if (t.type() == RegType::sgpr)
1202 return RegisterDemand( vgpr, sgpr + t.size() );
1203 else
1204 return RegisterDemand( vgpr + t.size(), sgpr );
1205 }
1206
1207 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1208 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1209 }
1210
1211 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1212 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1213 }
1214
1215 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1216 vgpr += other.vgpr;
1217 sgpr += other.sgpr;
1218 return *this;
1219 }
1220
1221 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1222 vgpr -= other.vgpr;
1223 sgpr -= other.sgpr;
1224 return *this;
1225 }
1226
1227 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1228 if (t.type() == RegType::sgpr)
1229 sgpr += t.size();
1230 else
1231 vgpr += t.size();
1232 return *this;
1233 }
1234
1235 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1236 if (t.type() == RegType::sgpr)
1237 sgpr -= t.size();
1238 else
1239 vgpr -= t.size();
1240 return *this;
1241 }
1242
1243 constexpr void update(const RegisterDemand other) noexcept {
1244 vgpr = std::max(vgpr, other.vgpr);
1245 sgpr = std::max(sgpr, other.sgpr);
1246 }
1247
1248 };
1249
/* CFG */
struct Block {
   float_mode fp_mode; /* inherited from Program::next_fp_mode on creation */
   unsigned index; /* position of this block in Program::blocks */
   unsigned offset = 0; /* code offset — NOTE(review): presumably filled during assembly, confirm unit */
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessor/successor block indices in the logical and linear CFGs */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* register demand of this block */
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   /* immediate dominators (block indices); -1 if not computed */
   int logical_idom = -1;
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1275
/* A Stage combines exactly one hw_* hardware stage with the sw_* software
 * stage(s) merged into it; see the predefined combinations below. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1320
/* Indices into Program::statistics, collected when Program::collect_statistics
 * is set (see collect_presched_stats/collect_preasm_stats/collect_postasm_stats). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1335
/* Top-level container for a shader being compiled: the CFG, target
 * information, limits and the temporary-id allocator. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to newly created/inserted blocks */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* register class of a per-lane mask (depends on wave_size) */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics]; /* indexed by the statistic enum */

   /* Hands out a fresh temporary id. Ids are limited to 24 bits so that
    * Temp (id + register class) fits in 4 bytes. */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would produce, without
    * consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Overrides the id counter. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new empty block (inheriting next_fp_mode) and returns it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Moves an existing block to the end of the program, re-indexing it. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next temporary id; starts at 1 */
};
1409
/* Result of live_var_analysis(): per-block liveness and register demand. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1416
1417 void select_program(Program *program,
1418 unsigned shader_count,
1419 struct nir_shader *const *shaders,
1420 ac_shader_config* config,
1421 struct radv_shader_args *args);
1422 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1423 ac_shader_config* config,
1424 struct radv_shader_args *args);
1425
1426 void lower_wqm(Program* program, live& live_vars,
1427 const struct radv_nir_compiler_options *options);
1428 void lower_bool_phis(Program* program);
1429 void calc_min_waves(Program* program);
1430 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1431 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1432 std::vector<uint16_t> dead_code_analysis(Program *program);
1433 void dominator_tree(Program* program);
1434 void insert_exec_mask(Program *program);
1435 void value_numbering(Program* program);
1436 void optimize(Program* program);
1437 void setup_reduce_temp(Program* program);
1438 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1439 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1440 void ssa_elimination(Program* program);
1441 void lower_to_hw_instr(Program* program);
1442 void schedule_program(Program* program, live& live_vars);
1443 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1444 void insert_wait_states(Program* program);
1445 void insert_NOPs(Program* program);
1446 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1447 void print_asm(Program *program, std::vector<uint32_t>& binary,
1448 unsigned exec_size, std::ostream& out);
1449 void validate(Program* program, FILE *output);
1450 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1451 #ifndef NDEBUG
1452 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1453 #else
1454 #define perfwarn(program, cond, msg, ...) do {} while(0)
1455 #endif
1456
1457 void collect_presched_stats(Program *program);
1458 void collect_preasm_stats(Program *program);
1459 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1460
1461 void aco_print_instr(Instruction *instr, FILE *output);
1462 void aco_print_program(Program *program, FILE *output);
1463
1464 /* utilities for dealing with register demand */
1465 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1466 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1467 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1468
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1470 uint16_t get_extra_sgprs(Program *program);
1471
1472 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1473 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1474 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1475
1476 /* return number of addressable sgprs/vgprs for max_waves */
1477 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1478 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1479
/* Static per-opcode information tables, each indexed by aco_opcode
 * (see the instr_info singleton below). */
typedef struct {
   /* hardware encodings per chip generation — NOTE(review): presumably -1
    * marks opcodes that don't exist on that generation, confirm in emitter */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)]; /* human-readable opcode name */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; /* encoding format per opcode */
} Info;
1490
1491 extern const Info instr_info;
1492
1493 }
1494
1495 #endif /* ACO_IR_H */
1496