aco: add SDWA_instruction
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
47 enum {
48 DEBUG_VALIDATE = 0x1,
49 DEBUG_VALIDATE_RA = 0x2,
50 DEBUG_PERFWARN = 0x4,
51 };
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
62 enum class Format : std::uint16_t {
63 /* Pseudo Instruction Format */
64 PSEUDO = 0,
65 /* Scalar ALU & Control Formats */
66 SOP1 = 1,
67 SOP2 = 2,
68 SOPK = 3,
69 SOPP = 4,
70 SOPC = 5,
71 /* Scalar Memory Format */
72 SMEM = 6,
73 /* LDS/GDS Format */
74 DS = 8,
75 /* Vector Memory Buffer Formats */
76 MTBUF = 9,
77 MUBUF = 10,
78 /* Vector Memory Image Format */
79 MIMG = 11,
80 /* Export Format */
81 EXP = 12,
82 /* Flat Formats */
83 FLAT = 13,
84 GLOBAL = 14,
85 SCRATCH = 15,
86
87 PSEUDO_BRANCH = 16,
88 PSEUDO_BARRIER = 17,
89 PSEUDO_REDUCTION = 18,
90
91 /* Vector ALU Formats */
92 VOP1 = 1 << 8,
93 VOP2 = 1 << 9,
94 VOPC = 1 << 10,
95 VOP3 = 1 << 11,
96 VOP3A = 1 << 11,
97 VOP3B = 1 << 11,
98 VOP3P = 1 << 12,
99 /* Vector Parameter Interpolation Format */
100 VINTRP = 1 << 13,
101 DPP = 1 << 14,
102 SDWA = 1 << 15,
103 };
104
105 enum barrier_interaction : uint8_t {
106 barrier_none = 0,
107 barrier_buffer = 0x1,
108 barrier_image = 0x2,
109 barrier_atomic = 0x4,
110 barrier_shared = 0x8,
111 /* used for geometry shaders to ensure vertex data writes are before the
112 * GS_DONE s_sendmsg. */
113 barrier_gs_data = 0x10,
114 /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
115 barrier_gs_sendmsg = 0x20,
116 /* used by barriers. created by s_barrier */
117 barrier_barrier = 0x40,
118 barrier_count = 6,
119 };
120
121 enum fp_round {
122 fp_round_ne = 0,
123 fp_round_pi = 1,
124 fp_round_ni = 2,
125 fp_round_tz = 3,
126 };
127
128 enum fp_denorm {
129 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
130 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
131 fp_denorm_flush = 0x0,
132 fp_denorm_keep = 0x3,
133 };
134
135 struct float_mode {
136 /* matches encoding of the MODE register */
137 union {
138 struct {
139 fp_round round32:2;
140 fp_round round16_64:2;
141 unsigned denorm32:2;
142 unsigned denorm16_64:2;
143 };
144 uint8_t val = 0;
145 };
146 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
147 bool preserve_signed_zero_inf_nan32:1;
148 bool preserve_signed_zero_inf_nan16_64:1;
149 /* if false, optimizations which may remove denormal flushing can be done */
150 bool must_flush_denorms32:1;
151 bool must_flush_denorms16_64:1;
152 bool care_about_round32:1;
153 bool care_about_round16_64:1;
154
155 /* Returns true if instructions using the mode "other" can safely use the
156 * current one instead. */
157 bool canReplace(float_mode other) const noexcept {
158 return val == other.val &&
159 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
160 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
161 (must_flush_denorms32 || !other.must_flush_denorms32) &&
162 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
163 (care_about_round32 || !other.care_about_round32) &&
164 (care_about_round16_64 || !other.care_about_round16_64);
165 }
166 };
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
171
172 constexpr Format asSDWA(Format format) {
173 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
174 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
175 }
176
177 enum class RegType {
178 none = 0,
179 sgpr,
180 vgpr,
181 linear_vgpr,
182 };
183
184 struct RegClass {
185
186 enum RC : uint8_t {
187 s1 = 1,
188 s2 = 2,
189 s3 = 3,
190 s4 = 4,
191 s6 = 6,
192 s8 = 8,
193 s16 = 16,
194 v1 = s1 | (1 << 5),
195 v2 = s2 | (1 << 5),
196 v3 = s3 | (1 << 5),
197 v4 = s4 | (1 << 5),
198 v5 = 5 | (1 << 5),
199 v6 = 6 | (1 << 5),
200 v7 = 7 | (1 << 5),
201 v8 = 8 | (1 << 5),
202 /* these are used for WWM and spills to vgpr */
203 v1_linear = v1 | (1 << 6),
204 v2_linear = v2 | (1 << 6),
205 };
206
207 RegClass() = default;
208 constexpr RegClass(RC rc)
209 : rc(rc) {}
210 constexpr RegClass(RegType type, unsigned size)
211 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
212
213 constexpr operator RC() const { return rc; }
214 explicit operator bool() = delete;
215
216 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
217 constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
218 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
219 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
220
221 private:
222 RC rc;
223 };
224
225 /* transitional helper expressions */
226 static constexpr RegClass s1{RegClass::s1};
227 static constexpr RegClass s2{RegClass::s2};
228 static constexpr RegClass s3{RegClass::s3};
229 static constexpr RegClass s4{RegClass::s4};
230 static constexpr RegClass s8{RegClass::s8};
231 static constexpr RegClass s16{RegClass::s16};
232 static constexpr RegClass v1{RegClass::v1};
233 static constexpr RegClass v2{RegClass::v2};
234 static constexpr RegClass v3{RegClass::v3};
235 static constexpr RegClass v4{RegClass::v4};
236 static constexpr RegClass v5{RegClass::v5};
237 static constexpr RegClass v6{RegClass::v6};
238 static constexpr RegClass v7{RegClass::v7};
239 static constexpr RegClass v8{RegClass::v8};
240
241 /**
242 * Temp Class
243 * Each temporary virtual register has a
244 * register class (i.e. size and type)
245 * and SSA id.
246 */
247 struct Temp {
248 Temp() = default;
249 constexpr Temp(uint32_t id, RegClass cls) noexcept
250 : id_(id), reg_class(cls) {}
251
252 constexpr uint32_t id() const noexcept { return id_; }
253 constexpr RegClass regClass() const noexcept { return reg_class; }
254
255 constexpr unsigned size() const noexcept { return reg_class.size(); }
256 constexpr RegType type() const noexcept { return reg_class.type(); }
257 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
258
259 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
260 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
261 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
262
263 private:
264 uint32_t id_:24;
265 RegClass reg_class;
266 };
267
268 /**
269 * PhysReg
270 * Represents the physical register for each
271 * Operand and Definition.
272 */
273 struct PhysReg {
274 constexpr PhysReg() = default;
275 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
276 constexpr unsigned reg() const { return reg_b >> 2; }
277 constexpr unsigned byte() const { return reg_b & 0x3; }
278 constexpr operator unsigned() const { return reg(); }
279 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
280 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
281 constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
282
283 uint16_t reg_b = 0;
284 };
285
286 /* helper expressions for special registers */
287 static constexpr PhysReg m0{124};
288 static constexpr PhysReg vcc{106};
289 static constexpr PhysReg vcc_hi{107};
290 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
291 static constexpr PhysReg exec{126};
292 static constexpr PhysReg exec_lo{126};
293 static constexpr PhysReg exec_hi{127};
294 static constexpr PhysReg vccz{251};
295 static constexpr PhysReg execz{252};
296 static constexpr PhysReg scc{253};
297
298 /**
299 * Operand Class
300 * Initially, each Operand refers to either
301 * a temporary virtual register
302 * or to a constant value
303 * Temporary registers get mapped to physical register during RA
304 * Constant values are inlined into the instruction sequence.
305 */
306 class Operand final
307 {
308 public:
309 constexpr Operand()
310 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
311 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
312 isLateKill_(false) {}
313
314 explicit Operand(Temp r) noexcept
315 {
316 data_.temp = r;
317 if (r.id()) {
318 isTemp_ = true;
319 } else {
320 isUndef_ = true;
321 setFixed(PhysReg{128});
322 }
323 };
324 explicit Operand(uint32_t v, bool is64bit = false) noexcept
325 {
326 data_.i = v;
327 isConstant_ = true;
328 is64BitConst_ = is64bit;
329 if (v <= 64)
330 setFixed(PhysReg{128 + v});
331 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
332 setFixed(PhysReg{192 - v});
333 else if (v == 0x3f000000) /* 0.5 */
334 setFixed(PhysReg{240});
335 else if (v == 0xbf000000) /* -0.5 */
336 setFixed(PhysReg{241});
337 else if (v == 0x3f800000) /* 1.0 */
338 setFixed(PhysReg{242});
339 else if (v == 0xbf800000) /* -1.0 */
340 setFixed(PhysReg{243});
341 else if (v == 0x40000000) /* 2.0 */
342 setFixed(PhysReg{244});
343 else if (v == 0xc0000000) /* -2.0 */
344 setFixed(PhysReg{245});
345 else if (v == 0x40800000) /* 4.0 */
346 setFixed(PhysReg{246});
347 else if (v == 0xc0800000) /* -4.0 */
348 setFixed(PhysReg{247});
349 else { /* Literal Constant */
350 assert(!is64bit && "attempt to create a 64-bit literal constant");
351 setFixed(PhysReg{255});
352 }
353 };
354 explicit Operand(uint64_t v) noexcept
355 {
356 isConstant_ = true;
357 is64BitConst_ = true;
358 if (v <= 64) {
359 data_.i = (uint32_t) v;
360 setFixed(PhysReg{128 + (uint32_t) v});
361 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
362 data_.i = (uint32_t) v;
363 setFixed(PhysReg{192 - (uint32_t) v});
364 } else if (v == 0x3FE0000000000000) { /* 0.5 */
365 data_.i = 0x3f000000;
366 setFixed(PhysReg{240});
367 } else if (v == 0xBFE0000000000000) { /* -0.5 */
368 data_.i = 0xbf000000;
369 setFixed(PhysReg{241});
370 } else if (v == 0x3FF0000000000000) { /* 1.0 */
371 data_.i = 0x3f800000;
372 setFixed(PhysReg{242});
373 } else if (v == 0xBFF0000000000000) { /* -1.0 */
374 data_.i = 0xbf800000;
375 setFixed(PhysReg{243});
376 } else if (v == 0x4000000000000000) { /* 2.0 */
377 data_.i = 0x40000000;
378 setFixed(PhysReg{244});
379 } else if (v == 0xC000000000000000) { /* -2.0 */
380 data_.i = 0xc0000000;
381 setFixed(PhysReg{245});
382 } else if (v == 0x4010000000000000) { /* 4.0 */
383 data_.i = 0x40800000;
384 setFixed(PhysReg{246});
385 } else if (v == 0xC010000000000000) { /* -4.0 */
386 data_.i = 0xc0800000;
387 setFixed(PhysReg{247});
388 } else { /* Literal Constant: we don't know if it is a long or double.*/
389 isConstant_ = 0;
390 assert(false && "attempt to create a 64-bit literal constant");
391 }
392 };
393 explicit Operand(RegClass type) noexcept
394 {
395 isUndef_ = true;
396 data_.temp = Temp(0, type);
397 setFixed(PhysReg{128});
398 };
399 explicit Operand(PhysReg reg, RegClass type) noexcept
400 {
401 data_.temp = Temp(0, type);
402 setFixed(reg);
403 }
404
405 constexpr bool isTemp() const noexcept
406 {
407 return isTemp_;
408 }
409
410 constexpr void setTemp(Temp t) noexcept {
411 assert(!isConstant_);
412 isTemp_ = true;
413 data_.temp = t;
414 }
415
416 constexpr Temp getTemp() const noexcept
417 {
418 return data_.temp;
419 }
420
421 constexpr uint32_t tempId() const noexcept
422 {
423 return data_.temp.id();
424 }
425
426 constexpr bool hasRegClass() const noexcept
427 {
428 return isTemp() || isUndefined();
429 }
430
431 constexpr RegClass regClass() const noexcept
432 {
433 return data_.temp.regClass();
434 }
435
436 constexpr unsigned size() const noexcept
437 {
438 if (isConstant())
439 return is64BitConst_ ? 2 : 1;
440 else
441 return data_.temp.size();
442 }
443
444 constexpr bool isFixed() const noexcept
445 {
446 return isFixed_;
447 }
448
449 constexpr PhysReg physReg() const noexcept
450 {
451 return reg_;
452 }
453
454 constexpr void setFixed(PhysReg reg) noexcept
455 {
456 isFixed_ = reg != unsigned(-1);
457 reg_ = reg;
458 }
459
460 constexpr bool isConstant() const noexcept
461 {
462 return isConstant_;
463 }
464
465 constexpr bool isLiteral() const noexcept
466 {
467 return isConstant() && reg_ == 255;
468 }
469
470 constexpr bool isUndefined() const noexcept
471 {
472 return isUndef_;
473 }
474
475 constexpr uint32_t constantValue() const noexcept
476 {
477 return data_.i;
478 }
479
480 constexpr bool constantEquals(uint32_t cmp) const noexcept
481 {
482 return isConstant() && constantValue() == cmp;
483 }
484
485 constexpr uint64_t constantValue64(bool signext=false) const noexcept
486 {
487 if (is64BitConst_) {
488 if (reg_ <= 192)
489 return reg_ - 128;
490 else if (reg_ <= 208)
491 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
492
493 switch (reg_) {
494 case 240:
495 return 0x3FE0000000000000;
496 case 241:
497 return 0xBFE0000000000000;
498 case 242:
499 return 0x3FF0000000000000;
500 case 243:
501 return 0xBFF0000000000000;
502 case 244:
503 return 0x4000000000000000;
504 case 245:
505 return 0xC000000000000000;
506 case 246:
507 return 0x4010000000000000;
508 case 247:
509 return 0xC010000000000000;
510 }
511 }
512 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
513 }
514
515 /* Indicates that the killed operand's live range intersects with the
516 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
517 * not set by liveness analysis. */
518 constexpr void setLateKill(bool flag) noexcept
519 {
520 isLateKill_ = flag;
521 }
522
523 constexpr bool isLateKill() const noexcept
524 {
525 return isLateKill_;
526 }
527
528 constexpr void setKill(bool flag) noexcept
529 {
530 isKill_ = flag;
531 if (!flag)
532 setFirstKill(false);
533 }
534
535 constexpr bool isKill() const noexcept
536 {
537 return isKill_ || isFirstKill();
538 }
539
540 constexpr void setFirstKill(bool flag) noexcept
541 {
542 isFirstKill_ = flag;
543 if (flag)
544 setKill(flag);
545 }
546
547 /* When there are multiple operands killing the same temporary,
548 * isFirstKill() is only returns true for the first one. */
549 constexpr bool isFirstKill() const noexcept
550 {
551 return isFirstKill_;
552 }
553
554 constexpr bool isKillBeforeDef() const noexcept
555 {
556 return isKill() && !isLateKill();
557 }
558
559 constexpr bool isFirstKillBeforeDef() const noexcept
560 {
561 return isFirstKill() && !isLateKill();
562 }
563
564 constexpr bool operator == (Operand other) const noexcept
565 {
566 if (other.size() != size())
567 return false;
568 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
569 return false;
570 if (isFixed() && other.isFixed() && physReg() != other.physReg())
571 return false;
572 if (isLiteral())
573 return other.isLiteral() && other.constantValue() == constantValue();
574 else if (isConstant())
575 return other.isConstant() && other.physReg() == physReg();
576 else if (isUndefined())
577 return other.isUndefined() && other.regClass() == regClass();
578 else
579 return other.isTemp() && other.getTemp() == getTemp();
580 }
581 private:
582 union {
583 uint32_t i;
584 float f;
585 Temp temp = Temp(0, s1);
586 } data_;
587 PhysReg reg_;
588 union {
589 struct {
590 uint8_t isTemp_:1;
591 uint8_t isFixed_:1;
592 uint8_t isConstant_:1;
593 uint8_t isKill_:1;
594 uint8_t isUndef_:1;
595 uint8_t isFirstKill_:1;
596 uint8_t is64BitConst_:1;
597 uint8_t isLateKill_:1;
598 };
599 /* can't initialize bit-fields in c++11, so work around using a union */
600 uint8_t control_ = 0;
601 };
602 };
603
604 /**
605 * Definition Class
606 * Definitions are the results of Instructions
607 * and refer to temporary virtual registers
608 * which are later mapped to physical registers
609 */
610 class Definition final
611 {
612 public:
613 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
614 Definition(uint32_t index, RegClass type) noexcept
615 : temp(index, type) {}
616 explicit Definition(Temp tmp) noexcept
617 : temp(tmp) {}
618 Definition(PhysReg reg, RegClass type) noexcept
619 : temp(Temp(0, type))
620 {
621 setFixed(reg);
622 }
623 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
624 : temp(Temp(tmpId, type))
625 {
626 setFixed(reg);
627 }
628
629 constexpr bool isTemp() const noexcept
630 {
631 return tempId() > 0;
632 }
633
634 constexpr Temp getTemp() const noexcept
635 {
636 return temp;
637 }
638
639 constexpr uint32_t tempId() const noexcept
640 {
641 return temp.id();
642 }
643
644 constexpr void setTemp(Temp t) noexcept {
645 temp = t;
646 }
647
648 constexpr RegClass regClass() const noexcept
649 {
650 return temp.regClass();
651 }
652
653 constexpr unsigned size() const noexcept
654 {
655 return temp.size();
656 }
657
658 constexpr bool isFixed() const noexcept
659 {
660 return isFixed_;
661 }
662
663 constexpr PhysReg physReg() const noexcept
664 {
665 return reg_;
666 }
667
668 constexpr void setFixed(PhysReg reg) noexcept
669 {
670 isFixed_ = 1;
671 reg_ = reg;
672 }
673
674 constexpr void setHint(PhysReg reg) noexcept
675 {
676 hasHint_ = 1;
677 reg_ = reg;
678 }
679
680 constexpr bool hasHint() const noexcept
681 {
682 return hasHint_;
683 }
684
685 constexpr void setKill(bool flag) noexcept
686 {
687 isKill_ = flag;
688 }
689
690 constexpr bool isKill() const noexcept
691 {
692 return isKill_;
693 }
694
695 private:
696 Temp temp = Temp(0, s1);
697 PhysReg reg_;
698 union {
699 struct {
700 uint8_t isFixed_:1;
701 uint8_t hasHint_:1;
702 uint8_t isKill_:1;
703 };
704 /* can't initialize bit-fields in c++11, so work around using a union */
705 uint8_t control_ = 0;
706 };
707 };
708
709 class Block;
710
711 struct Instruction {
712 aco_opcode opcode;
713 Format format;
714 uint32_t pass_flags;
715
716 aco::span<Operand> operands;
717 aco::span<Definition> definitions;
718
719 constexpr bool isVALU() const noexcept
720 {
721 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
722 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
723 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
724 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
725 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
726 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
727 }
728
729 constexpr bool isSALU() const noexcept
730 {
731 return format == Format::SOP1 ||
732 format == Format::SOP2 ||
733 format == Format::SOPC ||
734 format == Format::SOPK ||
735 format == Format::SOPP;
736 }
737
738 constexpr bool isVMEM() const noexcept
739 {
740 return format == Format::MTBUF ||
741 format == Format::MUBUF ||
742 format == Format::MIMG;
743 }
744
745 constexpr bool isDPP() const noexcept
746 {
747 return (uint16_t) format & (uint16_t) Format::DPP;
748 }
749
750 constexpr bool isVOP3() const noexcept
751 {
752 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
753 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
754 format == Format::VOP3P;
755 }
756
757 constexpr bool isSDWA() const noexcept
758 {
759 return (uint16_t) format & (uint16_t) Format::SDWA;
760 }
761
762 constexpr bool isFlatOrGlobal() const noexcept
763 {
764 return format == Format::FLAT || format == Format::GLOBAL;
765 }
766
767 constexpr bool usesModifiers() const noexcept;
768
769 constexpr bool reads_exec() const noexcept
770 {
771 for (const Operand& op : operands) {
772 if (op.isFixed() && op.physReg() == exec)
773 return true;
774 }
775 return false;
776 }
777 };
778
779 struct SOPK_instruction : public Instruction {
780 uint16_t imm;
781 };
782
783 struct SOPP_instruction : public Instruction {
784 uint32_t imm;
785 int block;
786 };
787
788 struct SOPC_instruction : public Instruction {
789 };
790
791 struct SOP1_instruction : public Instruction {
792 };
793
794 struct SOP2_instruction : public Instruction {
795 };
796
797 /**
798 * Scalar Memory Format:
799 * For s_(buffer_)load_dword*:
800 * Operand(0): SBASE - SGPR-pair which provides base address
801 * Operand(1): Offset - immediate (un)signed offset or SGPR
802 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
803 * Operand(n-1): SOffset - SGPR offset (Vega only)
804 *
805 * Having no operands is also valid for instructions such as s_dcache_inv.
806 *
807 */
808 struct SMEM_instruction : public Instruction {
809 bool glc : 1; /* VI+: globally coherent */
810 bool dlc : 1; /* NAVI: device level coherent */
811 bool nv : 1; /* VEGA only: Non-volatile */
812 bool can_reorder : 1;
813 bool disable_wqm : 1;
814 barrier_interaction barrier;
815 };
816
817 struct VOP1_instruction : public Instruction {
818 };
819
820 struct VOP2_instruction : public Instruction {
821 };
822
823 struct VOPC_instruction : public Instruction {
824 };
825
826 struct VOP3A_instruction : public Instruction {
827 bool abs[3];
828 bool neg[3];
829 uint8_t opsel : 4;
830 uint8_t omod : 2;
831 bool clamp : 1;
832 };
833
834 /**
835 * Data Parallel Primitives Format:
836 * This format can be used for VOP1, VOP2 or VOPC instructions.
837 * The swizzle applies to the src0 operand.
838 *
839 */
840 struct DPP_instruction : public Instruction {
841 bool abs[2];
842 bool neg[2];
843 uint16_t dpp_ctrl;
844 uint8_t row_mask : 4;
845 uint8_t bank_mask : 4;
846 bool bound_ctrl : 1;
847 };
848
849 enum sdwa_sel : uint8_t {
850 /* masks */
851 sdwa_wordnum = 0x1,
852 sdwa_bytenum = 0x3,
853 sdwa_asuint = 0x7,
854
855 /* flags */
856 sdwa_isword = 0x4,
857 sdwa_sext = 0x8,
858
859 /* specific values */
860 sdwa_ubyte0 = 0,
861 sdwa_ubyte1 = 1,
862 sdwa_ubyte2 = 2,
863 sdwa_ubyte3 = 3,
864 sdwa_uword0 = sdwa_isword | 0,
865 sdwa_uword1 = sdwa_isword | 1,
866 sdwa_udword = 6,
867
868 sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
869 sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
870 sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
871 sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
872 sdwa_sword0 = sdwa_uword0 | sdwa_sext,
873 sdwa_sword1 = sdwa_uword1 | sdwa_sext,
874 sdwa_sdword = sdwa_udword | sdwa_sext,
875 };
876
877 /**
878 * Sub-Dword Addressing Format:
879 * This format can be used for VOP1, VOP2 or VOPC instructions.
880 *
881 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
882 * the definition doesn't have to be VCC on GFX9+.
883 *
884 */
885 struct SDWA_instruction : public Instruction {
886 /* these destination modifiers aren't available with VOPC except for
887 * clamp on GFX8 */
888 unsigned dst_sel:4;
889 bool dst_preserve:1;
890 bool clamp:1;
891 unsigned omod:2; /* GFX9+ */
892
893 unsigned sel[2];
894 bool neg[2];
895 bool abs[2];
896 };
897
898 struct Interp_instruction : public Instruction {
899 uint8_t attribute;
900 uint8_t component;
901 };
902
903 /**
904 * Local and Global Data Sharing instructions
905 * Operand(0): ADDR - VGPR which supplies the address.
906 * Operand(1): DATA0 - First data VGPR.
907 * Operand(2): DATA1 - Second data VGPR.
908 * Operand(n-1): M0 - LDS size.
909 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
910 *
911 */
912 struct DS_instruction : public Instruction {
913 int16_t offset0;
914 int8_t offset1;
915 bool gds;
916 };
917
918 /**
919 * Vector Memory Untyped-buffer Instructions
920 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
921 * Operand(1): VADDR - Address source. Can carry an index and/or offset
922 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
923 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
924 *
925 */
926 struct MUBUF_instruction : public Instruction {
927 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
928 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
929 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
930 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
931 bool glc : 1; /* globally coherent */
932 bool dlc : 1; /* NAVI: device level coherent */
933 bool slc : 1; /* system level coherent */
934 bool tfe : 1; /* texture fail enable */
935 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
936 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
937 bool can_reorder : 1;
938 barrier_interaction barrier;
939 };
940
941 /**
942 * Vector Memory Typed-buffer Instructions
943 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
944 * Operand(1): VADDR - Address source. Can carry an index and/or offset
945 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
946 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
947 *
948 */
949 struct MTBUF_instruction : public Instruction {
950 uint16_t offset; /* Unsigned byte offset - 12 bit */
951 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
952 uint8_t nfmt : 3; /* Numeric format of data in memory */
953 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
954 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
955 bool glc : 1; /* globally coherent */
956 bool dlc : 1; /* NAVI: device level coherent */
957 bool slc : 1; /* system level coherent */
958 bool tfe : 1; /* texture fail enable */
959 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
960 bool can_reorder : 1;
961 barrier_interaction barrier;
962 };
963
964 /**
965 * Vector Memory Image Instructions
966 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
967 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
968 * or VDATA - Vector GPR for write data.
969 * Operand(2): VADDR - Address source. Can carry an offset or an index.
970 * Definition(0): VDATA - Vector GPR for read result.
971 *
972 */
973 struct MIMG_instruction : public Instruction {
974 uint8_t dmask; /* Data VGPR enable mask */
975 uint8_t dim : 3; /* NAVI: dimensionality */
976 bool unrm : 1; /* Force address to be un-normalized */
977 bool dlc : 1; /* NAVI: device level coherent */
978 bool glc : 1; /* globally coherent */
979 bool slc : 1; /* system level coherent */
980 bool tfe : 1; /* texture fail enable */
981 bool da : 1; /* declare an array */
982 bool lwe : 1; /* Force data to be un-normalized */
983 bool r128 : 1; /* NAVI: Texture resource size */
984 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
985 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
986 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
987 bool can_reorder : 1;
988 barrier_interaction barrier;
989 };
990
991 /**
992 * Flat/Scratch/Global Instructions
993 * Operand(0): ADDR
994 * Operand(1): SADDR
995 * Operand(2) / Definition(0): DATA/VDST
996 *
997 */
998 struct FLAT_instruction : public Instruction {
999 uint16_t offset; /* Vega/Navi only */
1000 bool slc : 1; /* system level coherent */
1001 bool glc : 1; /* globally coherent */
1002 bool dlc : 1; /* NAVI: device level coherent */
1003 bool lds : 1;
1004 bool nv : 1;
1005 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1006 bool can_reorder : 1;
1007 barrier_interaction barrier;
1008 };
1009
1010 struct Export_instruction : public Instruction {
1011 uint8_t enabled_mask;
1012 uint8_t dest;
1013 bool compressed : 1;
1014 bool done : 1;
1015 bool valid_mask : 1;
1016 };
1017
1018 struct Pseudo_instruction : public Instruction {
1019 bool tmp_in_scc;
1020 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1021 };
1022
1023 struct Pseudo_branch_instruction : public Instruction {
1024 /* target[0] is the block index of the branch target.
1025 * For conditional branches, target[1] contains the fall-through alternative.
1026 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1027 */
1028 uint32_t target[2];
1029 };
1030
1031 struct Pseudo_barrier_instruction : public Instruction {
1032 };
1033
1034 enum ReduceOp {
1035 iadd32, iadd64,
1036 imul32, imul64,
1037 fadd32, fadd64,
1038 fmul32, fmul64,
1039 imin32, imin64,
1040 imax32, imax64,
1041 umin32, umin64,
1042 umax32, umax64,
1043 fmin32, fmin64,
1044 fmax32, fmax64,
1045 iand32, iand64,
1046 ior32, ior64,
1047 ixor32, ixor64,
1048 gfx10_wave64_bpermute
1049 };
1050
1051 /**
1052 * Subgroup Reduction Instructions, everything except for the data to be
1053 * reduced and the result as inserted by setup_reduce_temp().
1054 * Operand(0): data to be reduced
1055 * Operand(1): reduce temporary
1056 * Operand(2): vector temporary
1057 * Definition(0): result
1058 * Definition(1): scalar temporary
1059 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1060 * Definition(3): scc clobber
1061 * Definition(4): vcc clobber
1062 *
1063 */
1064 struct Pseudo_reduction_instruction : public Instruction {
1065 ReduceOp reduce_op;
1066 unsigned cluster_size; // must be 0 for scans
1067 };
1068
1069 struct instr_deleter_functor {
1070 void operator()(void* p) {
1071 free(p);
1072 }
1073 };
1074
1075 template<typename T>
1076 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1077
1078 template<typename T>
1079 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1080 {
1081 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1082 char *data = (char*) calloc(1, size);
1083 T* inst = (T*) data;
1084
1085 inst->opcode = opcode;
1086 inst->format = format;
1087
1088 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1089 inst->operands = aco::span<Operand>(operands_offset, num_operands);
1090 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1091 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1092
1093 return inst;
1094 }
1095
1096 constexpr bool Instruction::usesModifiers() const noexcept
1097 {
1098 if (isDPP() || isSDWA())
1099 return true;
1100 if (!isVOP3())
1101 return false;
1102 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1103 for (unsigned i = 0; i < operands.size(); i++) {
1104 if (vop3->abs[i] || vop3->neg[i])
1105 return true;
1106 }
1107 return vop3->opsel || vop3->clamp || vop3->omod;
1108 }
1109
1110 constexpr bool is_phi(Instruction* instr)
1111 {
1112 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1113 }
1114
1115 static inline bool is_phi(aco_ptr<Instruction>& instr)
1116 {
1117 return is_phi(instr.get());
1118 }
1119
1120 barrier_interaction get_barrier_interaction(Instruction* instr);
1121
1122 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1123
1124 enum block_kind {
1125 /* uniform indicates that leaving this block,
1126 * all actives lanes stay active */
1127 block_kind_uniform = 1 << 0,
1128 block_kind_top_level = 1 << 1,
1129 block_kind_loop_preheader = 1 << 2,
1130 block_kind_loop_header = 1 << 3,
1131 block_kind_loop_exit = 1 << 4,
1132 block_kind_continue = 1 << 5,
1133 block_kind_break = 1 << 6,
1134 block_kind_continue_or_break = 1 << 7,
1135 block_kind_discard = 1 << 8,
1136 block_kind_branch = 1 << 9,
1137 block_kind_merge = 1 << 10,
1138 block_kind_invert = 1 << 11,
1139 block_kind_uses_discard_if = 1 << 12,
1140 block_kind_needs_lowering = 1 << 13,
1141 block_kind_uses_demote = 1 << 14,
1142 block_kind_export_end = 1 << 15,
1143 };
1144
1145
1146 struct RegisterDemand {
1147 constexpr RegisterDemand() = default;
1148 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1149 : vgpr{v}, sgpr{s} {}
1150 int16_t vgpr = 0;
1151 int16_t sgpr = 0;
1152
1153 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1154 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1155 }
1156
1157 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1158 return vgpr > other.vgpr || sgpr > other.sgpr;
1159 }
1160
1161 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1162 if (t.type() == RegType::sgpr)
1163 return RegisterDemand( vgpr, sgpr + t.size() );
1164 else
1165 return RegisterDemand( vgpr + t.size(), sgpr );
1166 }
1167
1168 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1169 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1170 }
1171
1172 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1173 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1174 }
1175
1176 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1177 vgpr += other.vgpr;
1178 sgpr += other.sgpr;
1179 return *this;
1180 }
1181
1182 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1183 vgpr -= other.vgpr;
1184 sgpr -= other.sgpr;
1185 return *this;
1186 }
1187
1188 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1189 if (t.type() == RegType::sgpr)
1190 sgpr += t.size();
1191 else
1192 vgpr += t.size();
1193 return *this;
1194 }
1195
1196 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1197 if (t.type() == RegType::sgpr)
1198 sgpr -= t.size();
1199 else
1200 vgpr -= t.size();
1201 return *this;
1202 }
1203
1204 constexpr void update(const RegisterDemand other) noexcept {
1205 vgpr = std::max(vgpr, other.vgpr);
1206 sgpr = std::max(sgpr, other.sgpr);
1207 }
1208
1209 };
1210
1211 /* CFG */
1212 struct Block {
1213 float_mode fp_mode;
1214 unsigned index;
1215 unsigned offset = 0;
1216 std::vector<aco_ptr<Instruction>> instructions;
1217 std::vector<unsigned> logical_preds;
1218 std::vector<unsigned> linear_preds;
1219 std::vector<unsigned> logical_succs;
1220 std::vector<unsigned> linear_succs;
1221 RegisterDemand register_demand = RegisterDemand();
1222 uint16_t loop_nest_depth = 0;
1223 uint16_t kind = 0;
1224 int logical_idom = -1;
1225 int linear_idom = -1;
1226 Temp live_out_exec = Temp();
1227
1228 /* this information is needed for predecessors to blocks with phis when
1229 * moving out of ssa */
1230 bool scc_live_out = false;
1231 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1232
1233 Block(unsigned idx) : index(idx) {}
1234 Block() : index(0) {}
1235 };
1236
1237 using Stage = uint16_t;
1238
1239 /* software stages */
1240 static constexpr Stage sw_vs = 1 << 0;
1241 static constexpr Stage sw_gs = 1 << 1;
1242 static constexpr Stage sw_tcs = 1 << 2;
1243 static constexpr Stage sw_tes = 1 << 3;
1244 static constexpr Stage sw_fs = 1 << 4;
1245 static constexpr Stage sw_cs = 1 << 5;
1246 static constexpr Stage sw_gs_copy = 1 << 6;
1247 static constexpr Stage sw_mask = 0x7f;
1248
1249 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1250 static constexpr Stage hw_vs = 1 << 7;
1251 static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1252 static constexpr Stage hw_gs = 1 << 9;
1253 static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1254 static constexpr Stage hw_hs = 1 << 11;
1255 static constexpr Stage hw_fs = 1 << 12;
1256 static constexpr Stage hw_cs = 1 << 13;
1257 static constexpr Stage hw_mask = 0x7f << 7;
1258
1259 /* possible settings of Program::stage */
1260 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1261 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1262 static constexpr Stage compute_cs = sw_cs | hw_cs;
1263 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1264 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1265 /* GFX10/NGG */
1266 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1267 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1268 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1269 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1270 /* GFX9 (and GFX10 if NGG isn't used) */
1271 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1272 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1273 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1274 /* pre-GFX9 */
1275 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
1276 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1277 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1278 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
1279 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1280
1281 enum statistic {
1282 statistic_hash,
1283 statistic_instructions,
1284 statistic_copies,
1285 statistic_branches,
1286 statistic_cycles,
1287 statistic_vmem_clauses,
1288 statistic_smem_clauses,
1289 statistic_vmem_score,
1290 statistic_smem_score,
1291 statistic_sgpr_presched,
1292 statistic_vgpr_presched,
1293 num_statistics
1294 };
1295
1296 class Program final {
1297 public:
1298 float_mode next_fp_mode;
1299 std::vector<Block> blocks;
1300 RegisterDemand max_reg_demand = RegisterDemand();
1301 uint16_t num_waves = 0;
1302 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1303 ac_shader_config* config;
1304 struct radv_shader_info *info;
1305 enum chip_class chip_class;
1306 enum radeon_family family;
1307 unsigned wave_size;
1308 RegClass lane_mask;
1309 Stage stage; /* Stage */
1310 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1311 bool needs_wqm = false; /* there exists a p_wqm instruction */
1312 bool wb_smem_l1_on_end = false;
1313
1314 std::vector<uint8_t> constant_data;
1315 Temp private_segment_buffer;
1316 Temp scratch_offset;
1317
1318 uint16_t min_waves = 0;
1319 uint16_t lds_alloc_granule;
1320 uint32_t lds_limit; /* in bytes */
1321 bool has_16bank_lds;
1322 uint16_t vgpr_limit;
1323 uint16_t sgpr_limit;
1324 uint16_t physical_sgprs;
1325 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1326 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1327 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1328
1329 bool xnack_enabled = false;
1330
1331 bool needs_vcc = false;
1332 bool needs_flat_scr = false;
1333
1334 bool collect_statistics = false;
1335 uint32_t statistics[num_statistics];
1336
1337 uint32_t allocateId()
1338 {
1339 assert(allocationID <= 16777215);
1340 return allocationID++;
1341 }
1342
1343 uint32_t peekAllocationId()
1344 {
1345 return allocationID;
1346 }
1347
1348 void setAllocationId(uint32_t id)
1349 {
1350 allocationID = id;
1351 }
1352
1353 Block* create_and_insert_block() {
1354 blocks.emplace_back(blocks.size());
1355 blocks.back().fp_mode = next_fp_mode;
1356 return &blocks.back();
1357 }
1358
1359 Block* insert_block(Block&& block) {
1360 block.index = blocks.size();
1361 block.fp_mode = next_fp_mode;
1362 blocks.emplace_back(std::move(block));
1363 return &blocks.back();
1364 }
1365
1366 private:
1367 uint32_t allocationID = 1;
1368 };
1369
1370 struct live {
1371 /* live temps out per block */
1372 std::vector<std::set<Temp>> live_out;
1373 /* register demand (sgpr/vgpr) per instruction per block */
1374 std::vector<std::vector<RegisterDemand>> register_demand;
1375 };
1376
1377 void select_program(Program *program,
1378 unsigned shader_count,
1379 struct nir_shader *const *shaders,
1380 ac_shader_config* config,
1381 struct radv_shader_args *args);
1382 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1383 ac_shader_config* config,
1384 struct radv_shader_args *args);
1385
1386 void lower_wqm(Program* program, live& live_vars,
1387 const struct radv_nir_compiler_options *options);
1388 void lower_bool_phis(Program* program);
1389 void calc_min_waves(Program* program);
1390 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1391 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1392 std::vector<uint16_t> dead_code_analysis(Program *program);
1393 void dominator_tree(Program* program);
1394 void insert_exec_mask(Program *program);
1395 void value_numbering(Program* program);
1396 void optimize(Program* program);
1397 void setup_reduce_temp(Program* program);
1398 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1399 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1400 void ssa_elimination(Program* program);
1401 void lower_to_hw_instr(Program* program);
1402 void schedule_program(Program* program, live& live_vars);
1403 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1404 void insert_wait_states(Program* program);
1405 void insert_NOPs(Program* program);
1406 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1407 void print_asm(Program *program, std::vector<uint32_t>& binary,
1408 unsigned exec_size, std::ostream& out);
1409 void validate(Program* program, FILE *output);
1410 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1411 #ifndef NDEBUG
1412 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1413 #else
1414 #define perfwarn(program, cond, msg, ...) do {} while(0)
1415 #endif
1416
1417 void collect_presched_stats(Program *program);
1418 void collect_preasm_stats(Program *program);
1419 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1420
1421 void aco_print_instr(Instruction *instr, FILE *output);
1422 void aco_print_program(Program *program, FILE *output);
1423
1424 /* utilities for dealing with register demand */
1425 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1426 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1427 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1428
1429 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1430 uint16_t get_extra_sgprs(Program *program);
1431
1432 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1433 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1434 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1435
1436 /* return number of addressable sgprs/vgprs for max_waves */
1437 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1438 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1439
1440 typedef struct {
1441 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1442 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1443 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1444 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1445 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1446 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1447 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1448 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1449 } Info;
1450
1451 extern const Info instr_info;
1452
1453 }
1454
1455 #endif /* ACO_IR_H */
1456