aco: improve 8/16-bit constants
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
/* Bitmask of the DEBUG_* flags below. */
extern uint64_t debug_flags;

/* Flag bits for debug_flags. */
enum {
   DEBUG_VALIDATE = 0x1,    /* IR validation */
   DEBUG_VALIDATE_RA = 0x2, /* register-allocation validation */
   DEBUG_PERFWARN = 0x4,    /* performance warnings */
};
53
54 /**
55 * Representation of the instruction's microcode encoding format
56 * Note: Some Vector ALU Formats can be combined, such that:
57 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
58 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
59 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
60 *
61 * (*) The same is applicable for VOP1 and VOPC instructions.
62 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats. These occupy dedicated bits (instead of sequential
    * values) so that VOP1/VOP2/VOPC can be OR'd with VOP3/DPP/SDWA, as
    * described in the comment above this enum. */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B share the same format bit as VOP3. */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kinds of memory an instruction interacts with;
 * stored in the barrier fields of the memory instruction structs below. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* number of flag bits above, not itself a flag */
   barrier_count = 7,
};
121
/* Floating-point rounding modes; values match the MODE register encoding
 * used by float_mode below. */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero */
};
128
/* Denormal handling modes; values match the MODE register denorm fields
 * used by float_mode below. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
135
/* Per-instruction floating-point environment: the hardware MODE register
 * bits plus flags restricting which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;      /* rounding mode for 32-bit ops */
         fp_round round16_64:2;   /* rounding mode for 16/64-bit ops */
         unsigned denorm32:2;     /* fp_denorm for 32-bit ops */
         unsigned denorm16_64:2;  /* fp_denorm for 16/64-bit ops */
      };
      uint8_t val = 0;            /* all four fields as one byte */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly and every
    * restriction flag set in "other" must also be set here. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
168
169 constexpr Format asVOP3(Format format) {
170 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
171 };
172
173 constexpr Format asSDWA(Format format) {
174 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
175 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
176 }
177
/* Register file a temporary lives in. linear_vgpr marks VGPRs with linear
 * live ranges (used for WWM and spilling, per the RegClass comment below). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
184
/* Size + type of a register. Encoding of RC: bits [4:0] hold the size
 * (in dwords, or in bytes for sub-dword classes), bit 5 = vgpr,
 * bit 6 = linear, bit 7 = sub-dword. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* sgpr classes are exactly the values up to s16; everything else is vgpr */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* size field counts bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* sgprs are always linear; vgprs only if bit 6 is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   /* Class for "bytes" bytes of storage: sgprs round up to whole dwords,
    * vgprs become sub-dword classes when not dword-aligned. */
   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                             RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
245
/* transitional helper expressions: shorthand constants for the common
 * register classes (note: no s6 helper is declared here) */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
267
268 /**
269 * Temp Class
270 * Each temporary virtual register has a
271 * register class (i.e. size and type)
272 * and SSA id.
273 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   /* SSA id; 0 denotes "no temporary" (see Operand/Definition) */
   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* comparisons are on the SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;        /* SSA id packed into 24 bits */
   uint32_t reg_class : 8;  /* RegClass::RC encoding */
};
295
296 /**
297 * PhysReg
298 * Represents the physical register for each
299 * Operand and Definition.
300 */
struct PhysReg {
   constexpr PhysReg() = default;
   /* "r" is a register index; internally stored as a byte address
    * (index * 4) so sub-dword locations can be represented */
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   /* comparisons use the full byte address, so same register but different
    * byte offsets compare unequal */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* returns a copy advanced by "bytes" bytes */
   constexpr PhysReg advance(unsigned bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }

   uint16_t reg_b = 0; /* byte address: register index * 4 + byte offset */
};
314
/* helper expressions for special registers (hardware register indices) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
326
327 /**
328 * Operand Class
329 * Initially, each Operand refers to either
330 * a temporary virtual register
331 * or to a constant value
332 * Temporary registers get mapped to physical register during RA
333 * Constant values are inlined into the instruction sequence.
334 */
335 class Operand final
336 {
337 public:
338 constexpr Operand()
339 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
340 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
341 isLateKill_(false) {}
342
343 explicit Operand(Temp r) noexcept
344 {
345 data_.temp = r;
346 if (r.id()) {
347 isTemp_ = true;
348 } else {
349 isUndef_ = true;
350 setFixed(PhysReg{128});
351 }
352 };
353 explicit Operand(uint8_t v) noexcept
354 {
355 /* 8-bit constants are only used for copies and copies from any 8-bit
356 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
357 * to be inline constants. */
358 data_.i = v;
359 isConstant_ = true;
360 constSize = 0;
361 setFixed(PhysReg{0u});
362 };
363 explicit Operand(uint16_t v) noexcept
364 {
365 data_.i = v;
366 isConstant_ = true;
367 constSize = 1;
368 if (v <= 64)
369 setFixed(PhysReg{128u + v});
370 else if (v >= 0xFFF0) /* [-16 .. -1] */
371 setFixed(PhysReg{192u + (0xFFFF - v)});
372 else if (v == 0x3800) /* 0.5 */
373 setFixed(PhysReg{240});
374 else if (v == 0xB800) /* -0.5 */
375 setFixed(PhysReg{241});
376 else if (v == 0x3C00) /* 1.0 */
377 setFixed(PhysReg{242});
378 else if (v == 0xBC00) /* -1.0 */
379 setFixed(PhysReg{243});
380 else if (v == 0x4000) /* 2.0 */
381 setFixed(PhysReg{244});
382 else if (v == 0xC000) /* -2.0 */
383 setFixed(PhysReg{245});
384 else if (v == 0x4400) /* 4.0 */
385 setFixed(PhysReg{246});
386 else if (v == 0xC400) /* -4.0 */
387 setFixed(PhysReg{247});
388 else if (v == 0x3118) /* 1/2 PI */
389 setFixed(PhysReg{248});
390 else /* Literal Constant */
391 setFixed(PhysReg{255});
392 };
393 explicit Operand(uint32_t v, bool is64bit = false) noexcept
394 {
395 data_.i = v;
396 isConstant_ = true;
397 constSize = is64bit ? 3 : 2;
398 if (v <= 64)
399 setFixed(PhysReg{128 + v});
400 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
401 setFixed(PhysReg{192 - v});
402 else if (v == 0x3f000000) /* 0.5 */
403 setFixed(PhysReg{240});
404 else if (v == 0xbf000000) /* -0.5 */
405 setFixed(PhysReg{241});
406 else if (v == 0x3f800000) /* 1.0 */
407 setFixed(PhysReg{242});
408 else if (v == 0xbf800000) /* -1.0 */
409 setFixed(PhysReg{243});
410 else if (v == 0x40000000) /* 2.0 */
411 setFixed(PhysReg{244});
412 else if (v == 0xc0000000) /* -2.0 */
413 setFixed(PhysReg{245});
414 else if (v == 0x40800000) /* 4.0 */
415 setFixed(PhysReg{246});
416 else if (v == 0xc0800000) /* -4.0 */
417 setFixed(PhysReg{247});
418 else { /* Literal Constant */
419 assert(!is64bit && "attempt to create a 64-bit literal constant");
420 setFixed(PhysReg{255});
421 }
422 };
423 explicit Operand(uint64_t v) noexcept
424 {
425 isConstant_ = true;
426 constSize = 3;
427 if (v <= 64) {
428 data_.i = (uint32_t) v;
429 setFixed(PhysReg{128 + (uint32_t) v});
430 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
431 data_.i = (uint32_t) v;
432 setFixed(PhysReg{192 - (uint32_t) v});
433 } else if (v == 0x3FE0000000000000) { /* 0.5 */
434 data_.i = 0x3f000000;
435 setFixed(PhysReg{240});
436 } else if (v == 0xBFE0000000000000) { /* -0.5 */
437 data_.i = 0xbf000000;
438 setFixed(PhysReg{241});
439 } else if (v == 0x3FF0000000000000) { /* 1.0 */
440 data_.i = 0x3f800000;
441 setFixed(PhysReg{242});
442 } else if (v == 0xBFF0000000000000) { /* -1.0 */
443 data_.i = 0xbf800000;
444 setFixed(PhysReg{243});
445 } else if (v == 0x4000000000000000) { /* 2.0 */
446 data_.i = 0x40000000;
447 setFixed(PhysReg{244});
448 } else if (v == 0xC000000000000000) { /* -2.0 */
449 data_.i = 0xc0000000;
450 setFixed(PhysReg{245});
451 } else if (v == 0x4010000000000000) { /* 4.0 */
452 data_.i = 0x40800000;
453 setFixed(PhysReg{246});
454 } else if (v == 0xC010000000000000) { /* -4.0 */
455 data_.i = 0xc0800000;
456 setFixed(PhysReg{247});
457 } else { /* Literal Constant: we don't know if it is a long or double.*/
458 isConstant_ = 0;
459 assert(false && "attempt to create a 64-bit literal constant");
460 }
461 };
462 explicit Operand(RegClass type) noexcept
463 {
464 isUndef_ = true;
465 data_.temp = Temp(0, type);
466 setFixed(PhysReg{128});
467 };
468 explicit Operand(PhysReg reg, RegClass type) noexcept
469 {
470 data_.temp = Temp(0, type);
471 setFixed(reg);
472 }
473
474 constexpr bool isTemp() const noexcept
475 {
476 return isTemp_;
477 }
478
479 constexpr void setTemp(Temp t) noexcept {
480 assert(!isConstant_);
481 isTemp_ = true;
482 data_.temp = t;
483 }
484
485 constexpr Temp getTemp() const noexcept
486 {
487 return data_.temp;
488 }
489
490 constexpr uint32_t tempId() const noexcept
491 {
492 return data_.temp.id();
493 }
494
495 constexpr bool hasRegClass() const noexcept
496 {
497 return isTemp() || isUndefined();
498 }
499
500 constexpr RegClass regClass() const noexcept
501 {
502 return data_.temp.regClass();
503 }
504
505 constexpr unsigned bytes() const noexcept
506 {
507 if (isConstant())
508 return 1 << constSize;
509 else
510 return data_.temp.bytes();
511 }
512
513 constexpr unsigned size() const noexcept
514 {
515 if (isConstant())
516 return constSize > 2 ? 2 : 1;
517 else
518 return data_.temp.size();
519 }
520
521 constexpr bool isFixed() const noexcept
522 {
523 return isFixed_;
524 }
525
526 constexpr PhysReg physReg() const noexcept
527 {
528 return reg_;
529 }
530
531 constexpr void setFixed(PhysReg reg) noexcept
532 {
533 isFixed_ = reg != unsigned(-1);
534 reg_ = reg;
535 }
536
537 constexpr bool isConstant() const noexcept
538 {
539 return isConstant_;
540 }
541
542 constexpr bool isLiteral() const noexcept
543 {
544 return isConstant() && reg_ == 255;
545 }
546
547 constexpr bool isUndefined() const noexcept
548 {
549 return isUndef_;
550 }
551
552 constexpr uint32_t constantValue() const noexcept
553 {
554 return data_.i;
555 }
556
557 constexpr bool constantEquals(uint32_t cmp) const noexcept
558 {
559 return isConstant() && constantValue() == cmp;
560 }
561
562 constexpr uint64_t constantValue64(bool signext=false) const noexcept
563 {
564 if (constSize == 3) {
565 if (reg_ <= 192)
566 return reg_ - 128;
567 else if (reg_ <= 208)
568 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
569
570 switch (reg_) {
571 case 240:
572 return 0x3FE0000000000000;
573 case 241:
574 return 0xBFE0000000000000;
575 case 242:
576 return 0x3FF0000000000000;
577 case 243:
578 return 0xBFF0000000000000;
579 case 244:
580 return 0x4000000000000000;
581 case 245:
582 return 0xC000000000000000;
583 case 246:
584 return 0x4010000000000000;
585 case 247:
586 return 0xC010000000000000;
587 }
588 } else if (constSize == 1) {
589 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
590 } else if (constSize == 0) {
591 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
592 }
593 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
594 }
595
596 constexpr bool isOfType(RegType type) const noexcept
597 {
598 return hasRegClass() && regClass().type() == type;
599 }
600
601 /* Indicates that the killed operand's live range intersects with the
602 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
603 * not set by liveness analysis. */
604 constexpr void setLateKill(bool flag) noexcept
605 {
606 isLateKill_ = flag;
607 }
608
609 constexpr bool isLateKill() const noexcept
610 {
611 return isLateKill_;
612 }
613
614 constexpr void setKill(bool flag) noexcept
615 {
616 isKill_ = flag;
617 if (!flag)
618 setFirstKill(false);
619 }
620
621 constexpr bool isKill() const noexcept
622 {
623 return isKill_ || isFirstKill();
624 }
625
626 constexpr void setFirstKill(bool flag) noexcept
627 {
628 isFirstKill_ = flag;
629 if (flag)
630 setKill(flag);
631 }
632
633 /* When there are multiple operands killing the same temporary,
634 * isFirstKill() is only returns true for the first one. */
635 constexpr bool isFirstKill() const noexcept
636 {
637 return isFirstKill_;
638 }
639
640 constexpr bool isKillBeforeDef() const noexcept
641 {
642 return isKill() && !isLateKill();
643 }
644
645 constexpr bool isFirstKillBeforeDef() const noexcept
646 {
647 return isFirstKill() && !isLateKill();
648 }
649
650 constexpr bool operator == (Operand other) const noexcept
651 {
652 if (other.size() != size())
653 return false;
654 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
655 return false;
656 if (isFixed() && other.isFixed() && physReg() != other.physReg())
657 return false;
658 if (isLiteral())
659 return other.isLiteral() && other.constantValue() == constantValue();
660 else if (isConstant())
661 return other.isConstant() && other.physReg() == physReg();
662 else if (isUndefined())
663 return other.isUndefined() && other.regClass() == regClass();
664 else
665 return other.isTemp() && other.getTemp() == getTemp();
666 }
667 private:
668 union {
669 uint32_t i;
670 float f;
671 Temp temp = Temp(0, s1);
672 } data_;
673 PhysReg reg_;
674 union {
675 struct {
676 uint8_t isTemp_:1;
677 uint8_t isFixed_:1;
678 uint8_t isConstant_:1;
679 uint8_t isKill_:1;
680 uint8_t isUndef_:1;
681 uint8_t isFirstKill_:1;
682 uint8_t constSize:2;
683 uint8_t isLateKill_:1;
684 };
685 /* can't initialize bit-fields in c++11, so work around using a union */
686 uint16_t control_ = 0;
687 };
688 };
689
690 /**
691 * Definition Class
692 * Definitions are the results of Instructions
693 * and refer to temporary virtual registers
694 * which are later mapped to physical registers
695 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition pre-assigned to a physical register (temp id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* a non-zero temp id means this defines an actual temporary */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* a hint stores a preferred register without marking the definition as
    * fixed (note: it reuses reg_, so a later setFixed overwrites it) */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

   /* precise definitions restrict which value-changing optimizations are
    * allowed on the producing instruction */
   constexpr void setPrecise(bool precise) noexcept
   {
      isPrecise_ = precise;
   }

   constexpr bool isPrecise() const noexcept
   {
      return isPrecise_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
         uint8_t isPrecise_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
810
811 struct Block;
812
/* Base of all IR instructions; the format-specific structs below extend it.
 * The is*() helpers classify an instruction by testing Format bits. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch space for individual passes */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Vector ALU: any VOP1/VOP2/VOPC/VOP3/VOP3P bit (including combined
    * DPP/SDWA/VOP3 encodings, since those keep the base format bit set). */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory, not FLAT/GLOBAL/SCRATCH */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
880
/* Scalar ALU with a 16-bit immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
886
/* Scalar program control with an immediate. */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* presumably the target block index for branches — verify at callers */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
892
/* Scalar compare; no fields beyond the base Instruction. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
896
/* Scalar ALU, one input; no fields beyond the base Instruction. */
struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
900
/* Scalar ALU, two inputs; no fields beyond the base Instruction. */
struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
904
905 /**
906 * Scalar Memory Format:
907 * For s_(buffer_)load_dword*:
908 * Operand(0): SBASE - SGPR-pair which provides base address
909 * Operand(1): Offset - immediate (un)signed offset or SGPR
910 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
911 * Operand(n-1): SOffset - SGPR offset (Vega only)
912 *
913 * Having no operands is also valid for instructions such as s_dcache_inv.
914 *
915 */
/* Scalar memory access; operand layout is documented in the comment above. */
struct SMEM_instruction : public Instruction {
   barrier_interaction barrier; /* memory kinds this access interacts with */
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1;
   bool disable_wqm : 1;
   uint32_t padding: 19;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
926
/* Vector ALU, one input; no fields beyond the base Instruction. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
930
/* Vector ALU, two inputs; no fields beyond the base Instruction. */
struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
934
/* Vector compare; no fields beyond the base Instruction. */
struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
938
/* VOP3A encoding: input/output modifiers for up to three sources. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];        /* per-source absolute value */
   bool neg[3];        /* per-source negation */
   uint8_t opsel : 4;  /* 16-bit operand selection bits */
   uint8_t omod : 2;   /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
948
/* VOP3P (packed 16-bit) encoding: per-half modifiers for up to three sources. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];       /* negate low halves */
   bool neg_hi[3];       /* negate high halves */
   uint8_t opsel_lo : 3; /* half-selection for the low result */
   uint8_t opsel_hi : 3; /* half-selection for the high result */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
958
959 /**
960 * Data Parallel Primitives Format:
961 * This format can be used for VOP1, VOP2 or VOPC instructions.
962 * The swizzle applies to the src0 operand.
963 *
964 */
/* DPP encoding (see comment above): the swizzle applies to src0. */
struct DPP_instruction : public Instruction {
   bool abs[2];         /* per-source absolute value */
   bool neg[2];         /* per-source negation */
   uint16_t dpp_ctrl;   /* hardware DPP control (swizzle) value */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
975
/* SDWA source/destination selection encoding. The low bits select the
 * byte/word, sdwa_isword/sdwa_sext/sdwa_isra are flag bits (see masks). */
enum sdwa_sel : uint8_t {
    /* masks */
    sdwa_wordnum = 0x1,
    sdwa_bytenum = 0x3,
    sdwa_asuint = 0x7 | 0x10,
    sdwa_rasize = 0x3,

    /* flags */
    sdwa_isword = 0x4,
    sdwa_sext = 0x8,
    sdwa_isra = 0x10,

    /* specific values */
    sdwa_ubyte0 = 0,
    sdwa_ubyte1 = 1,
    sdwa_ubyte2 = 2,
    sdwa_ubyte3 = 3,
    sdwa_uword0 = sdwa_isword | 0,
    sdwa_uword1 = sdwa_isword | 1,
    sdwa_udword = 6,

    sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
    sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
    sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
    sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
    sdwa_sword0 = sdwa_uword0 | sdwa_sext,
    sdwa_sword1 = sdwa_uword1 | sdwa_sext,
    sdwa_sdword = sdwa_udword | sdwa_sext,

    /* register-allocated */
    sdwa_ubyte = 1 | sdwa_isra,
    sdwa_uword = 2 | sdwa_isra,
    sdwa_sbyte = sdwa_ubyte | sdwa_sext,
    sdwa_sword = sdwa_uword | sdwa_sext,
};
1011
1012 /**
1013 * Sub-Dword Addressing Format:
1014 * This format can be used for VOP1, VOP2 or VOPC instructions.
1015 *
1016 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1017 * the definition doesn't have to be VCC on GFX9+.
1018 *
1019 */
/* SDWA encoding (see comment above); sel values use the sdwa_sel enum. */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];   /* sub-dword selection per source (sdwa_sel) */
   uint8_t dst_sel;  /* sub-dword selection for the destination (sdwa_sel) */
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1033
/* Vector parameter interpolation (VINTRP). */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* attribute index to interpolate */
   uint8_t component; /* component within the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1040
1041 /**
1042 * Local and Global Data Sharing instructions
1043 * Operand(0): ADDR - VGPR which supplies the address.
1044 * Operand(1): DATA0 - First data VGPR.
1045 * Operand(2): DATA1 - Second data VGPR.
1046 * Operand(n-1): M0 - LDS size.
1047 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1048 *
1049 */
/* LDS/GDS access; operand layout is documented in the comment above. */
struct DS_instruction : public Instruction {
   int16_t offset0;
   int8_t offset1;
   bool gds; /* access GDS instead of LDS */
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1056
1057 /**
1058 * Vector Memory Untyped-buffer Instructions
1059 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1060 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1061 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1062 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1063 *
1064 */
/* Untyped buffer access; operand layout is documented in the comment above. */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 2;
   barrier_interaction barrier; /* memory kinds this access interacts with */
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1081
1082 /**
1083 * Vector Memory Typed-buffer Instructions
1084 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1085 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1086 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1087 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1088 *
1089 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit (stored widened to 16 bits) */
   barrier_interaction barrier;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably lets the scheduler reorder this access — confirm against scheduler */
   uint32_t padding : 25; /* pads to 8 extra bytes, matching the static_assert below */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1106
1107 /**
1108 * Vector Memory Image Instructions
1109 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1110 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1111 * or VDATA - Vector GPR for write data.
1112 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1113 * Definition(0): VDATA - Vector GPR for read result.
1114 *
1115 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable (per the ISA docs; original comment duplicated unrm's) */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably lets the scheduler reorder this access — confirm against scheduler */
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1135
1136 /**
1137 * Flat/Scratch/Global Instructions
1138 * Operand(0): ADDR
1139 * Operand(1): SADDR
1140 * Operand(2) / Definition(0): DATA/VDST
1141 *
1142 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* hardware LDS bit: return read-data to LDS instead of VGPRs */
   bool nv : 1;  /* hardware NV (non-volatile) bit of the FLAT encoding */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably lets the scheduler reorder this access — confirm against scheduler */
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1156
/* EXP instruction: writes VGPR data to an export target (MRT/Z/position/parameter). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* EN field: which components of the operands are valid */
   uint8_t dest;         /* TGT field: export target */
   bool compressed : 1;  /* COMPR: operands hold 16-bit packed data */
   bool done : 1;        /* DONE: last export of this type for the wave */
   bool valid_mask : 1;  /* VM: exec mask communicates pixel validity */
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1166
/* Compiler-internal pseudo instruction, lowered before encoding. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;      /* presumably: a temporary occupies SCC during lowering — confirm in lower_to_hw_instr */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1173
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1182
/* Barrier pseudo instruction: carries no extra fields beyond the base Instruction. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1186
/* Reduction operations for Pseudo_reduction_instruction.
 * Naming: <operation><element width in bits>, e.g. iadd16 = 16-bit integer add. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1202
1203 /**
1204 * Subgroup Reduction Instructions, everything except for the data to be
1205 * reduced and the result as inserted by setup_reduce_temp().
1206 * Operand(0): data to be reduced
1207 * Operand(1): reduce temporary
1208 * Operand(2): vector temporary
1209 * Definition(0): result
1210 * Definition(1): scalar temporary
1211 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1212 * Definition(3): scc clobber
1213 * Definition(4): vcc clobber
1214 *
1215 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;     /* which operation/width to reduce with */
   uint16_t cluster_size;  // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1221
/* Deleter used by aco_ptr: instructions are plain calloc()'d buffers
 * (see create_instruction), so releasing them is a single free(). */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};
1227
/* Owning pointer for instructions; frees with free() instead of delete
 * because instructions are allocated with calloc() in create_instruction(). */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1230
/* Allocates an instruction of type T together with its operand and definition
 * arrays in one contiguous, zero-initialized buffer:
 *   [ T | Operand * num_operands | Definition * num_definitions ]
 * The result must be released with free() (instr_deleter_functor / aco_ptr).
 * NOTE(review): the calloc() result is not checked — allocation is assumed to succeed. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span is constructed from a 16-bit offset relative to the span member's
    * own address (hence the subtraction of &inst->operands / &inst->definitions),
    * pointing into the trailing arrays of the same allocation. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1248
1249 constexpr bool Instruction::usesModifiers() const noexcept
1250 {
1251 if (isDPP() || isSDWA())
1252 return true;
1253
1254 if (format == Format::VOP3P) {
1255 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1256 for (unsigned i = 0; i < operands.size(); i++) {
1257 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1258 return true;
1259 }
1260 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1261 } else if (isVOP3()) {
1262 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1263 for (unsigned i = 0; i < operands.size(); i++) {
1264 if (vop3->abs[i] || vop3->neg[i])
1265 return true;
1266 }
1267 return vop3->opsel || vop3->clamp || vop3->omod;
1268 }
1269 return false;
1270 }
1271
1272 constexpr bool is_phi(Instruction* instr)
1273 {
1274 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1275 }
1276
1277 static inline bool is_phi(aco_ptr<Instruction>& instr)
1278 {
1279 return is_phi(instr.get());
1280 }
1281
1282 barrier_interaction get_barrier_interaction(const Instruction* instr);
1283 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1284
1285 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1286 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1287 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1288 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1289
/* Bit flags describing a basic block's role in the CFG; stored in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1310
1311
1312 struct RegisterDemand {
1313 constexpr RegisterDemand() = default;
1314 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1315 : vgpr{v}, sgpr{s} {}
1316 int16_t vgpr = 0;
1317 int16_t sgpr = 0;
1318
1319 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1320 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1321 }
1322
1323 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1324 return vgpr > other.vgpr || sgpr > other.sgpr;
1325 }
1326
1327 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1328 if (t.type() == RegType::sgpr)
1329 return RegisterDemand( vgpr, sgpr + t.size() );
1330 else
1331 return RegisterDemand( vgpr + t.size(), sgpr );
1332 }
1333
1334 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1335 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1336 }
1337
1338 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1339 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1340 }
1341
1342 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1343 vgpr += other.vgpr;
1344 sgpr += other.sgpr;
1345 return *this;
1346 }
1347
1348 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1349 vgpr -= other.vgpr;
1350 sgpr -= other.sgpr;
1351 return *this;
1352 }
1353
1354 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1355 if (t.type() == RegType::sgpr)
1356 sgpr += t.size();
1357 else
1358 vgpr += t.size();
1359 return *this;
1360 }
1361
1362 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1363 if (t.type() == RegType::sgpr)
1364 sgpr -= t.size();
1365 else
1366 vgpr -= t.size();
1367 return *this;
1368 }
1369
1370 constexpr void update(const RegisterDemand other) noexcept {
1371 vgpr = std::max(vgpr, other.vgpr);
1372 sgpr = std::max(sgpr, other.sgpr);
1373 }
1374
1375 };
1376
1377 /* CFG */
/* CFG */
struct Block {
   float_mode fp_mode;          /* floating-point mode in effect in this block */
   unsigned index;              /* position in Program::blocks (see insert_block) */
   unsigned offset = 0;         /* presumably the byte offset in the emitted binary — confirm in emit_program */
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;  /* predecessor block indices in the logical CFG */
   std::vector<unsigned> linear_preds;   /* predecessor block indices in the linear CFG */
   std::vector<unsigned> logical_succs;  /* successor block indices in the logical CFG */
   std::vector<unsigned> linear_succs;   /* successor block indices in the linear CFG */
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;           /* bitmask of block_kind flags */
   int logical_idom = -1;       /* immediate dominator in the logical CFG, -1 if none */
   int linear_idom = -1;        /* immediate dominator in the linear CFG, -1 if none */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1402
/* A Stage is a bitmask combining the software (API) shader types compiled into
 * a program with the single hardware stage they execute as. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1447
/* Indices into Program::statistics (filled when Program::collect_statistics is set). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1462
/* Top-level compilation unit: the CFG, target description, resource limits and
 * compile-time state for one shader program. */
class Program final {
public:
   float_mode next_fp_mode;   /* fp_mode assigned to newly created blocks */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   /* Returns a fresh SSA id. Ids are limited to 24 bits (see the assert). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would hand out, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Resets the id counter, e.g. after renumbering temporaries. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new empty block (index = current block count) inheriting next_fp_mode. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Moves an existing block to the end of the block list, re-indexing it. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next SSA id; 0 is never handed out */
};
1538
/* Hash functor for Temp: SSA ids are unique, so the id itself serves as the hash. */
struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
using TempSet = std::unordered_set<Temp, TempHash>;
1545
/* Result of live-variable analysis (see live_var_analysis). */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1552
1553 void select_program(Program *program,
1554 unsigned shader_count,
1555 struct nir_shader *const *shaders,
1556 ac_shader_config* config,
1557 struct radv_shader_args *args);
1558 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1559 ac_shader_config* config,
1560 struct radv_shader_args *args);
1561
1562 void lower_wqm(Program* program, live& live_vars,
1563 const struct radv_nir_compiler_options *options);
1564 void lower_phis(Program* program);
1565 void calc_min_waves(Program* program);
1566 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1567 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1568 std::vector<uint16_t> dead_code_analysis(Program *program);
1569 void dominator_tree(Program* program);
1570 void insert_exec_mask(Program *program);
1571 void value_numbering(Program* program);
1572 void optimize(Program* program);
1573 void setup_reduce_temp(Program* program);
1574 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1575 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1576 void ssa_elimination(Program* program);
1577 void lower_to_hw_instr(Program* program);
1578 void schedule_program(Program* program, live& live_vars);
1579 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1580 void insert_wait_states(Program* program);
1581 void insert_NOPs(Program* program);
1582 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1583 void print_asm(Program *program, std::vector<uint32_t>& binary,
1584 unsigned exec_size, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
/* NOTE(review): the release-build macro takes a leading `program` argument that
 * the debug-build function above does not — confirm call sites pass the same
 * argument list in both configurations. */
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif
1592
1593 void collect_presched_stats(Program *program);
1594 void collect_preasm_stats(Program *program);
1595 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1596
1597 void aco_print_instr(const Instruction *instr, FILE *output);
1598 void aco_print_program(const Program *program, FILE *output);
1599
1600 /* utilities for dealing with register demand */
1601 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1602 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1603 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1604
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1606 uint16_t get_extra_sgprs(Program *program);
1607
1608 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1609 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1610 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1611
1612 /* return number of addressable sgprs/vgprs for max_waves */
1613 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1614 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1615
/* Per-opcode tables, indexed by aco_opcode; the single instance is instr_info. */
typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];   /* hw opcode on GFX7, or a sentinel if unsupported */
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];   /* hw opcode on GFX9 */
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];  /* hw opcode on GFX10 */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;
1629
1630 extern const Info instr_info;
1631
1632 }
1633
1634 #endif /* ACO_IR_H */
1635