aco: add TBA/TMA/TTMP0-11 physical registers definitions
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 #include "vulkan/radv_shader.h"
41
42 struct radv_nir_compiler_options;
43 struct radv_shader_args;
44 struct radv_shader_info;
45
46 namespace aco {
47
/* Global bitmask of enabled debug options (values below). */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE_IR = 0x1,    /* validate the IR */
   DEBUG_VALIDATE_RA = 0x2,    /* validate the result of register allocation */
   DEBUG_PERFWARN = 0x4,       /* report performance warnings */
   DEBUG_FORCE_WAITCNT = 0x8,  /* force (maximal) waitcnt insertion */
};
56
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* The VALU formats below occupy single high bits so they can be OR'd
    * with each other (see asVOP3()/asSDWA() and the note above). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B are not distinguished at this level: all three names
    * share the same encoding bit. */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
108
/* Bitmask describing which kind(s) of memory an instruction may access;
 * used by memory_sync_info for reordering/synchronization decisions. */
enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8, /* number of storage bits above; not itself a bitmask value */
};
120
/* Ordering/visibility flags for memory instructions and barriers. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage. says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   /* common combinations */
   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
148
/* Synchronization scopes, ordered from narrowest to widest. */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
156
157 struct memory_sync_info {
158 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
159 memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
160 : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
161
162 storage_class storage:8;
163 memory_semantics semantics:8;
164 sync_scope scope:8;
165
166 bool operator == (const memory_sync_info& rhs) const {
167 return storage == rhs.storage &&
168 semantics == rhs.semantics &&
169 scope == rhs.scope;
170 }
171
172 bool can_reorder() const {
173 if (semantics & semantic_acqrel)
174 return false;
175 /* Also check storage so that zero-initialized memory_sync_info can be
176 * reordered. */
177 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
178 }
179 };
180 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
181
/* FP rounding modes, matching the encoding used in the MODE register
 * (see float_mode below). */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero (truncate) */
};
188
/* FP denormal handling, matching the encoding used in the MODE register. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
195
/* Floating-point state an instruction executes under, plus flags describing
 * which FP optimizations are legal. The first byte mirrors the hardware MODE
 * register encoding exactly (do not reorder the bit-fields). */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;     /* rounding mode for 32-bit FP ops */
         fp_round round16_64:2;  /* rounding mode for 16-/64-bit FP ops */
         unsigned denorm32:2;    /* fp_denorm for 32-bit FP ops */
         unsigned denorm16_64:2; /* fp_denorm for 16-/64-bit FP ops */
      };
      struct {
         uint8_t round:4;  /* both rounding fields at once */
         uint8_t denorm:4; /* both denorm fields at once */
      };
      uint8_t val = 0; /* whole-byte view, convenient for comparison */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE byte must match exactly, and this mode
    * must be at least as strict as "other" in every preserve/flush/care flag. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
232
233 constexpr Format asVOP3(Format format) {
234 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
235 };
236
/* Combines a VOP1/VOP2/VOPC format with the SDWA encoding bit. Only those
 * three formats support sub-dword addressing. */
constexpr Format asSDWA(Format format) {
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
}
241
/* The register file a register class belongs to. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar registers */
   vgpr,        /* vector registers */
   linear_vgpr, /* vector registers with a linear live range (used for WWM and
                 * VGPR spilling; see RegClass::v*_linear) */
};
248
/* Size + type of a register, packed into one byte.
 * Encoding of RC (relied on by the accessors below):
 *   bits 0-4: size, in dwords — or in bytes if bit 7 is set
 *   bit 5:    vgpr (otherwise sgpr)
 *   bit 6:    linear live range
 *   bit 7:    sub-dword (byte-sized) class */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   /* size is in dwords here */
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* sgpr classes are exactly the values <= s16 (no high bits set) */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* size field counts bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* all sgpr classes are linear; vgpr classes only if bit 6 is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   /* Create a class from a byte count; sgprs round up to whole dwords,
    * vgprs become a sub-dword class when not dword-aligned. */
   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                             RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
309
/* transitional helper expressions: shorthands for the common register
 * classes (note: RegClass::s6 deliberately has no shorthand here) */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
331
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 * Packed into 4 bytes: 24-bit id, 8-bit register class.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   /* convenience forwards to the register class */
   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* comparisons are by SSA id only; the class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;        /* SSA id; 0 means "no temporary" */
   uint32_t reg_class : 8;  /* RegClass::RC encoding */
};
359
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * Stored with byte granularity: reg_b = register index * 4 + byte offset,
 * so sub-dword locations within a register can be addressed.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }   /* dword register index */
   constexpr unsigned byte() const { return reg_b & 0x3; } /* byte offset in the register */
   constexpr operator unsigned() const { return reg(); }
   /* equality/ordering compare at byte granularity */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* advance by a byte (not dword!) amount; may be negative */
   constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }

   uint16_t reg_b = 0;
};
378
/* helper expressions for special registers; the values are the scalar
 * operand encodings of the architected registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
/* trap handler temporaries */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
/* operand encodings for the status bits vccz/execz/scc */
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
404
405 /**
406 * Operand Class
407 * Initially, each Operand refers to either
408 * a temporary virtual register
409 * or to a constant value
410 * Temporary registers get mapped to physical register during RA
411 * Constant values are inlined into the instruction sequence.
412 */
413 class Operand final
414 {
415 public:
416 constexpr Operand()
417 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
418 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
419 isLateKill_(false) {}
420
421 explicit Operand(Temp r) noexcept
422 {
423 data_.temp = r;
424 if (r.id()) {
425 isTemp_ = true;
426 } else {
427 isUndef_ = true;
428 setFixed(PhysReg{128});
429 }
430 };
431 explicit Operand(uint8_t v) noexcept
432 {
433 /* 8-bit constants are only used for copies and copies from any 8-bit
434 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
435 * to be inline constants. */
436 data_.i = v;
437 isConstant_ = true;
438 constSize = 0;
439 setFixed(PhysReg{0u});
440 };
441 explicit Operand(uint16_t v) noexcept
442 {
443 data_.i = v;
444 isConstant_ = true;
445 constSize = 1;
446 if (v <= 64)
447 setFixed(PhysReg{128u + v});
448 else if (v >= 0xFFF0) /* [-16 .. -1] */
449 setFixed(PhysReg{192u + (0xFFFF - v)});
450 else if (v == 0x3800) /* 0.5 */
451 setFixed(PhysReg{240});
452 else if (v == 0xB800) /* -0.5 */
453 setFixed(PhysReg{241});
454 else if (v == 0x3C00) /* 1.0 */
455 setFixed(PhysReg{242});
456 else if (v == 0xBC00) /* -1.0 */
457 setFixed(PhysReg{243});
458 else if (v == 0x4000) /* 2.0 */
459 setFixed(PhysReg{244});
460 else if (v == 0xC000) /* -2.0 */
461 setFixed(PhysReg{245});
462 else if (v == 0x4400) /* 4.0 */
463 setFixed(PhysReg{246});
464 else if (v == 0xC400) /* -4.0 */
465 setFixed(PhysReg{247});
466 else if (v == 0x3118) /* 1/2 PI */
467 setFixed(PhysReg{248});
468 else /* Literal Constant */
469 setFixed(PhysReg{255});
470 };
471 explicit Operand(uint32_t v, bool is64bit = false) noexcept
472 {
473 data_.i = v;
474 isConstant_ = true;
475 constSize = is64bit ? 3 : 2;
476 if (v <= 64)
477 setFixed(PhysReg{128 + v});
478 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
479 setFixed(PhysReg{192 - v});
480 else if (v == 0x3f000000) /* 0.5 */
481 setFixed(PhysReg{240});
482 else if (v == 0xbf000000) /* -0.5 */
483 setFixed(PhysReg{241});
484 else if (v == 0x3f800000) /* 1.0 */
485 setFixed(PhysReg{242});
486 else if (v == 0xbf800000) /* -1.0 */
487 setFixed(PhysReg{243});
488 else if (v == 0x40000000) /* 2.0 */
489 setFixed(PhysReg{244});
490 else if (v == 0xc0000000) /* -2.0 */
491 setFixed(PhysReg{245});
492 else if (v == 0x40800000) /* 4.0 */
493 setFixed(PhysReg{246});
494 else if (v == 0xc0800000) /* -4.0 */
495 setFixed(PhysReg{247});
496 else { /* Literal Constant */
497 assert(!is64bit && "attempt to create a 64-bit literal constant");
498 setFixed(PhysReg{255});
499 }
500 };
501 explicit Operand(uint64_t v) noexcept
502 {
503 isConstant_ = true;
504 constSize = 3;
505 if (v <= 64) {
506 data_.i = (uint32_t) v;
507 setFixed(PhysReg{128 + (uint32_t) v});
508 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
509 data_.i = (uint32_t) v;
510 setFixed(PhysReg{192 - (uint32_t) v});
511 } else if (v == 0x3FE0000000000000) { /* 0.5 */
512 data_.i = 0x3f000000;
513 setFixed(PhysReg{240});
514 } else if (v == 0xBFE0000000000000) { /* -0.5 */
515 data_.i = 0xbf000000;
516 setFixed(PhysReg{241});
517 } else if (v == 0x3FF0000000000000) { /* 1.0 */
518 data_.i = 0x3f800000;
519 setFixed(PhysReg{242});
520 } else if (v == 0xBFF0000000000000) { /* -1.0 */
521 data_.i = 0xbf800000;
522 setFixed(PhysReg{243});
523 } else if (v == 0x4000000000000000) { /* 2.0 */
524 data_.i = 0x40000000;
525 setFixed(PhysReg{244});
526 } else if (v == 0xC000000000000000) { /* -2.0 */
527 data_.i = 0xc0000000;
528 setFixed(PhysReg{245});
529 } else if (v == 0x4010000000000000) { /* 4.0 */
530 data_.i = 0x40800000;
531 setFixed(PhysReg{246});
532 } else if (v == 0xC010000000000000) { /* -4.0 */
533 data_.i = 0xc0800000;
534 setFixed(PhysReg{247});
535 } else { /* Literal Constant: we don't know if it is a long or double.*/
536 isConstant_ = 0;
537 assert(false && "attempt to create a 64-bit literal constant");
538 }
539 };
540 explicit Operand(RegClass type) noexcept
541 {
542 isUndef_ = true;
543 data_.temp = Temp(0, type);
544 setFixed(PhysReg{128});
545 };
546 explicit Operand(PhysReg reg, RegClass type) noexcept
547 {
548 data_.temp = Temp(0, type);
549 setFixed(reg);
550 }
551
552 constexpr bool isTemp() const noexcept
553 {
554 return isTemp_;
555 }
556
557 constexpr void setTemp(Temp t) noexcept {
558 assert(!isConstant_);
559 isTemp_ = true;
560 data_.temp = t;
561 }
562
563 constexpr Temp getTemp() const noexcept
564 {
565 return data_.temp;
566 }
567
568 constexpr uint32_t tempId() const noexcept
569 {
570 return data_.temp.id();
571 }
572
573 constexpr bool hasRegClass() const noexcept
574 {
575 return isTemp() || isUndefined();
576 }
577
578 constexpr RegClass regClass() const noexcept
579 {
580 return data_.temp.regClass();
581 }
582
583 constexpr unsigned bytes() const noexcept
584 {
585 if (isConstant())
586 return 1 << constSize;
587 else
588 return data_.temp.bytes();
589 }
590
591 constexpr unsigned size() const noexcept
592 {
593 if (isConstant())
594 return constSize > 2 ? 2 : 1;
595 else
596 return data_.temp.size();
597 }
598
599 constexpr bool isFixed() const noexcept
600 {
601 return isFixed_;
602 }
603
604 constexpr PhysReg physReg() const noexcept
605 {
606 return reg_;
607 }
608
609 constexpr void setFixed(PhysReg reg) noexcept
610 {
611 isFixed_ = reg != unsigned(-1);
612 reg_ = reg;
613 }
614
615 constexpr bool isConstant() const noexcept
616 {
617 return isConstant_;
618 }
619
620 constexpr bool isLiteral() const noexcept
621 {
622 return isConstant() && reg_ == 255;
623 }
624
625 constexpr bool isUndefined() const noexcept
626 {
627 return isUndef_;
628 }
629
630 constexpr uint32_t constantValue() const noexcept
631 {
632 return data_.i;
633 }
634
635 constexpr bool constantEquals(uint32_t cmp) const noexcept
636 {
637 return isConstant() && constantValue() == cmp;
638 }
639
640 constexpr uint64_t constantValue64(bool signext=false) const noexcept
641 {
642 if (constSize == 3) {
643 if (reg_ <= 192)
644 return reg_ - 128;
645 else if (reg_ <= 208)
646 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
647
648 switch (reg_) {
649 case 240:
650 return 0x3FE0000000000000;
651 case 241:
652 return 0xBFE0000000000000;
653 case 242:
654 return 0x3FF0000000000000;
655 case 243:
656 return 0xBFF0000000000000;
657 case 244:
658 return 0x4000000000000000;
659 case 245:
660 return 0xC000000000000000;
661 case 246:
662 return 0x4010000000000000;
663 case 247:
664 return 0xC010000000000000;
665 }
666 } else if (constSize == 1) {
667 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
668 } else if (constSize == 0) {
669 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
670 }
671 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
672 }
673
674 constexpr bool isOfType(RegType type) const noexcept
675 {
676 return hasRegClass() && regClass().type() == type;
677 }
678
679 /* Indicates that the killed operand's live range intersects with the
680 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
681 * not set by liveness analysis. */
682 constexpr void setLateKill(bool flag) noexcept
683 {
684 isLateKill_ = flag;
685 }
686
687 constexpr bool isLateKill() const noexcept
688 {
689 return isLateKill_;
690 }
691
692 constexpr void setKill(bool flag) noexcept
693 {
694 isKill_ = flag;
695 if (!flag)
696 setFirstKill(false);
697 }
698
699 constexpr bool isKill() const noexcept
700 {
701 return isKill_ || isFirstKill();
702 }
703
704 constexpr void setFirstKill(bool flag) noexcept
705 {
706 isFirstKill_ = flag;
707 if (flag)
708 setKill(flag);
709 }
710
711 /* When there are multiple operands killing the same temporary,
712 * isFirstKill() is only returns true for the first one. */
713 constexpr bool isFirstKill() const noexcept
714 {
715 return isFirstKill_;
716 }
717
718 constexpr bool isKillBeforeDef() const noexcept
719 {
720 return isKill() && !isLateKill();
721 }
722
723 constexpr bool isFirstKillBeforeDef() const noexcept
724 {
725 return isFirstKill() && !isLateKill();
726 }
727
728 constexpr bool operator == (Operand other) const noexcept
729 {
730 if (other.size() != size())
731 return false;
732 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
733 return false;
734 if (isFixed() && other.isFixed() && physReg() != other.physReg())
735 return false;
736 if (isLiteral())
737 return other.isLiteral() && other.constantValue() == constantValue();
738 else if (isConstant())
739 return other.isConstant() && other.physReg() == physReg();
740 else if (isUndefined())
741 return other.isUndefined() && other.regClass() == regClass();
742 else
743 return other.isTemp() && other.getTemp() == getTemp();
744 }
745 private:
746 union {
747 uint32_t i;
748 float f;
749 Temp temp = Temp(0, s1);
750 } data_;
751 PhysReg reg_;
752 union {
753 struct {
754 uint8_t isTemp_:1;
755 uint8_t isFixed_:1;
756 uint8_t isConstant_:1;
757 uint8_t isKill_:1;
758 uint8_t isUndef_:1;
759 uint8_t isFirstKill_:1;
760 uint8_t constSize:2;
761 uint8_t isLateKill_:1;
762 };
763 /* can't initialize bit-fields in c++11, so work around using a union */
764 uint16_t control_ = 0;
765 };
766 };
767
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
                            isKill_(0), isPrecise_(0), isNUW_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition fixed to a physical register, with no temporary (id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* true if this definition writes an actual temporary (id != 0) */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Register-allocation hint. Note that the hint shares reg_ with
    * setFixed(), so fixing the definition afterwards overwrites the hint. */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   /* NOTE(review): presumably set by liveness analysis when the result is
    * dead — confirm against the liveness pass. */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

   /* "precise" results must not be changed by value-altering optimizations */
   constexpr void setPrecise(bool precise) noexcept
   {
      isPrecise_ = precise;
   }

   constexpr bool isPrecise() const noexcept
   {
      return isPrecise_;
   }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept
   {
      isNUW_ = nuw;
   }

   constexpr bool isNUW() const noexcept
   {
      return isNUW_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_; /* fixed register or allocation hint, see above */
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
         uint8_t isPrecise_:1;
         uint8_t isNUW_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
901
struct Block; /* forward declaration */

/* Base of all instructions. The *_instruction structs below extend it with
 * format-specific fields; operands/definitions live in spans referencing
 * storage allocated with the instruction. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch space for individual passes */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Vector ALU: any VOP1/VOP2/VOPC/VOP3 encoding bit, or VOP3P. Bit tests
    * (rather than equality) also match combined encodings like VOP2|SDWA. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   /* Scalar ALU formats (exact match; SALU formats are not combinable). */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* Buffer/image vector memory; FLAT/GLOBAL/SCRATCH are separate. */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   /* note: does not include SCRATCH */
   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept; /* defined out of line */

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
971
/* Scalar ALU with a 16-bit immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding; /* explicit padding to keep the size assert meaningful */
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
977
/* Scalar program control (branches, waitcnt, sendmsg, ...). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   /* NOTE(review): presumably the target block index for branch-like SOPP
    * instructions (signed, so it can be unset) — confirm with the emitter. */
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
983
/* Scalar compare; no fields beyond the common Instruction data. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
987
/* Scalar ALU, one source; no fields beyond the common Instruction data. */
struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
991
/* Scalar ALU, two sources; no fields beyond the common Instruction data. */
struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
995
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync; /* synchronization/reordering info for this access */
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint32_t padding: 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1017
/* Vector ALU, one source; no fields beyond the common Instruction data. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1021
/* Vector ALU, two sources; no fields beyond the common Instruction data. */
struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1025
/* Vector compare; no fields beyond the common Instruction data. */
struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1029
/* VOP3 encoding: adds source modifiers and output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];       /* per-source absolute-value modifier */
   bool neg[3];       /* per-source negate modifier */
   uint8_t opsel : 4; /* operand-select bits */
   uint8_t omod : 2;  /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1039
/* VOP3P (packed 16-bit) encoding: separate modifiers for the low and high
 * halves of each source. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1049
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;      /* the swizzle control value */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1066
/* Encodes an SDWA source/destination selection.
 * Layout: bits 0-1 = byte/word index, bit 2 = word-sized selection,
 * bit 3 = sign-extend, bit 4 = size is register-allocated (not yet fixed). */
enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10, /* the selection ignoring the sign-extend flag */
   sdwa_rasize = 0x3,        /* size field of register-allocated selections */

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   /* sign-extended variants of the above */
   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
1102
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];  /* src0/src1 byte/word selection (see sdwa_sel) */
   uint8_t dst_sel; /* destination byte/word selection (see sdwa_sel) */
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1; /* keep the unselected destination bits instead of padding them */
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1124
/* Parameter interpolation instructions (VINTRP). */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* index of the interpolated attribute */
   uint8_t component; /* channel of the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1131
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics of this access */
   bool gds;              /* operate on GDS instead of LDS */
   int16_t offset0;       /* instruction offset field(s); interpretation depends on the opcode */
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1149
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics of this access */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool swizzled : 1; /* buffer access uses a swizzled resource — NOTE(review): confirm exact semantics */
   uint32_t padding1 : 18;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1174
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics of this access */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 10;
   uint16_t offset; /* Unsigned byte offset - 12 bit (stored in a full uint16_t) */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1198
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics of this access */
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable (the previous comment duplicated unrm's text) */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 18;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1226
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics of this access */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* transfer data to/from LDS instead of VGPRs */
   bool nv : 1;  /* non-volatile — NOTE(review): confirm against ISA docs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1247
/* Export instructions: write position/parameter/MRT data out of the shader. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which components of the operands are written */
   uint8_t dest;         /* export target (MRT, position, parameter, ...) */
   bool compressed : 1;  /* data is 16-bit compressed */
   bool done : 1;        /* this is the last export of its kind */
   bool valid_mask : 1;
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1257
/* Generic pseudo instructions (parallelcopies etc.), lowered to real
 * hardware instructions late in compilation (see lower_to_hw_instr). */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;      /* NOTE(review): presumably set when a temporary occupies scc — confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1264
/* Branches between basic blocks, identified by block index. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1273
/* Barrier pseudo instruction: combines a memory barrier (sync) with an
 * optional execution barrier over exec_scope. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync; /* memory ordering semantics */
   sync_scope exec_scope; /* scope of the execution barrier */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1279
/* Subgroup reduction operations; the numeric suffix is the bit-size of the
 * reduced data type. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1295
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;     /* which reduction to perform */
   uint16_t cluster_size;  // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1314
/* Instructions are allocated with calloc() in create_instruction(), so they
 * must be released with free() rather than operator delete. */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* Owning pointer to an instruction, freed with free() (see above). */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1323
/**
 * Allocates an instruction of type T together with trailing storage for
 * \p num_operands Operands and \p num_definitions Definitions in a single
 * zero-initialized calloc() block (released via instr_deleter_functor).
 *
 * The operand/definition spans hold 16-bit offsets relative to the span
 * members themselves (see aco::span), pointing into the trailing storage
 * that starts at data + sizeof(T).
 */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* calloc zero-initializes every field, including padding bits */
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* offsets are measured from the address of the span member itself */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1341
1342 constexpr bool Instruction::usesModifiers() const noexcept
1343 {
1344 if (isDPP() || isSDWA())
1345 return true;
1346
1347 if (format == Format::VOP3P) {
1348 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1349 for (unsigned i = 0; i < operands.size(); i++) {
1350 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1351 return true;
1352 }
1353 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1354 } else if (isVOP3()) {
1355 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1356 for (unsigned i = 0; i < operands.size(); i++) {
1357 if (vop3->abs[i] || vop3->neg[i])
1358 return true;
1359 }
1360 return vop3->opsel || vop3->clamp || vop3->omod;
1361 }
1362 return false;
1363 }
1364
1365 constexpr bool is_phi(Instruction* instr)
1366 {
1367 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1368 }
1369
1370 static inline bool is_phi(aco_ptr<Instruction>& instr)
1371 {
1372 return is_phi(instr.get());
1373 }
1374
1375 memory_sync_info get_sync_info(const Instruction* instr);
1376
1377 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1378
1379 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1380 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1381 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1382 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1383
/* Bitmask flags stored in Block::kind, describing a block's role in the CFG. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1404
1405
1406 struct RegisterDemand {
1407 constexpr RegisterDemand() = default;
1408 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1409 : vgpr{v}, sgpr{s} {}
1410 int16_t vgpr = 0;
1411 int16_t sgpr = 0;
1412
1413 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1414 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1415 }
1416
1417 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1418 return vgpr > other.vgpr || sgpr > other.sgpr;
1419 }
1420
1421 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1422 if (t.type() == RegType::sgpr)
1423 return RegisterDemand( vgpr, sgpr + t.size() );
1424 else
1425 return RegisterDemand( vgpr + t.size(), sgpr );
1426 }
1427
1428 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1429 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1430 }
1431
1432 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1433 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1434 }
1435
1436 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1437 vgpr += other.vgpr;
1438 sgpr += other.sgpr;
1439 return *this;
1440 }
1441
1442 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1443 vgpr -= other.vgpr;
1444 sgpr -= other.sgpr;
1445 return *this;
1446 }
1447
1448 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1449 if (t.type() == RegType::sgpr)
1450 sgpr += t.size();
1451 else
1452 vgpr += t.size();
1453 return *this;
1454 }
1455
1456 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1457 if (t.type() == RegType::sgpr)
1458 sgpr -= t.size();
1459 else
1460 vgpr -= t.size();
1461 return *this;
1462 }
1463
1464 constexpr void update(const RegisterDemand other) noexcept {
1465 vgpr = std::max(vgpr, other.vgpr);
1466 sgpr = std::max(sgpr, other.sgpr);
1467 }
1468
1469 };
1470
/* CFG */
struct Block {
   float_mode fp_mode;
   unsigned index;          /* position of this block in Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;  /* predecessor block indices in the logical CFG */
   std::vector<unsigned> linear_preds;   /* predecessor block indices in the linear CFG */
   std::vector<unsigned> logical_succs;  /* successor block indices in the logical CFG */
   std::vector<unsigned> linear_succs;   /* successor block indices in the linear CFG */
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;       /* mask of block_kind flags */
   int logical_idom = -1;   /* immediate dominator in the logical CFG, -1 if not computed */
   int linear_idom = -1;    /* immediate dominator in the linear CFG, -1 if not computed */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1496
/* A Stage combines one or more software-stage bits with the hardware stage
 * the code runs on (see the Program::stage settings below). */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1541
/* Indices into Program::statistics; gathered only when
 * Program::collect_statistics is set. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1556
/* Top-level compilation unit: the CFG, target information and compilation
 * state for a single shader. */
class Program final {
public:
   float_mode next_fp_mode;          /* fp_mode assigned to newly created blocks */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask;               /* register class of exec-mask-sized values */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;  /* whether to fill statistics[] (see enum statistic) */
   uint32_t statistics[num_statistics];

   /* driver-provided callback for reporting compiler messages */
   struct {
      void (*func)(void *private_data,
                   enum radv_compiler_debug_level level,
                   const char *message);
      void *private_data;
   } debug;

   /* Hands out the next unique temporary id. */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215); /* ids must fit into 24 bits (16777215 == 2^24 - 1) */
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would produce. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new block to the CFG and returns it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Appends \p block to the CFG, re-assigning its index and fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* 0 is never handed out */
};
1639
/* Hashes a Temp by its id; ids are handed out sequentially by
 * Program::allocateId(), so the id itself makes a good hash. */
struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
/* set of SSA temporaries, e.g. live sets (see struct live) */
using TempSet = std::unordered_set<Temp, TempHash>;
1646
/* Result of liveness analysis (see live_var_analysis), indexed by block. */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1653
1654 void init();
1655
1656 void init_program(Program *program, Stage stage, struct radv_shader_info *info,
1657 enum chip_class chip_class, enum radeon_family family,
1658 ac_shader_config *config);
1659
1660 void select_program(Program *program,
1661 unsigned shader_count,
1662 struct nir_shader *const *shaders,
1663 ac_shader_config* config,
1664 struct radv_shader_args *args);
1665 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1666 ac_shader_config* config,
1667 struct radv_shader_args *args);
1668
1669 void lower_wqm(Program* program, live& live_vars,
1670 const struct radv_nir_compiler_options *options);
1671 void lower_phis(Program* program);
1672 void calc_min_waves(Program* program);
1673 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1674 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1675 std::vector<uint16_t> dead_code_analysis(Program *program);
1676 void dominator_tree(Program* program);
1677 void insert_exec_mask(Program *program);
1678 void value_numbering(Program* program);
1679 void optimize(Program* program);
1680 void setup_reduce_temp(Program* program);
1681 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1682 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1683 void ssa_elimination(Program* program);
1684 void lower_to_hw_instr(Program* program);
1685 void schedule_program(Program* program, live& live_vars);
1686 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1687 void insert_wait_states(Program* program);
1688 void insert_NOPs(Program* program);
1689 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1690 void print_asm(Program *program, std::vector<uint32_t>& binary,
1691 unsigned exec_size, std::ostream& out);
1692 bool validate_ir(Program* program);
1693 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options);
1694 #ifndef NDEBUG
1695 void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
1696 #else
1697 #define perfwarn(program, cond, msg, ...) do {} while(0)
1698 #endif
1699
1700 void collect_presched_stats(Program *program);
1701 void collect_preasm_stats(Program *program);
1702 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1703
1704 void aco_print_instr(const Instruction *instr, FILE *output);
1705 void aco_print_program(const Program *program, FILE *output);
1706
1707 void _aco_perfwarn(Program *program, const char *file, unsigned line,
1708 const char *fmt, ...);
1709 void _aco_err(Program *program, const char *file, unsigned line,
1710 const char *fmt, ...);
1711
1712 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
1713 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
1714
1715 /* utilities for dealing with register demand */
1716 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1717 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1718 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1719
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1721 uint16_t get_extra_sgprs(Program *program);
1722
1723 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1724 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1725 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1726
1727 /* return number of addressable sgprs/vgprs for max_waves */
1728 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1729 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1730
/* Static per-opcode information, each table indexed by aco_opcode.
 * A single instance is defined externally as instr_info. */
typedef struct {
   /* hardware encodings per generation — NOTE(review): presumably negative
    * when the opcode doesn't exist on that generation; confirm in the
    * generated tables */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;
1744
1745 extern const Info instr_info;
1746
1747 }
1748
1749 #endif /* ACO_IR_H */
1750