aco: add ACO_DEBUG=novn,noopt,nosched for debugging purposes
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 #include "vulkan/radv_shader.h"
41
42 struct radv_nir_compiler_options;
43 struct radv_shader_args;
44 struct radv_shader_info;
45
46 namespace aco {
47
48 extern uint64_t debug_flags;
49
50 enum {
51 DEBUG_VALIDATE_IR = 0x1,
52 DEBUG_VALIDATE_RA = 0x2,
53 DEBUG_PERFWARN = 0x4,
54 DEBUG_FORCE_WAITCNT = 0x8,
55 DEBUG_NO_VN = 0x10,
56 DEBUG_NO_OPT = 0x20,
57 DEBUG_NO_SCHED = 0x40,
58 };
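
/* Illustrative sketch (the actual checks live in the compilation pipeline,
 * not in this header): the new flags let individual passes be skipped, e.g.
 *
 *   if (!(debug_flags & DEBUG_NO_VN))
 *      value_numbering(program);
 *   if (!(debug_flags & DEBUG_NO_OPT))
 *      optimize(program);
 *   if (!(debug_flags & DEBUG_NO_SCHED))
 *      schedule_program(program, live_vars);
 */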
59
60 /**
61 * Representation of the instruction's microcode encoding format
62 * Note: Some Vector ALU Formats can be combined, such that:
63 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
64 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
65 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
66 *
67 * (*) The same is applicable for VOP1 and VOPC instructions.
68 */
69 enum class Format : std::uint16_t {
70 /* Pseudo Instruction Format */
71 PSEUDO = 0,
72 /* Scalar ALU & Control Formats */
73 SOP1 = 1,
74 SOP2 = 2,
75 SOPK = 3,
76 SOPP = 4,
77 SOPC = 5,
78 /* Scalar Memory Format */
79 SMEM = 6,
80 /* LDS/GDS Format */
81 DS = 8,
82 /* Vector Memory Buffer Formats */
83 MTBUF = 9,
84 MUBUF = 10,
85 /* Vector Memory Image Format */
86 MIMG = 11,
87 /* Export Format */
88 EXP = 12,
89 /* Flat Formats */
90 FLAT = 13,
91 GLOBAL = 14,
92 SCRATCH = 15,
93
94 PSEUDO_BRANCH = 16,
95 PSEUDO_BARRIER = 17,
96 PSEUDO_REDUCTION = 18,
97
98 /* Vector ALU Formats */
99 VOP3P = 19,
100 VOP1 = 1 << 8,
101 VOP2 = 1 << 9,
102 VOPC = 1 << 10,
103 VOP3 = 1 << 11,
104 VOP3A = 1 << 11,
105 VOP3B = 1 << 11,
106 /* Vector Parameter Interpolation Format */
107 VINTRP = 1 << 12,
108 DPP = 1 << 13,
109 SDWA = 1 << 14,
110 };
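
/* For example (illustrative), promoting a VOP2 instruction to the VOP3A
 * encoding just ORs the format bits (see asVOP3() below):
 *
 *   Format f = asVOP3(Format::VOP2);
 *   // (uint16_t)f == ((uint16_t)Format::VOP2 | (uint16_t)Format::VOP3)
 */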
111
112 enum storage_class : uint8_t {
113 storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
114 storage_buffer = 0x1, /* SSBOs and global memory */
115 storage_atomic_counter = 0x2, /* not used for Vulkan */
116 storage_image = 0x4,
117 storage_shared = 0x8, /* or TCS output */
118 storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
119 storage_scratch = 0x20,
120 storage_vgpr_spill = 0x40,
121 storage_count = 8,
122 };
123
124 enum memory_semantics : uint8_t {
125 semantic_none = 0x0,
126 /* for loads: don't move any access after this load to before this load (even other loads)
127 * for barriers: don't move any access after the barrier to before any
128 * atomics/control_barriers/sendmsg_gs_done before the barrier */
129 semantic_acquire = 0x1,
130 /* for stores: don't move any access before this store to after this store
131 * for barriers: don't move any access before the barrier to after any
132 * atomics/control_barriers/sendmsg_gs_done after the barrier */
133 semantic_release = 0x2,
134
135 /* the rest are for load/stores/atomics only */
136 /* cannot be DCE'd or CSE'd */
137 semantic_volatile = 0x4,
138 /* does not interact with barriers and assumes this lane is the only lane
139 * accessing this memory */
140 semantic_private = 0x8,
141 /* this operation can be reordered around operations of the same storage; says nothing about barriers */
142 semantic_can_reorder = 0x10,
143 /* this is an atomic instruction (may only read or write memory) */
144 semantic_atomic = 0x20,
145 /* this instruction both reads and writes memory */
146 semantic_rmw = 0x40,
147
148 semantic_acqrel = semantic_acquire | semantic_release,
149 semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
150 };
151
152 enum sync_scope : uint8_t {
153 scope_invocation = 0,
154 scope_subgroup = 1,
155 scope_workgroup = 2,
156 scope_queuefamily = 3,
157 scope_device = 4,
158 };
159
160 struct memory_sync_info {
161 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
162 memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
163 : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
164
165 storage_class storage:8;
166 memory_semantics semantics:8;
167 sync_scope scope:8;
168
169 bool operator == (const memory_sync_info& rhs) const {
170 return storage == rhs.storage &&
171 semantics == rhs.semantics &&
172 scope == rhs.scope;
173 }
174
175 bool can_reorder() const {
176 if (semantics & semantic_acqrel)
177 return false;
178 /* Also check storage so that zero-initialized memory_sync_info can be
179 * reordered. */
180 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
181 }
182 };
183 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
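
/* Example (illustrative): a volatile buffer access is described as
 *
 *   memory_sync_info sync(storage_buffer, semantic_volatile);
 *   // sync.can_reorder() == false
 *
 * while a zero-initialized memory_sync_info() can always be reordered. */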
184
185 enum fp_round {
186 fp_round_ne = 0,
187 fp_round_pi = 1,
188 fp_round_ni = 2,
189 fp_round_tz = 3,
190 };
191
192 enum fp_denorm {
193 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
194 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
195 fp_denorm_flush = 0x0,
196 fp_denorm_keep = 0x3,
197 };
198
199 struct float_mode {
200 /* matches encoding of the MODE register */
201 union {
202 struct {
203 fp_round round32:2;
204 fp_round round16_64:2;
205 unsigned denorm32:2;
206 unsigned denorm16_64:2;
207 };
208 struct {
209 uint8_t round:4;
210 uint8_t denorm:4;
211 };
212 uint8_t val = 0;
213 };
214 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
215 bool preserve_signed_zero_inf_nan32:1;
216 bool preserve_signed_zero_inf_nan16_64:1;
217 /* if false, optimizations which may remove denormal flushing can be done */
218 bool must_flush_denorms32:1;
219 bool must_flush_denorms16_64:1;
220 bool care_about_round32:1;
221 bool care_about_round16_64:1;
222
223 /* Returns true if instructions using the mode "other" can safely use the
224 * current one instead. */
225 bool canReplace(float_mode other) const noexcept {
226 return val == other.val &&
227 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
228 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
229 (must_flush_denorms32 || !other.must_flush_denorms32) &&
230 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
231 (care_about_round32 || !other.care_about_round32) &&
232 (care_about_round16_64 || !other.care_about_round16_64);
233 }
234 };
235
236 constexpr Format asVOP3(Format format) {
237 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
238 };
239
240 constexpr Format asSDWA(Format format) {
241 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
242 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
243 }
244
245 enum class RegType {
246 none = 0,
247 sgpr,
248 vgpr,
249 linear_vgpr,
250 };
251
252 struct RegClass {
253
254 enum RC : uint8_t {
255 s1 = 1,
256 s2 = 2,
257 s3 = 3,
258 s4 = 4,
259 s6 = 6,
260 s8 = 8,
261 s16 = 16,
262 v1 = s1 | (1 << 5),
263 v2 = s2 | (1 << 5),
264 v3 = s3 | (1 << 5),
265 v4 = s4 | (1 << 5),
266 v5 = 5 | (1 << 5),
267 v6 = 6 | (1 << 5),
268 v7 = 7 | (1 << 5),
269 v8 = 8 | (1 << 5),
270 /* byte-sized register class */
271 v1b = v1 | (1 << 7),
272 v2b = v2 | (1 << 7),
273 v3b = v3 | (1 << 7),
274 v4b = v4 | (1 << 7),
275 v6b = v6 | (1 << 7),
276 v8b = v8 | (1 << 7),
277 /* these are used for WWM and spills to vgpr */
278 v1_linear = v1 | (1 << 6),
279 v2_linear = v2 | (1 << 6),
280 };
281
282 RegClass() = default;
283 constexpr RegClass(RC rc)
284 : rc(rc) {}
285 constexpr RegClass(RegType type, unsigned size)
286 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
287
288 constexpr operator RC() const { return rc; }
289 explicit operator bool() = delete;
290
291 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
292 constexpr bool is_subdword() const { return rc & (1 << 7); }
293 constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
294 //TODO: use size() less in favor of bytes()
295 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
296 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
297 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
298 constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
299
300 static constexpr RegClass get(RegType type, unsigned bytes) {
301 if (type == RegType::sgpr) {
302 return RegClass(type, DIV_ROUND_UP(bytes, 4u));
303 } else {
304 return bytes % 4u ? RegClass(type, bytes).as_subdword() :
305 RegClass(type, bytes / 4u);
306 }
307 }
308
309 private:
310 RC rc;
311 };
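
/* Examples (illustrative): RegClass::get() rounds SGPR sizes up to whole
 * dwords but uses sub-dword classes for unaligned VGPR sizes:
 *
 *   RegClass::get(RegType::sgpr, 6) == s2  // 6 bytes -> 2 dwords
 *   RegClass::get(RegType::vgpr, 6) == v6b // 6-byte sub-dword class
 *   RegClass::get(RegType::vgpr, 8) == v2  // multiple of 4 -> dword class
 */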
312
313 /* transitional helper expressions */
314 static constexpr RegClass s1{RegClass::s1};
315 static constexpr RegClass s2{RegClass::s2};
316 static constexpr RegClass s3{RegClass::s3};
317 static constexpr RegClass s4{RegClass::s4};
318 static constexpr RegClass s8{RegClass::s8};
319 static constexpr RegClass s16{RegClass::s16};
320 static constexpr RegClass v1{RegClass::v1};
321 static constexpr RegClass v2{RegClass::v2};
322 static constexpr RegClass v3{RegClass::v3};
323 static constexpr RegClass v4{RegClass::v4};
324 static constexpr RegClass v5{RegClass::v5};
325 static constexpr RegClass v6{RegClass::v6};
326 static constexpr RegClass v7{RegClass::v7};
327 static constexpr RegClass v8{RegClass::v8};
328 static constexpr RegClass v1b{RegClass::v1b};
329 static constexpr RegClass v2b{RegClass::v2b};
330 static constexpr RegClass v3b{RegClass::v3b};
331 static constexpr RegClass v4b{RegClass::v4b};
332 static constexpr RegClass v6b{RegClass::v6b};
333 static constexpr RegClass v8b{RegClass::v8b};
334
335 /**
336 * Temp Class
337 * Each temporary virtual register has a
338 * register class (i.e. size and type)
339 * and SSA id.
340 */
341 struct Temp {
342 Temp() noexcept : id_(0), reg_class(0) {}
343 constexpr Temp(uint32_t id, RegClass cls) noexcept
344 : id_(id), reg_class(uint8_t(cls)) {}
345
346 constexpr uint32_t id() const noexcept { return id_; }
347 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
348
349 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
350 constexpr unsigned size() const noexcept { return regClass().size(); }
351 constexpr RegType type() const noexcept { return regClass().type(); }
352 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
353
354 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
355 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
356 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
357
358 private:
359 uint32_t id_: 24;
360 uint32_t reg_class : 8;
361 };
362
363 /**
364 * PhysReg
365 * Represents the physical register for each
366 * Operand and Definition.
367 */
368 struct PhysReg {
369 constexpr PhysReg() = default;
370 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
371 constexpr unsigned reg() const { return reg_b >> 2; }
372 constexpr unsigned byte() const { return reg_b & 0x3; }
373 constexpr operator unsigned() const { return reg(); }
374 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
375 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
376 constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
377 constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }
378
379 uint16_t reg_b = 0;
380 };
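
/* Illustrative: PhysReg is byte-granular (reg_b = register * 4 + byte).
 * VGPRs start at register 256 in this encoding, so:
 *
 *   PhysReg r{256};           // v0
 *   PhysReg b = r.advance(1); // b.reg() == 256, b.byte() == 1
 */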
381
382 /* helper expressions for special registers */
383 static constexpr PhysReg m0{124};
384 static constexpr PhysReg vcc{106};
385 static constexpr PhysReg vcc_hi{107};
386 static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
387 static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
388 static constexpr PhysReg ttmp0{112};
389 static constexpr PhysReg ttmp1{113};
390 static constexpr PhysReg ttmp2{114};
391 static constexpr PhysReg ttmp3{115};
392 static constexpr PhysReg ttmp4{116};
393 static constexpr PhysReg ttmp5{117};
394 static constexpr PhysReg ttmp6{118};
395 static constexpr PhysReg ttmp7{119};
396 static constexpr PhysReg ttmp8{120};
397 static constexpr PhysReg ttmp9{121};
398 static constexpr PhysReg ttmp10{122};
399 static constexpr PhysReg ttmp11{123};
400 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
401 static constexpr PhysReg exec{126};
402 static constexpr PhysReg exec_lo{126};
403 static constexpr PhysReg exec_hi{127};
404 static constexpr PhysReg vccz{251};
405 static constexpr PhysReg execz{252};
406 static constexpr PhysReg scc{253};
407
408 /**
409 * Operand Class
410 * Initially, each Operand refers to either
411 * a temporary virtual register
412 * or to a constant value
413 * Temporary registers get mapped to physical register during RA
414 * Constant values are inlined into the instruction sequence.
415 */
416 class Operand final
417 {
418 public:
419 constexpr Operand()
420 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
421 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
422 isLateKill_(false) {}
423
424 explicit Operand(Temp r) noexcept
425 {
426 data_.temp = r;
427 if (r.id()) {
428 isTemp_ = true;
429 } else {
430 isUndef_ = true;
431 setFixed(PhysReg{128});
432 }
433 };
434 explicit Operand(uint8_t v) noexcept
435 {
436 /* 8-bit constants are only used for copies, and a copy from any 8-bit
437 * constant can be implemented with an SDWA v_mul_u32_u24. So consider them
438 * all to be inline constants. */
439 data_.i = v;
440 isConstant_ = true;
441 constSize = 0;
442 setFixed(PhysReg{0u});
443 };
444 explicit Operand(uint16_t v) noexcept
445 {
446 data_.i = v;
447 isConstant_ = true;
448 constSize = 1;
449 if (v <= 64)
450 setFixed(PhysReg{128u + v});
451 else if (v >= 0xFFF0) /* [-16 .. -1] */
452 setFixed(PhysReg{193u + (0xFFFF - v)});
453 else if (v == 0x3800) /* 0.5 */
454 setFixed(PhysReg{240});
455 else if (v == 0xB800) /* -0.5 */
456 setFixed(PhysReg{241});
457 else if (v == 0x3C00) /* 1.0 */
458 setFixed(PhysReg{242});
459 else if (v == 0xBC00) /* -1.0 */
460 setFixed(PhysReg{243});
461 else if (v == 0x4000) /* 2.0 */
462 setFixed(PhysReg{244});
463 else if (v == 0xC000) /* -2.0 */
464 setFixed(PhysReg{245});
465 else if (v == 0x4400) /* 4.0 */
466 setFixed(PhysReg{246});
467 else if (v == 0xC400) /* -4.0 */
468 setFixed(PhysReg{247});
469 else if (v == 0x3118) /* 1/2 PI */
470 setFixed(PhysReg{248});
471 else /* Literal Constant */
472 setFixed(PhysReg{255});
473 };
474 explicit Operand(uint32_t v, bool is64bit = false) noexcept
475 {
476 data_.i = v;
477 isConstant_ = true;
478 constSize = is64bit ? 3 : 2;
479 if (v <= 64)
480 setFixed(PhysReg{128 + v});
481 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
482 setFixed(PhysReg{192 - v});
483 else if (v == 0x3f000000) /* 0.5 */
484 setFixed(PhysReg{240});
485 else if (v == 0xbf000000) /* -0.5 */
486 setFixed(PhysReg{241});
487 else if (v == 0x3f800000) /* 1.0 */
488 setFixed(PhysReg{242});
489 else if (v == 0xbf800000) /* -1.0 */
490 setFixed(PhysReg{243});
491 else if (v == 0x40000000) /* 2.0 */
492 setFixed(PhysReg{244});
493 else if (v == 0xc0000000) /* -2.0 */
494 setFixed(PhysReg{245});
495 else if (v == 0x40800000) /* 4.0 */
496 setFixed(PhysReg{246});
497 else if (v == 0xc0800000) /* -4.0 */
498 setFixed(PhysReg{247});
499 else { /* Literal Constant */
500 assert(!is64bit && "attempt to create a 64-bit literal constant");
501 setFixed(PhysReg{255});
502 }
503 };
504 explicit Operand(uint64_t v) noexcept
505 {
506 isConstant_ = true;
507 constSize = 3;
508 if (v <= 64) {
509 data_.i = (uint32_t) v;
510 setFixed(PhysReg{128 + (uint32_t) v});
511 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
512 data_.i = (uint32_t) v;
513 setFixed(PhysReg{192 - (uint32_t) v});
514 } else if (v == 0x3FE0000000000000) { /* 0.5 */
515 data_.i = 0x3f000000;
516 setFixed(PhysReg{240});
517 } else if (v == 0xBFE0000000000000) { /* -0.5 */
518 data_.i = 0xbf000000;
519 setFixed(PhysReg{241});
520 } else if (v == 0x3FF0000000000000) { /* 1.0 */
521 data_.i = 0x3f800000;
522 setFixed(PhysReg{242});
523 } else if (v == 0xBFF0000000000000) { /* -1.0 */
524 data_.i = 0xbf800000;
525 setFixed(PhysReg{243});
526 } else if (v == 0x4000000000000000) { /* 2.0 */
527 data_.i = 0x40000000;
528 setFixed(PhysReg{244});
529 } else if (v == 0xC000000000000000) { /* -2.0 */
530 data_.i = 0xc0000000;
531 setFixed(PhysReg{245});
532 } else if (v == 0x4010000000000000) { /* 4.0 */
533 data_.i = 0x40800000;
534 setFixed(PhysReg{246});
535 } else if (v == 0xC010000000000000) { /* -4.0 */
536 data_.i = 0xc0800000;
537 setFixed(PhysReg{247});
538 } else { /* Literal Constant: we don't know if it is a long or double.*/
539 isConstant_ = 0;
540 assert(false && "attempt to create a 64-bit literal constant");
541 }
542 };
543 explicit Operand(RegClass type) noexcept
544 {
545 isUndef_ = true;
546 data_.temp = Temp(0, type);
547 setFixed(PhysReg{128});
548 };
549 explicit Operand(PhysReg reg, RegClass type) noexcept
550 {
551 data_.temp = Temp(0, type);
552 setFixed(reg);
553 }
554
555 constexpr bool isTemp() const noexcept
556 {
557 return isTemp_;
558 }
559
560 constexpr void setTemp(Temp t) noexcept {
561 assert(!isConstant_);
562 isTemp_ = true;
563 data_.temp = t;
564 }
565
566 constexpr Temp getTemp() const noexcept
567 {
568 return data_.temp;
569 }
570
571 constexpr uint32_t tempId() const noexcept
572 {
573 return data_.temp.id();
574 }
575
576 constexpr bool hasRegClass() const noexcept
577 {
578 return isTemp() || isUndefined();
579 }
580
581 constexpr RegClass regClass() const noexcept
582 {
583 return data_.temp.regClass();
584 }
585
586 constexpr unsigned bytes() const noexcept
587 {
588 if (isConstant())
589 return 1 << constSize;
590 else
591 return data_.temp.bytes();
592 }
593
594 constexpr unsigned size() const noexcept
595 {
596 if (isConstant())
597 return constSize > 2 ? 2 : 1;
598 else
599 return data_.temp.size();
600 }
601
602 constexpr bool isFixed() const noexcept
603 {
604 return isFixed_;
605 }
606
607 constexpr PhysReg physReg() const noexcept
608 {
609 return reg_;
610 }
611
612 constexpr void setFixed(PhysReg reg) noexcept
613 {
614 isFixed_ = reg != unsigned(-1);
615 reg_ = reg;
616 }
617
618 constexpr bool isConstant() const noexcept
619 {
620 return isConstant_;
621 }
622
623 constexpr bool isLiteral() const noexcept
624 {
625 return isConstant() && reg_ == 255;
626 }
627
628 constexpr bool isUndefined() const noexcept
629 {
630 return isUndef_;
631 }
632
633 constexpr uint32_t constantValue() const noexcept
634 {
635 return data_.i;
636 }
637
638 constexpr bool constantEquals(uint32_t cmp) const noexcept
639 {
640 return isConstant() && constantValue() == cmp;
641 }
642
643 constexpr uint64_t constantValue64(bool signext=false) const noexcept
644 {
645 if (constSize == 3) {
646 if (reg_ <= 192)
647 return reg_ - 128;
648 else if (reg_ <= 208)
649 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
650
651 switch (reg_) {
652 case 240:
653 return 0x3FE0000000000000;
654 case 241:
655 return 0xBFE0000000000000;
656 case 242:
657 return 0x3FF0000000000000;
658 case 243:
659 return 0xBFF0000000000000;
660 case 244:
661 return 0x4000000000000000;
662 case 245:
663 return 0xC000000000000000;
664 case 246:
665 return 0x4010000000000000;
666 case 247:
667 return 0xC010000000000000;
668 }
669 } else if (constSize == 1) {
670 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
671 } else if (constSize == 0) {
672 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
673 }
674 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
675 }
676
677 constexpr bool isOfType(RegType type) const noexcept
678 {
679 return hasRegClass() && regClass().type() == type;
680 }
681
682 /* Indicates that the killed operand's live range intersects with the
683 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
684 * not set by liveness analysis. */
685 constexpr void setLateKill(bool flag) noexcept
686 {
687 isLateKill_ = flag;
688 }
689
690 constexpr bool isLateKill() const noexcept
691 {
692 return isLateKill_;
693 }
694
695 constexpr void setKill(bool flag) noexcept
696 {
697 isKill_ = flag;
698 if (!flag)
699 setFirstKill(false);
700 }
701
702 constexpr bool isKill() const noexcept
703 {
704 return isKill_ || isFirstKill();
705 }
706
707 constexpr void setFirstKill(bool flag) noexcept
708 {
709 isFirstKill_ = flag;
710 if (flag)
711 setKill(flag);
712 }
713
714 /* When there are multiple operands killing the same temporary,
715 * isFirstKill() only returns true for the first one. */
716 constexpr bool isFirstKill() const noexcept
717 {
718 return isFirstKill_;
719 }
720
721 constexpr bool isKillBeforeDef() const noexcept
722 {
723 return isKill() && !isLateKill();
724 }
725
726 constexpr bool isFirstKillBeforeDef() const noexcept
727 {
728 return isFirstKill() && !isLateKill();
729 }
730
731 constexpr bool operator == (Operand other) const noexcept
732 {
733 if (other.size() != size())
734 return false;
735 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
736 return false;
737 if (isFixed() && other.isFixed() && physReg() != other.physReg())
738 return false;
739 if (isLiteral())
740 return other.isLiteral() && other.constantValue() == constantValue();
741 else if (isConstant())
742 return other.isConstant() && other.physReg() == physReg();
743 else if (isUndefined())
744 return other.isUndefined() && other.regClass() == regClass();
745 else
746 return other.isTemp() && other.getTemp() == getTemp();
747 }
748 private:
749 union {
750 uint32_t i;
751 float f;
752 Temp temp = Temp(0, s1);
753 } data_;
754 PhysReg reg_;
755 union {
756 struct {
757 uint8_t isTemp_:1;
758 uint8_t isFixed_:1;
759 uint8_t isConstant_:1;
760 uint8_t isKill_:1;
761 uint8_t isUndef_:1;
762 uint8_t isFirstKill_:1;
763 uint8_t constSize:2;
764 uint8_t isLateKill_:1;
765 };
766 /* can't initialize bit-fields in c++11, so work around using a union */
767 uint16_t control_ = 0;
768 };
769 };
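
/* Examples (illustrative): constants get fixed to their inline-constant
 * source encoding where one exists, otherwise to 255 (literal):
 *
 *   Operand((uint32_t)16).physReg()         == PhysReg{144} // 128 + 16
 *   Operand((uint32_t)0x3f800000).physReg() == PhysReg{242} // 1.0f
 *   Operand((uint32_t)1234).isLiteral()     == true         // encoding 255
 */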
770
771 /**
772 * Definition Class
773 * Definitions are the results of Instructions
774 * and refer to temporary virtual registers
775 * which are later mapped to physical registers
776 */
777 class Definition final
778 {
779 public:
780 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
781 isKill_(0), isPrecise_(0), isNUW_(0) {}
782 Definition(uint32_t index, RegClass type) noexcept
783 : temp(index, type) {}
784 explicit Definition(Temp tmp) noexcept
785 : temp(tmp) {}
786 Definition(PhysReg reg, RegClass type) noexcept
787 : temp(Temp(0, type))
788 {
789 setFixed(reg);
790 }
791 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
792 : temp(Temp(tmpId, type))
793 {
794 setFixed(reg);
795 }
796
797 constexpr bool isTemp() const noexcept
798 {
799 return tempId() > 0;
800 }
801
802 constexpr Temp getTemp() const noexcept
803 {
804 return temp;
805 }
806
807 constexpr uint32_t tempId() const noexcept
808 {
809 return temp.id();
810 }
811
812 constexpr void setTemp(Temp t) noexcept {
813 temp = t;
814 }
815
816 constexpr RegClass regClass() const noexcept
817 {
818 return temp.regClass();
819 }
820
821 constexpr unsigned bytes() const noexcept
822 {
823 return temp.bytes();
824 }
825
826 constexpr unsigned size() const noexcept
827 {
828 return temp.size();
829 }
830
831 constexpr bool isFixed() const noexcept
832 {
833 return isFixed_;
834 }
835
836 constexpr PhysReg physReg() const noexcept
837 {
838 return reg_;
839 }
840
841 constexpr void setFixed(PhysReg reg) noexcept
842 {
843 isFixed_ = 1;
844 reg_ = reg;
845 }
846
847 constexpr void setHint(PhysReg reg) noexcept
848 {
849 hasHint_ = 1;
850 reg_ = reg;
851 }
852
853 constexpr bool hasHint() const noexcept
854 {
855 return hasHint_;
856 }
857
858 constexpr void setKill(bool flag) noexcept
859 {
860 isKill_ = flag;
861 }
862
863 constexpr bool isKill() const noexcept
864 {
865 return isKill_;
866 }
867
868 constexpr void setPrecise(bool precise) noexcept
869 {
870 isPrecise_ = precise;
871 }
872
873 constexpr bool isPrecise() const noexcept
874 {
875 return isPrecise_;
876 }
877
878 /* No Unsigned Wrap */
879 constexpr void setNUW(bool nuw) noexcept
880 {
881 isNUW_ = nuw;
882 }
883
884 constexpr bool isNUW() const noexcept
885 {
886 return isNUW_;
887 }
888
889 private:
890 Temp temp = Temp(0, s1);
891 PhysReg reg_;
892 union {
893 struct {
894 uint8_t isFixed_:1;
895 uint8_t hasHint_:1;
896 uint8_t isKill_:1;
897 uint8_t isPrecise_:1;
898 uint8_t isNUW_:1;
899 };
900 /* can't initialize bit-fields in c++11, so work around using a union */
901 uint8_t control_ = 0;
902 };
903 };
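
/* Illustrative: definitions are usually virtual and fixed by RA, but can be
 * pre-fixed or given a hint:
 *
 *   Definition def(program->allocateId(), s2);
 *   def.setHint(vcc);                 // RA prefers vcc but may pick another
 *   Definition fixed(PhysReg{0}, s1); // already fixed to s0
 */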
904
905 struct Block;
906
907 struct Instruction {
908 aco_opcode opcode;
909 Format format;
910 uint32_t pass_flags;
911
912 aco::span<Operand> operands;
913 aco::span<Definition> definitions;
914
915 constexpr bool isVALU() const noexcept
916 {
917 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
918 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
919 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
920 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
921 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
922 || format == Format::VOP3P;
923 }
924
925 constexpr bool isSALU() const noexcept
926 {
927 return format == Format::SOP1 ||
928 format == Format::SOP2 ||
929 format == Format::SOPC ||
930 format == Format::SOPK ||
931 format == Format::SOPP;
932 }
933
934 constexpr bool isVMEM() const noexcept
935 {
936 return format == Format::MTBUF ||
937 format == Format::MUBUF ||
938 format == Format::MIMG;
939 }
940
941 constexpr bool isDPP() const noexcept
942 {
943 return (uint16_t) format & (uint16_t) Format::DPP;
944 }
945
946 constexpr bool isVOP3() const noexcept
947 {
948 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
949 ((uint16_t) format & (uint16_t) Format::VOP3B);
950 }
951
952 constexpr bool isSDWA() const noexcept
953 {
954 return (uint16_t) format & (uint16_t) Format::SDWA;
955 }
956
957 constexpr bool isFlatOrGlobal() const noexcept
958 {
959 return format == Format::FLAT || format == Format::GLOBAL;
960 }
961
962 constexpr bool usesModifiers() const noexcept;
963
964 constexpr bool reads_exec() const noexcept
965 {
966 for (const Operand& op : operands) {
967 if (op.isFixed() && op.physReg() == exec)
968 return true;
969 }
970 return false;
971 }
972 };
973 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
974
975 struct SOPK_instruction : public Instruction {
976 uint16_t imm;
977 uint16_t padding;
978 };
979 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
980
981 struct SOPP_instruction : public Instruction {
982 uint32_t imm;
983 int block;
984 };
985 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
986
987 struct SOPC_instruction : public Instruction {
988 };
989 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
990
991 struct SOP1_instruction : public Instruction {
992 };
993 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
994
995 struct SOP2_instruction : public Instruction {
996 };
997 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
998
999 /**
1000 * Scalar Memory Format:
1001 * For s_(buffer_)load_dword*:
1002 * Operand(0): SBASE - SGPR-pair which provides base address
1003 * Operand(1): Offset - immediate (un)signed offset or SGPR
1004 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1005 * Operand(n-1): SOffset - SGPR offset (Vega only)
1006 *
1007 * Having no operands is also valid for instructions such as s_dcache_inv.
1008 *
1009 */
1010 struct SMEM_instruction : public Instruction {
1011 memory_sync_info sync;
1012 bool glc : 1; /* VI+: globally coherent */
1013 bool dlc : 1; /* NAVI: device level coherent */
1014 bool nv : 1; /* VEGA only: Non-volatile */
1015 bool disable_wqm : 1;
1016 bool prevent_overflow : 1; /* avoid overflow when combining additions */
1017 uint32_t padding: 3;
1018 };
1019 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1020
1021 struct VOP1_instruction : public Instruction {
1022 };
1023 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1024
1025 struct VOP2_instruction : public Instruction {
1026 };
1027 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1028
1029 struct VOPC_instruction : public Instruction {
1030 };
1031 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1032
1033 struct VOP3A_instruction : public Instruction {
1034 bool abs[3];
1035 bool neg[3];
1036 uint8_t opsel : 4;
1037 uint8_t omod : 2;
1038 bool clamp : 1;
1039 uint32_t padding : 9;
1040 };
1041 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1042
1043 struct VOP3P_instruction : public Instruction {
1044 bool neg_lo[3];
1045 bool neg_hi[3];
1046 uint8_t opsel_lo : 3;
1047 uint8_t opsel_hi : 3;
1048 bool clamp : 1;
1049 uint32_t padding : 9;
1050 };
1051 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1052
1053 /**
1054 * Data Parallel Primitives Format:
1055 * This format can be used for VOP1, VOP2 or VOPC instructions.
1056 * The swizzle applies to the src0 operand.
1057 *
1058 */
1059 struct DPP_instruction : public Instruction {
1060 bool abs[2];
1061 bool neg[2];
1062 uint16_t dpp_ctrl;
1063 uint8_t row_mask : 4;
1064 uint8_t bank_mask : 4;
1065 bool bound_ctrl : 1;
1066 uint32_t padding : 7;
1067 };
1068 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1069
1070 enum sdwa_sel : uint8_t {
1071 /* masks */
1072 sdwa_wordnum = 0x1,
1073 sdwa_bytenum = 0x3,
1074 sdwa_asuint = 0x7 | 0x10,
1075 sdwa_rasize = 0x3,
1076
1077 /* flags */
1078 sdwa_isword = 0x4,
1079 sdwa_sext = 0x8,
1080 sdwa_isra = 0x10,
1081
1082 /* specific values */
1083 sdwa_ubyte0 = 0,
1084 sdwa_ubyte1 = 1,
1085 sdwa_ubyte2 = 2,
1086 sdwa_ubyte3 = 3,
1087 sdwa_uword0 = sdwa_isword | 0,
1088 sdwa_uword1 = sdwa_isword | 1,
1089 sdwa_udword = 6,
1090
1091 sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
1092 sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
1093 sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
1094 sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
1095 sdwa_sword0 = sdwa_uword0 | sdwa_sext,
1096 sdwa_sword1 = sdwa_uword1 | sdwa_sext,
1097 sdwa_sdword = sdwa_udword | sdwa_sext,
1098
1099 /* register-allocated */
1100 sdwa_ubyte = 1 | sdwa_isra,
1101 sdwa_uword = 2 | sdwa_isra,
1102 sdwa_sbyte = sdwa_ubyte | sdwa_sext,
1103 sdwa_sword = sdwa_uword | sdwa_sext,
1104 };
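
/* Example (illustrative) of decoding a selection: sdwa_sword1 selects the
 * high word with sign extension:
 *
 *   sdwa_sword1 & sdwa_isword   // word-sized selection
 *   sdwa_sword1 & sdwa_wordnum  // == 1: second word (bytes 2-3)
 *   sdwa_sword1 & sdwa_sext     // sign-extended
 */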
1105
1106 /**
1107 * Sub-Dword Addressing Format:
1108 * This format can be used for VOP1, VOP2 or VOPC instructions.
1109 *
1110 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1111 * the definition doesn't have to be VCC on GFX9+.
1112 *
1113 */
1114 struct SDWA_instruction : public Instruction {
1115 /* these destination modifiers aren't available with VOPC except for
1116 * clamp on GFX8 */
1117 uint8_t sel[2];
1118 uint8_t dst_sel;
1119 bool neg[2];
1120 bool abs[2];
1121 bool dst_preserve : 1;
1122 bool clamp : 1;
1123 uint8_t omod : 2; /* GFX9+ */
1124 uint32_t padding : 4;
1125 };
1126 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1127
1128 struct Interp_instruction : public Instruction {
1129 uint8_t attribute;
1130 uint8_t component;
1131 uint16_t padding;
1132 };
1133 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1134
1135 /**
1136 * Local and Global Data Sharing instructions
1137 * Operand(0): ADDR - VGPR which supplies the address.
1138 * Operand(1): DATA0 - First data VGPR.
1139 * Operand(2): DATA1 - Second data VGPR.
1140 * Operand(n-1): M0 - LDS size.
1141 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1142 *
1143 */
1144 struct DS_instruction : public Instruction {
1145 memory_sync_info sync;
1146 bool gds;
1147 int16_t offset0;
1148 int8_t offset1;
1149 uint8_t padding;
1150 };
1151 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1152
1153 /**
1154 * Vector Memory Untyped-buffer Instructions
1155 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1156 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1157 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1158 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1159 *
1160 */
1161 struct MUBUF_instruction : public Instruction {
1162 memory_sync_info sync;
1163 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1164 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1165 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1166 bool glc : 1; /* globally coherent */
1167 bool dlc : 1; /* NAVI: device level coherent */
1168 bool slc : 1; /* system level coherent */
1169 bool tfe : 1; /* texture fail enable */
1170 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1171 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1172 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1173 bool swizzled : 1;
1174 uint32_t padding1 : 18;
1175 };
1176 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1177
1178 /**
1179 * Vector Memory Typed-buffer Instructions
1180 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1181 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1182 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1183 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1184 *
1185 */
1186 struct MTBUF_instruction : public Instruction {
1187 memory_sync_info sync;
1188 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1189 uint8_t nfmt : 3; /* Numeric format of data in memory */
1190 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1191 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1192 bool glc : 1; /* globally coherent */
1193 bool dlc : 1; /* NAVI: device level coherent */
1194 bool slc : 1; /* system level coherent */
1195 bool tfe : 1; /* texture fail enable */
1196 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1197 uint32_t padding : 10;
1198 uint16_t offset; /* Unsigned byte offset - 12 bit */
1199 };
1200 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1201
1202 /**
1203 * Vector Memory Image Instructions
1204 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1205 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1206 * or VDATA - Vector GPR for write data.
1207 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1208 * Definition(0): VDATA - Vector GPR for read result.
1209 *
1210 */
1211 struct MIMG_instruction : public Instruction {
1212 memory_sync_info sync;
1213 uint8_t dmask; /* Data VGPR enable mask */
1214 uint8_t dim : 3; /* NAVI: dimensionality */
1215 bool unrm : 1; /* Force address to be un-normalized */
1216 bool dlc : 1; /* NAVI: device level coherent */
1217 bool glc : 1; /* globally coherent */
1218 bool slc : 1; /* system level coherent */
1219 bool tfe : 1; /* texture fail enable */
1220 bool da : 1; /* declare an array */
1221 bool lwe : 1; /* LOD warning enable */
1222 bool r128 : 1; /* NAVI: Texture resource size */
1223 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1224 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1225 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1226 uint32_t padding : 18;
1227 };
1228 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1229
1230 /**
1231 * Flat/Scratch/Global Instructions
1232 * Operand(0): ADDR
1233 * Operand(1): SADDR
1234 * Operand(2) / Definition(0): DATA/VDST
1235 *
1236 */
1237 struct FLAT_instruction : public Instruction {
1238 memory_sync_info sync;
1239 bool slc : 1; /* system level coherent */
1240 bool glc : 1; /* globally coherent */
1241 bool dlc : 1; /* NAVI: device level coherent */
1242 bool lds : 1;
1243 bool nv : 1;
1244 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1245 uint32_t padding0 : 2;
1246 uint16_t offset; /* Vega/Navi only */
1247 uint16_t padding1;
1248 };
1249 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1250
1251 struct Export_instruction : public Instruction {
1252 uint8_t enabled_mask;
1253 uint8_t dest;
1254 bool compressed : 1;
1255 bool done : 1;
1256 bool valid_mask : 1;
1257 uint32_t padding : 13;
1258 };
1259 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1260
1261 struct Pseudo_instruction : public Instruction {
1262 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1263 bool tmp_in_scc;
1264 uint8_t padding;
1265 };
1266 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1267
1268 struct Pseudo_branch_instruction : public Instruction {
1269 /* target[0] is the block index of the branch target.
1270 * For conditional branches, target[1] contains the fall-through alternative.
1271 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1272 */
1273 uint32_t target[2];
1274 };
1275 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1276
1277 struct Pseudo_barrier_instruction : public Instruction {
1278 memory_sync_info sync;
1279 sync_scope exec_scope;
1280 };
1281 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1282
1283 enum ReduceOp : uint16_t {
1284 iadd8, iadd16, iadd32, iadd64,
1285 imul8, imul16, imul32, imul64,
1286 fadd16, fadd32, fadd64,
1287 fmul16, fmul32, fmul64,
1288 imin8, imin16, imin32, imin64,
1289 imax8, imax16, imax32, imax64,
1290 umin8, umin16, umin32, umin64,
1291 umax8, umax16, umax32, umax64,
1292 fmin16, fmin32, fmin64,
1293 fmax16, fmax32, fmax64,
1294 iand8, iand16, iand32, iand64,
1295 ior8, ior16, ior32, ior64,
1296 ixor8, ixor16, ixor32, ixor64,
1297 };
1298
1299 /**
1300 * Subgroup Reduction Instructions: every operand and definition other than
1301 * the data to be reduced and the result is inserted by setup_reduce_temp().
1302 * Operand(0): data to be reduced
1303 * Operand(1): reduce temporary
1304 * Operand(2): vector temporary
1305 * Definition(0): result
1306 * Definition(1): scalar temporary
1307 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1308 * Definition(3): scc clobber
1309 * Definition(4): vcc clobber
1310 *
1311 */
1312 struct Pseudo_reduction_instruction : public Instruction {
1313 ReduceOp reduce_op;
1314 uint16_t cluster_size; // must be 0 for scans
1315 };
1316 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1317
1318 struct instr_deleter_functor {
1319 void operator()(void* p) {
1320 free(p);
1321 }
1322 };
1323
1324 template<typename T>
1325 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1326
1327 template<typename T>
1328 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1329 {
1330 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1331 char *data = (char*) calloc(1, size);
1332 T* inst = (T*) data;
1333
1334 inst->opcode = opcode;
1335 inst->format = format;
1336
1337 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1338 inst->operands = aco::span<Operand>(operands_offset, num_operands);
1339 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1340 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1341
1342 return inst;
1343 }
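
/* Illustrative usage (a sketch): create an s_mov_b32 with one constant
 * operand and one SGPR definition; the aco_ptr then owns the allocation:
 *
 *   aco_ptr<SOP1_instruction> mov{create_instruction<SOP1_instruction>(
 *      aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
 *   mov->operands[0] = Operand((uint32_t)42);
 *   mov->definitions[0] = Definition(program->allocateId(), s1);
 */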
1344
1345 constexpr bool Instruction::usesModifiers() const noexcept
1346 {
1347 if (isDPP() || isSDWA())
1348 return true;
1349
1350 if (format == Format::VOP3P) {
1351 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1352 for (unsigned i = 0; i < operands.size(); i++) {
1353 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1354 return true;
1355 }
1356 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1357 } else if (isVOP3()) {
1358 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1359 for (unsigned i = 0; i < operands.size(); i++) {
1360 if (vop3->abs[i] || vop3->neg[i])
1361 return true;
1362 }
1363 return vop3->opsel || vop3->clamp || vop3->omod;
1364 }
1365 return false;
1366 }
1367
1368 constexpr bool is_phi(Instruction* instr)
1369 {
1370 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1371 }
1372
1373 static inline bool is_phi(aco_ptr<Instruction>& instr)
1374 {
1375 return is_phi(instr.get());
1376 }
1377
1378 memory_sync_info get_sync_info(const Instruction* instr);
1379
1380 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1381
1382 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1383 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1384 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1385 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1386
1387 enum block_kind {
1388 /* uniform indicates that leaving this block,
1389 * all active lanes stay active */
1390 block_kind_uniform = 1 << 0,
1391 block_kind_top_level = 1 << 1,
1392 block_kind_loop_preheader = 1 << 2,
1393 block_kind_loop_header = 1 << 3,
1394 block_kind_loop_exit = 1 << 4,
1395 block_kind_continue = 1 << 5,
1396 block_kind_break = 1 << 6,
1397 block_kind_continue_or_break = 1 << 7,
1398 block_kind_discard = 1 << 8,
1399 block_kind_branch = 1 << 9,
1400 block_kind_merge = 1 << 10,
1401 block_kind_invert = 1 << 11,
1402 block_kind_uses_discard_if = 1 << 12,
1403 block_kind_needs_lowering = 1 << 13,
1404 block_kind_uses_demote = 1 << 14,
1405 block_kind_export_end = 1 << 15,
1406 };
1407
1408
1409 struct RegisterDemand {
1410 constexpr RegisterDemand() = default;
1411 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1412 : vgpr{v}, sgpr{s} {}
1413 int16_t vgpr = 0;
1414 int16_t sgpr = 0;
1415
1416 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1417 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1418 }
1419
1420 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1421 return vgpr > other.vgpr || sgpr > other.sgpr;
1422 }
1423
1424 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1425 if (t.type() == RegType::sgpr)
1426 return RegisterDemand( vgpr, sgpr + t.size() );
1427 else
1428 return RegisterDemand( vgpr + t.size(), sgpr );
1429 }
1430
1431 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1432 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1433 }
1434
1435 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1436 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1437 }
1438
1439 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1440 vgpr += other.vgpr;
1441 sgpr += other.sgpr;
1442 return *this;
1443 }
1444
1445 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1446 vgpr -= other.vgpr;
1447 sgpr -= other.sgpr;
1448 return *this;
1449 }
1450
1451 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1452 if (t.type() == RegType::sgpr)
1453 sgpr += t.size();
1454 else
1455 vgpr += t.size();
1456 return *this;
1457 }
1458
1459 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1460 if (t.type() == RegType::sgpr)
1461 sgpr -= t.size();
1462 else
1463 vgpr -= t.size();
1464 return *this;
1465 }
1466
1467 constexpr void update(const RegisterDemand other) noexcept {
1468 vgpr = std::max(vgpr, other.vgpr);
1469 sgpr = std::max(sgpr, other.sgpr);
1470 }
1471
1472 };
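
/* Illustrative: demand is tracked per register file, e.g.
 *
 *   RegisterDemand d(8, 4);          // 8 VGPRs, 4 SGPRs
 *   d += Temp(0, v2);                // -> {vgpr: 10, sgpr: 4}
 *   d.exceeds(RegisterDemand(10, 4)) // false: neither limit is exceeded
 */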
1473
1474 /* CFG */
1475 struct Block {
1476 float_mode fp_mode;
1477 unsigned index;
1478 unsigned offset = 0;
1479 std::vector<aco_ptr<Instruction>> instructions;
1480 std::vector<unsigned> logical_preds;
1481 std::vector<unsigned> linear_preds;
1482 std::vector<unsigned> logical_succs;
1483 std::vector<unsigned> linear_succs;
1484 RegisterDemand register_demand = RegisterDemand();
1485 uint16_t loop_nest_depth = 0;
1486 uint16_t kind = 0;
1487 int logical_idom = -1;
1488 int linear_idom = -1;
1489 Temp live_out_exec = Temp();
1490
1491 /* this information is needed for predecessors to blocks with phis when
1492 * moving out of ssa */
1493 bool scc_live_out = false;
1494 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out is true */
1495
1496 Block(unsigned idx) : index(idx) {}
1497 Block() : index(0) {}
1498 };
1499
1500 using Stage = uint16_t;
1501
1502 /* software stages */
1503 static constexpr Stage sw_vs = 1 << 0;
1504 static constexpr Stage sw_gs = 1 << 1;
1505 static constexpr Stage sw_tcs = 1 << 2;
1506 static constexpr Stage sw_tes = 1 << 3;
1507 static constexpr Stage sw_fs = 1 << 4;
1508 static constexpr Stage sw_cs = 1 << 5;
1509 static constexpr Stage sw_gs_copy = 1 << 6;
1510 static constexpr Stage sw_mask = 0x7f;
1511
1512 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1513 static constexpr Stage hw_vs = 1 << 7;
1514 static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1515 static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
1516 static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
1517 static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1518 static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1519 static constexpr Stage hw_fs = 1 << 13;
1520 static constexpr Stage hw_cs = 1 << 14;
1521 static constexpr Stage hw_mask = 0xff << 7;
1522
1523 /* possible settings of Program::stage */
1524 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1525 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1526 static constexpr Stage compute_cs = sw_cs | hw_cs;
1527 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1528 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1529 /* GFX10/NGG */
1530 static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
1531 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
1532 static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
1533 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
1534 /* GFX9 (and GFX10 if NGG isn't used) */
1535 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1536 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1537 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1538 /* pre-GFX9 */
1539 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1540 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1541 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1542 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1543 static constexpr Stage geometry_gs = sw_gs | hw_gs;
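
/* Illustrative: Program::stage combines software stages with the hardware
 * stage they run on, so a merged VS+GS shader on GFX9 satisfies both
 *
 *   (program->stage & sw_gs) && (program->stage & hw_gs)
 *   // vertex_geometry_gs == sw_vs | sw_gs | hw_gs
 */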
1544
1545 enum statistic {
1546 statistic_hash,
1547 statistic_instructions,
1548 statistic_copies,
1549 statistic_branches,
1550 statistic_cycles,
1551 statistic_vmem_clauses,
1552 statistic_smem_clauses,
1553 statistic_vmem_score,
1554 statistic_smem_score,
1555 statistic_sgpr_presched,
1556 statistic_vgpr_presched,
1557 num_statistics
1558 };
1559
1560 class Program final {
1561 public:
1562 float_mode next_fp_mode;
1563 std::vector<Block> blocks;
1564 RegisterDemand max_reg_demand = RegisterDemand();
1565 uint16_t num_waves = 0;
1566 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1567 ac_shader_config* config;
1568 struct radv_shader_info *info;
1569 enum chip_class chip_class;
1570 enum radeon_family family;
1571 unsigned wave_size;
1572 RegClass lane_mask;
1573 Stage stage; /* Stage */
1574 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1575 bool needs_wqm = false; /* there exists a p_wqm instruction */
1576 bool wb_smem_l1_on_end = false;
1577
1578 std::vector<uint8_t> constant_data;
1579 Temp private_segment_buffer;
1580 Temp scratch_offset;
1581
1582 uint16_t min_waves = 0;
1583 uint16_t lds_alloc_granule;
1584 uint32_t lds_limit; /* in bytes */
1585 bool has_16bank_lds;
1586 uint16_t vgpr_limit;
1587 uint16_t sgpr_limit;
1588 uint16_t physical_sgprs;
1589 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1590 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1591 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1592
1593 bool xnack_enabled = false;
1594 bool sram_ecc_enabled = false;
1595 bool has_fast_fma32 = false;
1596
1597 bool needs_vcc = false;
1598 bool needs_flat_scr = false;
1599
1600 bool collect_statistics = false;
1601 uint32_t statistics[num_statistics];
1602
1603 struct {
1604 void (*func)(void *private_data,
1605 enum radv_compiler_debug_level level,
1606 const char *message);
1607 void *private_data;
1608 } debug;
1609
1610 uint32_t allocateId()
1611 {
1612 assert(allocationID <= 16777215);
1613 return allocationID++;
1614 }
1615
1616 uint32_t peekAllocationId()
1617 {
1618 return allocationID;
1619 }
1620
1621 void setAllocationId(uint32_t id)
1622 {
1623 allocationID = id;
1624 }
1625
1626 Block* create_and_insert_block() {
1627 blocks.emplace_back(blocks.size());
1628 blocks.back().fp_mode = next_fp_mode;
1629 return &blocks.back();
1630 }
1631
1632 Block* insert_block(Block&& block) {
1633 block.index = blocks.size();
1634 block.fp_mode = next_fp_mode;
1635 blocks.emplace_back(std::move(block));
1636 return &blocks.back();
1637 }
1638
1639 private:
1640 uint32_t allocationID = 1;
1641 };
1642
1643 struct TempHash {
1644 std::size_t operator()(Temp t) const {
1645 return t.id();
1646 }
1647 };
1648 using TempSet = std::unordered_set<Temp, TempHash>;
1649
1650 struct live {
1651 /* live temps out per block */
1652 std::vector<TempSet> live_out;
1653 /* register demand (sgpr/vgpr) per instruction per block */
1654 std::vector<std::vector<RegisterDemand>> register_demand;
1655 };
1656
1657 void init();
1658
1659 void init_program(Program *program, Stage stage, struct radv_shader_info *info,
1660 enum chip_class chip_class, enum radeon_family family,
1661 ac_shader_config *config);
1662
1663 void select_program(Program *program,
1664 unsigned shader_count,
1665 struct nir_shader *const *shaders,
1666 ac_shader_config* config,
1667 struct radv_shader_args *args);
1668 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1669 ac_shader_config* config,
1670 struct radv_shader_args *args);
1671 void select_trap_handler_shader(Program *program, struct nir_shader *shader,
1672 ac_shader_config* config,
1673 struct radv_shader_args *args);
1674
1675 void lower_wqm(Program* program, live& live_vars,
1676 const struct radv_nir_compiler_options *options);
1677 void lower_phis(Program* program);
1678 void calc_min_waves(Program* program);
1679 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1680 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1681 std::vector<uint16_t> dead_code_analysis(Program *program);
1682 void dominator_tree(Program* program);
1683 void insert_exec_mask(Program *program);
1684 void value_numbering(Program* program);
1685 void optimize(Program* program);
1686 void setup_reduce_temp(Program* program);
1687 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1688 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1689 void ssa_elimination(Program* program);
1690 void lower_to_hw_instr(Program* program);
1691 void schedule_program(Program* program, live& live_vars);
1692 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1693 void insert_wait_states(Program* program);
1694 void insert_NOPs(Program* program);
1695 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1696 void print_asm(Program *program, std::vector<uint32_t>& binary,
1697 unsigned exec_size, std::ostream& out);
1698 bool validate_ir(Program* program);
1699 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options);
1700 #ifndef NDEBUG
1701 void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
1702 #else
1703 #define perfwarn(program, cond, msg, ...) do {} while(0)
1704 #endif
1705
1706 void collect_presched_stats(Program *program);
1707 void collect_preasm_stats(Program *program);
1708 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1709
1710 void aco_print_instr(const Instruction *instr, FILE *output);
1711 void aco_print_program(const Program *program, FILE *output);
1712
1713 void _aco_perfwarn(Program *program, const char *file, unsigned line,
1714 const char *fmt, ...);
1715 void _aco_err(Program *program, const char *file, unsigned line,
1716 const char *fmt, ...);
1717
1718 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
1719 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
1720
1721 /* utilities for dealing with register demand */
1722 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1723 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1724 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1725
1726 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1727 uint16_t get_extra_sgprs(Program *program);
1728
1729 /* get the number of sgprs/vgprs that must be allocated to address the given number of sgprs/vgprs */
1730 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1731 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1732
1733 /* return number of addressable sgprs/vgprs for max_waves */
1734 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1735 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1736
1737 typedef struct {
1738 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1739 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1740 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1741 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1742 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1743 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1744 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1745 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1746 /* sizes used for input/output modifiers and constants */
1747 const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
1748 const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
1749 } Info;
1750
1751 extern const Info instr_info;
1752
1753 }
1754
1755 #endif /* ACO_IR_H */
1756