aco: don't move memory accesses to before control barriers
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
/* Global bitmask of the DEBUG_* flags below (set elsewhere, not visible here). */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,    /* enable IR validation */
   DEBUG_VALIDATE_RA = 0x2, /* enable register-assignment validation */
   DEBUG_PERFWARN = 0x4,    /* report performance warnings */
};
53
54 /**
55 * Representation of the instruction's microcode encoding format
56 * Note: Some Vector ALU Formats can be combined, such that:
57 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
58 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
59 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
60 *
61 * (*) The same is applicable for VOP1 and VOPC instructions.
62 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* the VALU formats below occupy single bits so they can be OR'd together,
    * as described in the comment above this enum */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11, /* alias of VOP3: the A/B variants share one encoding bit */
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kind(s) of memory an instruction accesses. */
enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8, /* NOTE(review): only 7 flag bits are defined above — confirm whether 8 reserves an extra slot */
};
117
/* Ordering/visibility semantics attached to memory instructions and barriers. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
145
/* Synchronization scopes, ordered from a single lane up to the whole device. */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
153
154 struct memory_sync_info {
155 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
156 memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
157 : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
158
159 storage_class storage:8;
160 memory_semantics semantics:8;
161 sync_scope scope:8;
162
163 bool operator == (const memory_sync_info& rhs) const {
164 return storage == rhs.storage &&
165 semantics == rhs.semantics &&
166 scope == rhs.scope;
167 }
168
169 bool can_reorder() const {
170 if (semantics & semantic_acqrel)
171 return false;
172 /* Also check storage so that zero-initialized memory_sync_info can be
173 * reordered. */
174 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
175 }
176 };
177 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
178
/* Float rounding modes (encoding matches the hardware MODE register fields,
 * see float_mode below). */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round toward +infinity */
   fp_round_ni = 2, /* round toward -infinity */
   fp_round_tz = 3, /* round toward zero */
};
185
/* Denormal handling (encoding matches the hardware MODE register fields). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0, /* flush denormals to zero */
   fp_denorm_keep = 0x3,  /* preserve denormals */
};
192
/* Mirror of the hardware MODE register fields that affect floating-point
 * behavior, plus flags restricting which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;     /* rounding mode for 32-bit FP ops */
         fp_round round16_64:2;  /* rounding mode for 16/64-bit FP ops */
         unsigned denorm32:2;    /* fp_denorm value for 32-bit FP ops */
         unsigned denorm16_64:2; /* fp_denorm value for 16/64-bit FP ops */
      };
      struct {
         uint8_t round:4;  /* both rounding fields as one nibble */
         uint8_t denorm:4; /* both denorm fields as one nibble */
      };
      uint8_t val = 0; /* the raw MODE byte */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead. */
   bool canReplace(float_mode other) const noexcept {
      /* requires identical MODE bits, and this mode must be at least as
       * restrictive as "other" in every optimization flag */
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
229
230 constexpr Format asVOP3(Format format) {
231 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
232 };
233
234 constexpr Format asSDWA(Format format) {
235 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
236 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
237 }
238
/* Which register file a register class lives in. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar GPR */
   vgpr,        /* vector GPR */
   linear_vgpr, /* VGPR with a linear live range (used for WWM and VGPR spills,
                 * see RegClass::v*_linear) */
};
245
/* Register class: size + register-file type, packed into one byte:
 * bits 0-4 = size (in dwords, or in bytes when bit 7 is set),
 * bit 5 = vgpr (otherwise sgpr), bit 6 = linear live range,
 * bit 7 = sub-dword (byte-sized) class. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   /* no boolean conversion */
   explicit operator bool() = delete;

   /* all sgpr classes sort below s16; anything above has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* low 5 bits hold the size; unit is bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up for sub-dword classes */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* sgprs are always linear; vgprs only if bit 6 is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   /* Smallest register class of the given type that holds "bytes" bytes. */
   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
306
/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
/* NOTE(review): RegClass::s6 has no shorthand here — confirm whether one is needed. */
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
328
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* ordering and equality compare the SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   /* packed into 4 bytes: 24-bit SSA id + 8-bit register class */
   uint32_t id_: 24;
   uint32_t reg_class : 8;
};
356
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition. Stored with byte granularity:
 * reg_b = register index * 4 + byte offset.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   /* dword-granular register index */
   constexpr unsigned reg() const { return reg_b / 4u; }
   /* byte offset (0-3) within the register */
   constexpr unsigned byte() const { return reg_b % 4u; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return !(*this == other); }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* Register location "bytes" bytes further on (may cross register boundaries). */
   constexpr PhysReg advance(int bytes) const {
      PhysReg moved = *this;
      moved.reg_b += bytes;
      return moved;
   }

   uint16_t reg_b = 0;
};
375
/* helper expressions for special registers (values are the hardware
 * scalar-source encodings) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126}; /* same encoding as exec */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
387
388 /**
389 * Operand Class
390 * Initially, each Operand refers to either
391 * a temporary virtual register
392 * or to a constant value
393 * Temporary registers get mapped to physical register during RA
394 * Constant values are inlined into the instruction sequence.
395 */
396 class Operand final
397 {
398 public:
399 constexpr Operand()
400 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
401 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
402 isLateKill_(false) {}
403
404 explicit Operand(Temp r) noexcept
405 {
406 data_.temp = r;
407 if (r.id()) {
408 isTemp_ = true;
409 } else {
410 isUndef_ = true;
411 setFixed(PhysReg{128});
412 }
413 };
414 explicit Operand(uint8_t v) noexcept
415 {
416 /* 8-bit constants are only used for copies and copies from any 8-bit
417 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
418 * to be inline constants. */
419 data_.i = v;
420 isConstant_ = true;
421 constSize = 0;
422 setFixed(PhysReg{0u});
423 };
424 explicit Operand(uint16_t v) noexcept
425 {
426 data_.i = v;
427 isConstant_ = true;
428 constSize = 1;
429 if (v <= 64)
430 setFixed(PhysReg{128u + v});
431 else if (v >= 0xFFF0) /* [-16 .. -1] */
432 setFixed(PhysReg{192u + (0xFFFF - v)});
433 else if (v == 0x3800) /* 0.5 */
434 setFixed(PhysReg{240});
435 else if (v == 0xB800) /* -0.5 */
436 setFixed(PhysReg{241});
437 else if (v == 0x3C00) /* 1.0 */
438 setFixed(PhysReg{242});
439 else if (v == 0xBC00) /* -1.0 */
440 setFixed(PhysReg{243});
441 else if (v == 0x4000) /* 2.0 */
442 setFixed(PhysReg{244});
443 else if (v == 0xC000) /* -2.0 */
444 setFixed(PhysReg{245});
445 else if (v == 0x4400) /* 4.0 */
446 setFixed(PhysReg{246});
447 else if (v == 0xC400) /* -4.0 */
448 setFixed(PhysReg{247});
449 else if (v == 0x3118) /* 1/2 PI */
450 setFixed(PhysReg{248});
451 else /* Literal Constant */
452 setFixed(PhysReg{255});
453 };
454 explicit Operand(uint32_t v, bool is64bit = false) noexcept
455 {
456 data_.i = v;
457 isConstant_ = true;
458 constSize = is64bit ? 3 : 2;
459 if (v <= 64)
460 setFixed(PhysReg{128 + v});
461 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
462 setFixed(PhysReg{192 - v});
463 else if (v == 0x3f000000) /* 0.5 */
464 setFixed(PhysReg{240});
465 else if (v == 0xbf000000) /* -0.5 */
466 setFixed(PhysReg{241});
467 else if (v == 0x3f800000) /* 1.0 */
468 setFixed(PhysReg{242});
469 else if (v == 0xbf800000) /* -1.0 */
470 setFixed(PhysReg{243});
471 else if (v == 0x40000000) /* 2.0 */
472 setFixed(PhysReg{244});
473 else if (v == 0xc0000000) /* -2.0 */
474 setFixed(PhysReg{245});
475 else if (v == 0x40800000) /* 4.0 */
476 setFixed(PhysReg{246});
477 else if (v == 0xc0800000) /* -4.0 */
478 setFixed(PhysReg{247});
479 else { /* Literal Constant */
480 assert(!is64bit && "attempt to create a 64-bit literal constant");
481 setFixed(PhysReg{255});
482 }
483 };
484 explicit Operand(uint64_t v) noexcept
485 {
486 isConstant_ = true;
487 constSize = 3;
488 if (v <= 64) {
489 data_.i = (uint32_t) v;
490 setFixed(PhysReg{128 + (uint32_t) v});
491 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
492 data_.i = (uint32_t) v;
493 setFixed(PhysReg{192 - (uint32_t) v});
494 } else if (v == 0x3FE0000000000000) { /* 0.5 */
495 data_.i = 0x3f000000;
496 setFixed(PhysReg{240});
497 } else if (v == 0xBFE0000000000000) { /* -0.5 */
498 data_.i = 0xbf000000;
499 setFixed(PhysReg{241});
500 } else if (v == 0x3FF0000000000000) { /* 1.0 */
501 data_.i = 0x3f800000;
502 setFixed(PhysReg{242});
503 } else if (v == 0xBFF0000000000000) { /* -1.0 */
504 data_.i = 0xbf800000;
505 setFixed(PhysReg{243});
506 } else if (v == 0x4000000000000000) { /* 2.0 */
507 data_.i = 0x40000000;
508 setFixed(PhysReg{244});
509 } else if (v == 0xC000000000000000) { /* -2.0 */
510 data_.i = 0xc0000000;
511 setFixed(PhysReg{245});
512 } else if (v == 0x4010000000000000) { /* 4.0 */
513 data_.i = 0x40800000;
514 setFixed(PhysReg{246});
515 } else if (v == 0xC010000000000000) { /* -4.0 */
516 data_.i = 0xc0800000;
517 setFixed(PhysReg{247});
518 } else { /* Literal Constant: we don't know if it is a long or double.*/
519 isConstant_ = 0;
520 assert(false && "attempt to create a 64-bit literal constant");
521 }
522 };
523 explicit Operand(RegClass type) noexcept
524 {
525 isUndef_ = true;
526 data_.temp = Temp(0, type);
527 setFixed(PhysReg{128});
528 };
529 explicit Operand(PhysReg reg, RegClass type) noexcept
530 {
531 data_.temp = Temp(0, type);
532 setFixed(reg);
533 }
534
535 constexpr bool isTemp() const noexcept
536 {
537 return isTemp_;
538 }
539
540 constexpr void setTemp(Temp t) noexcept {
541 assert(!isConstant_);
542 isTemp_ = true;
543 data_.temp = t;
544 }
545
546 constexpr Temp getTemp() const noexcept
547 {
548 return data_.temp;
549 }
550
551 constexpr uint32_t tempId() const noexcept
552 {
553 return data_.temp.id();
554 }
555
556 constexpr bool hasRegClass() const noexcept
557 {
558 return isTemp() || isUndefined();
559 }
560
561 constexpr RegClass regClass() const noexcept
562 {
563 return data_.temp.regClass();
564 }
565
566 constexpr unsigned bytes() const noexcept
567 {
568 if (isConstant())
569 return 1 << constSize;
570 else
571 return data_.temp.bytes();
572 }
573
574 constexpr unsigned size() const noexcept
575 {
576 if (isConstant())
577 return constSize > 2 ? 2 : 1;
578 else
579 return data_.temp.size();
580 }
581
582 constexpr bool isFixed() const noexcept
583 {
584 return isFixed_;
585 }
586
587 constexpr PhysReg physReg() const noexcept
588 {
589 return reg_;
590 }
591
592 constexpr void setFixed(PhysReg reg) noexcept
593 {
594 isFixed_ = reg != unsigned(-1);
595 reg_ = reg;
596 }
597
598 constexpr bool isConstant() const noexcept
599 {
600 return isConstant_;
601 }
602
603 constexpr bool isLiteral() const noexcept
604 {
605 return isConstant() && reg_ == 255;
606 }
607
608 constexpr bool isUndefined() const noexcept
609 {
610 return isUndef_;
611 }
612
613 constexpr uint32_t constantValue() const noexcept
614 {
615 return data_.i;
616 }
617
618 constexpr bool constantEquals(uint32_t cmp) const noexcept
619 {
620 return isConstant() && constantValue() == cmp;
621 }
622
623 constexpr uint64_t constantValue64(bool signext=false) const noexcept
624 {
625 if (constSize == 3) {
626 if (reg_ <= 192)
627 return reg_ - 128;
628 else if (reg_ <= 208)
629 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
630
631 switch (reg_) {
632 case 240:
633 return 0x3FE0000000000000;
634 case 241:
635 return 0xBFE0000000000000;
636 case 242:
637 return 0x3FF0000000000000;
638 case 243:
639 return 0xBFF0000000000000;
640 case 244:
641 return 0x4000000000000000;
642 case 245:
643 return 0xC000000000000000;
644 case 246:
645 return 0x4010000000000000;
646 case 247:
647 return 0xC010000000000000;
648 }
649 } else if (constSize == 1) {
650 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
651 } else if (constSize == 0) {
652 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
653 }
654 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
655 }
656
657 constexpr bool isOfType(RegType type) const noexcept
658 {
659 return hasRegClass() && regClass().type() == type;
660 }
661
662 /* Indicates that the killed operand's live range intersects with the
663 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
664 * not set by liveness analysis. */
665 constexpr void setLateKill(bool flag) noexcept
666 {
667 isLateKill_ = flag;
668 }
669
670 constexpr bool isLateKill() const noexcept
671 {
672 return isLateKill_;
673 }
674
675 constexpr void setKill(bool flag) noexcept
676 {
677 isKill_ = flag;
678 if (!flag)
679 setFirstKill(false);
680 }
681
682 constexpr bool isKill() const noexcept
683 {
684 return isKill_ || isFirstKill();
685 }
686
687 constexpr void setFirstKill(bool flag) noexcept
688 {
689 isFirstKill_ = flag;
690 if (flag)
691 setKill(flag);
692 }
693
694 /* When there are multiple operands killing the same temporary,
695 * isFirstKill() is only returns true for the first one. */
696 constexpr bool isFirstKill() const noexcept
697 {
698 return isFirstKill_;
699 }
700
701 constexpr bool isKillBeforeDef() const noexcept
702 {
703 return isKill() && !isLateKill();
704 }
705
706 constexpr bool isFirstKillBeforeDef() const noexcept
707 {
708 return isFirstKill() && !isLateKill();
709 }
710
711 constexpr bool operator == (Operand other) const noexcept
712 {
713 if (other.size() != size())
714 return false;
715 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
716 return false;
717 if (isFixed() && other.isFixed() && physReg() != other.physReg())
718 return false;
719 if (isLiteral())
720 return other.isLiteral() && other.constantValue() == constantValue();
721 else if (isConstant())
722 return other.isConstant() && other.physReg() == physReg();
723 else if (isUndefined())
724 return other.isUndefined() && other.regClass() == regClass();
725 else
726 return other.isTemp() && other.getTemp() == getTemp();
727 }
728 private:
729 union {
730 uint32_t i;
731 float f;
732 Temp temp = Temp(0, s1);
733 } data_;
734 PhysReg reg_;
735 union {
736 struct {
737 uint8_t isTemp_:1;
738 uint8_t isFixed_:1;
739 uint8_t isConstant_:1;
740 uint8_t isKill_:1;
741 uint8_t isUndef_:1;
742 uint8_t isFirstKill_:1;
743 uint8_t constSize:2;
744 uint8_t isLateKill_:1;
745 };
746 /* can't initialize bit-fields in c++11, so work around using a union */
747 uint16_t control_ = 0;
748 };
749 };
750
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
                            isKill_(0), isPrecise_(0), isNUW_(0) {}
   /* Definition of a new temporary with the given SSA id and register class. */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* Definition fixed to a physical register (temporary id 0). */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   /* Definition of temporary "tmpId" already assigned to a physical register. */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* True unless this is a placeholder definition (temporary id 0). */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Suggest (but don't require) a physical register for RA.
    * Note: shares reg_ with setFixed(); only the flags distinguish them. */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   /* presumably set by liveness analysis, mirroring Operand kill flags — confirm */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

   /* "precise": value-changing FP optimizations must not be applied
    * (NOTE(review): inferred from the name — confirm against users) */
   constexpr void setPrecise(bool precise) noexcept
   {
      isPrecise_ = precise;
   }

   constexpr bool isPrecise() const noexcept
   {
      return isPrecise_;
   }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept
   {
      isNUW_ = nuw;
   }

   constexpr bool isNUW() const noexcept
   {
      return isNUW_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
         uint8_t isPrecise_:1;
         uint8_t isNUW_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
884
885 struct Block;
886
/* Base of all IR instructions: opcode, encoding format and the spans of
 * operands/definitions (storage lives outside this struct). */
struct Instruction {
   aco_opcode opcode;
   Format format;       /* encoding format; VALU format bits are combinable */
   uint32_t pass_flags; /* presumably per-pass scratch bits — confirm with users */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Any vector-ALU encoding: VOP1/VOP2/VOPC (including their VOP3/DPP/SDWA
    * promoted forms, since those keep the base format bit) or VOP3P. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   /* Any scalar-ALU encoding. */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* Buffer/image vector memory; does NOT include FLAT/GLOBAL/SCRATCH. */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   /* defined elsewhere in the file */
   constexpr bool usesModifiers() const noexcept;

   /* True if any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
954
/* Scalar ALU with 16-bit immediate (SIMM16 field). */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
960
/* Scalar program-control instruction (branches, waitcnt, sendmsg, ...). */
struct SOPP_instruction : public Instruction {
   uint32_t imm; /* instruction immediate (meaning depends on the opcode) */
   int block;    /* branch target block index; presumably unused/-1 for
                  * non-branch opcodes — confirm with users */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
966
/* SOPC/SOP1/SOP2 carry no fields beyond the base Instruction. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
978
979 /**
980 * Scalar Memory Format:
981 * For s_(buffer_)load_dword*:
982 * Operand(0): SBASE - SGPR-pair which provides base address
983 * Operand(1): Offset - immediate (un)signed offset or SGPR
984 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
985 * Operand(n-1): SOffset - SGPR offset (Vega only)
986 *
987 * Having no operands is also valid for instructions such as s_dcache_inv.
988 *
989 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync; /* reordering/synchronization info for this access */
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint32_t padding: 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1000
/* Plain VOP1/VOP2/VOPC encodings carry no extra fields; modifiers live in
 * the VOP3A/DPP/SDWA variants. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1012
/* VALU instruction in the VOP3 encoding, with per-source input modifiers
 * and output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];       /* take absolute value of source 0..2 */
   bool neg[3];       /* negate source 0..2 */
   uint8_t opsel : 4; /* operand-select bits (see ISA docs) */
   uint8_t omod : 2;  /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1022
/* Packed-math VALU instruction with separate modifiers for the low and
 * high 16-bit halves of each source. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];       /* negate the low half of source 0..2 */
   bool neg_hi[3];       /* negate the high half of source 0..2 */
   uint8_t opsel_lo : 3; /* half-selection bits for the low result */
   uint8_t opsel_hi : 3; /* half-selection bits for the high result */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1032
1033 /**
1034 * Data Parallel Primitives Format:
1035 * This format can be used for VOP1, VOP2 or VOPC instructions.
1036 * The swizzle applies to the src0 operand.
1037 *
1038 */
struct DPP_instruction : public Instruction {
   bool abs[2];          /* take absolute value of source 0/1 */
   bool neg[2];          /* negate source 0/1 */
   uint16_t dpp_ctrl;    /* DPP swizzle control pattern (see ISA docs) */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1049
/* Selects which bytes/words of a dword an SDWA instruction reads/writes.
 * Low bits index the byte/word; flag bits encode size, sign extension and
 * whether the selection is decided by register allocation. */
enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
1085
1086 /**
1087 * Sub-Dword Addressing Format:
1088 * This format can be used for VOP1, VOP2 or VOPC instructions.
1089 *
1090 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1091 * the definition doesn't have to be VCC on GFX9+.
1092 *
1093 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];  /* source byte/word selection (sdwa_sel values) */
   uint8_t dst_sel; /* destination byte/word selection (sdwa_sel value) */
   bool neg[2];     /* negate source 0/1 */
   bool abs[2];     /* take absolute value of source 0/1 */
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1107
/* VINTRP: fragment-shader input interpolation instructions */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* which input attribute to interpolate */
   uint8_t component; /* which component of the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1114
1115 /**
1116 * Local and Global Data Sharing instructions
1117 * Operand(0): ADDR - VGPR which supplies the address.
1118 * Operand(1): DATA0 - First data VGPR.
1119 * Operand(2): DATA1 - Second data VGPR.
1120 * Operand(n-1): M0 - LDS size.
1121 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1122 *
1123 */
struct DS_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   bool gds; /* operate on GDS instead of LDS */
   int16_t offset0; /* first immediate byte offset */
   int8_t offset1; /* second immediate byte offset (two-address forms) */
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1132
1133 /**
1134 * Vector Memory Untyped-buffer Instructions
1135 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1136 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1137 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1138 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1139 *
1140 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool swizzled : 1; /* NOTE(review): presumably whether the access uses the resource's swizzled addressing — confirm */
   uint32_t padding1 : 18;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1157
1158 /**
1159 * Vector Memory Typed-buffer Instructions
1160 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1161 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1162 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1163 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1164 *
1165 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 10;
   uint16_t offset; /* Unsigned byte offset - 12 bit (stored in 16 bits; only 12 are encodable) */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1181
1182 /**
1183 * Vector Memory Image Instructions
1184 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1185 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1186 * or VDATA - Vector GPR for write data.
1187 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1188 * Definition(0): VDATA - Vector GPR for read result.
1189 *
1190 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable (previous comment duplicated unrm's; see the ISA LWE bit) */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 18;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1209
1210 /**
1211 * Flat/Scratch/Global Instructions
1212 * Operand(0): ADDR
1213 * Operand(1): SADDR
1214 * Operand(2) / Definition(0): DATA/VDST
1215 *
1216 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* return read-data to LDS instead of VGPRs */
   bool nv : 1; /* the encoding's NV bit — TODO: document semantics */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1230
/* Export instructions (position, parameter, color/MRT exports) */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which components of the operands are written (EN) */
   uint8_t dest; /* export target (TGT) */
   bool compressed : 1; /* data is 16-bit compressed (two components per operand) */
   bool done : 1; /* this is the shader's final export of this kind (DONE) */
   bool valid_mask : 1;
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1240
/* Generic pseudo instructions (lowered before assembly by lower_to_hw_instr) */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* NOTE(review): presumably whether lowering needs a temporary while scc holds a live value — confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1247
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1256
/* Barrier pseudo instruction: combines memory ordering with an optional
 * execution barrier */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync; /* which memory accesses are ordered, and their scope */
   sync_scope exec_scope; /* scope of execution synchronization — TODO confirm exact semantics */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1262
/* Operation performed by a Pseudo_reduction_instruction.
 * The numeric suffix is the bit width of the participating values. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1278
1279 /**
1280 * Subgroup Reduction Instructions, everything except for the data to be
1281 * reduced and the result as inserted by setup_reduce_temp().
1282 * Operand(0): data to be reduced
1283 * Operand(1): reduce temporary
1284 * Operand(2): vector temporary
1285 * Definition(0): result
1286 * Definition(1): scalar temporary
1287 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1288 * Definition(3): scc clobber
1289 * Definition(4): vcc clobber
1290 *
1291 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op; /* which arithmetic/logical operation to reduce with */
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1297
/* Deleter used by aco_ptr: instructions are allocated with calloc (see
 * create_instruction), so their storage must be released with free().
 * Note that no destructor is invoked. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};
1303
/* Owning pointer for instructions. The custom deleter calls free() because
 * instructions come from calloc (create_instruction), not new. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1306
/* Allocates an instruction of format-struct T together with its operand and
 * definition arrays in a single contiguous, zero-initialized chunk:
 * [ T | Operand * num_operands | Definition * num_definitions ].
 * The caller owns the result (usually wrapped in an aco_ptr). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* NOTE(review): the calloc result is not NULL-checked; allocation failure
    * would crash on the stores below */
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* the spans store small offsets relative to their own address instead of
    * absolute pointers: operands start right after T, definitions right
    * after the operands */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1324
/* Returns true if the instruction uses any input/output modifiers
 * (DPP/SDWA encodings always count; VOP3P/VOP3 only if a modifier field is
 * actually set). Used to tell when an instruction can't be treated as the
 * plain opcode semantics alone. */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (format == Format::VOP3P) {
      const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
            return true;
      }
      return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
   } else if (isVOP3()) {
      const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3->abs[i] || vop3->neg[i])
            return true;
      }
      return vop3->opsel || vop3->clamp || vop3->omod;
   }
   return false;
}
1347
1348 constexpr bool is_phi(Instruction* instr)
1349 {
1350 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1351 }
1352
1353 static inline bool is_phi(aco_ptr<Instruction>& instr)
1354 {
1355 return is_phi(instr.get());
1356 }
1357
/* returns the memory synchronization semantics attached to the instruction
 * (the sync field of memory-access formats; a default for others) */
memory_sync_info get_sync_info(const Instruction* instr);

/* whether the instruction can be removed given the use counts in "uses" */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);

/* queries/conversions for sub-dword operand encodings */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1366
/* bit flags describing a basic block; stored OR'd together in Block::kind */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1387
1388
1389 struct RegisterDemand {
1390 constexpr RegisterDemand() = default;
1391 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1392 : vgpr{v}, sgpr{s} {}
1393 int16_t vgpr = 0;
1394 int16_t sgpr = 0;
1395
1396 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1397 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1398 }
1399
1400 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1401 return vgpr > other.vgpr || sgpr > other.sgpr;
1402 }
1403
1404 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1405 if (t.type() == RegType::sgpr)
1406 return RegisterDemand( vgpr, sgpr + t.size() );
1407 else
1408 return RegisterDemand( vgpr + t.size(), sgpr );
1409 }
1410
1411 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1412 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1413 }
1414
1415 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1416 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1417 }
1418
1419 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1420 vgpr += other.vgpr;
1421 sgpr += other.sgpr;
1422 return *this;
1423 }
1424
1425 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1426 vgpr -= other.vgpr;
1427 sgpr -= other.sgpr;
1428 return *this;
1429 }
1430
1431 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1432 if (t.type() == RegType::sgpr)
1433 sgpr += t.size();
1434 else
1435 vgpr += t.size();
1436 return *this;
1437 }
1438
1439 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1440 if (t.type() == RegType::sgpr)
1441 sgpr -= t.size();
1442 else
1443 vgpr -= t.size();
1444 return *this;
1445 }
1446
1447 constexpr void update(const RegisterDemand other) noexcept {
1448 vgpr = std::max(vgpr, other.vgpr);
1449 sgpr = std::max(sgpr, other.sgpr);
1450 }
1451
1452 };
1453
1454 /* CFG */
1455 struct Block {
1456 float_mode fp_mode;
1457 unsigned index;
1458 unsigned offset = 0;
1459 std::vector<aco_ptr<Instruction>> instructions;
1460 std::vector<unsigned> logical_preds;
1461 std::vector<unsigned> linear_preds;
1462 std::vector<unsigned> logical_succs;
1463 std::vector<unsigned> linear_succs;
1464 RegisterDemand register_demand = RegisterDemand();
1465 uint16_t loop_nest_depth = 0;
1466 uint16_t kind = 0;
1467 int logical_idom = -1;
1468 int linear_idom = -1;
1469 Temp live_out_exec = Temp();
1470
1471 /* this information is needed for predecessors to blocks with phis when
1472 * moving out of ssa */
1473 bool scc_live_out = false;
1474 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1475
1476 Block(unsigned idx) : index(idx) {}
1477 Block() : index(0) {}
1478 };
1479
/* Bitmask describing which API (software) shader stages a program contains
 * and which hardware stage it runs as. Merged shaders OR several sw bits
 * together (e.g. vertex_geometry_gs below). */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1524
/* indices into Program::statistics */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1539
/* A complete shader program: the CFG plus all per-program compilation state. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to newly inserted blocks */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* register class of an exec-mask-sized value (depends on wave_size) */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics]; /* indexed by enum statistic */

   /* hands out a fresh temporary id; ids must fit in 24 bits */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* the id the next allocateId() call would return */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* appends a new empty block; it inherits next_fp_mode */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* appends an existing block, re-indexing it; it inherits next_fp_mode */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* ids start at 1 */
};
1615
1616 struct TempHash {
1617 std::size_t operator()(Temp t) const {
1618 return t.id();
1619 }
1620 };
1621 using TempSet = std::unordered_set<Temp, TempHash>;
1622
/* result of live_var_analysis() */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1629
void init();

void init_program(Program *program, Stage stage, struct radv_shader_info *info,
                  enum chip_class chip_class, enum radeon_family family,
                  ac_shader_config *config);

/* instruction selection: NIR -> ACO IR */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* compiler pass entry points; each is implemented in the correspondingly
 * named source file */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
bool validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
/* NOTE(review): the release-build macro below takes a leading "program"
 * argument that this declaration does not; call sites can satisfy both only
 * because the macro is variadic — confirm the intended signature */
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

/* shader statistics collection (see enum statistic) */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1698
/* Static per-opcode information tables, each indexed by aco_opcode. */
typedef struct {
   /* hardware encodings of each opcode per generation */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* the single global instance, generated from the opcode tables */
extern const Info instr_info;
1714
1715 }
1716
1717 #endif /* ACO_IR_H */
1718