2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 #include <unordered_set>
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
40 #include "vulkan/radv_shader.h"
42 struct radv_nir_compiler_options
;
43 struct radv_shader_args
;
44 struct radv_shader_info
;
48 extern uint64_t debug_flags
;
51 DEBUG_VALIDATE_IR
= 0x1,
52 DEBUG_VALIDATE_RA
= 0x2,
54 DEBUG_FORCE_WAITCNT
= 0x8,
57 DEBUG_NO_SCHED
= 0x40,
61 * Representation of the instruction's microcode encoding format
62 * Note: Some Vector ALU Formats can be combined, such that:
63 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
64 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
65 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
67 * (*) The same is applicable for VOP1 and VOPC instructions.
69 enum class Format
: std::uint16_t {
70 /* Pseudo Instruction Format */
72 /* Scalar ALU & Control Formats */
78 /* Scalar Memory Format */
82 /* Vector Memory Buffer Formats */
85 /* Vector Memory Image Format */
96 PSEUDO_REDUCTION
= 18,
98 /* Vector ALU Formats */
106 /* Vector Parameter Interpolation Format */
112 enum storage_class
: uint8_t {
113 storage_none
= 0x0, /* no synchronization and can be reordered around aliasing stores */
114 storage_buffer
= 0x1, /* SSBOs and global memory */
115 storage_atomic_counter
= 0x2, /* not used for Vulkan */
117 storage_shared
= 0x8, /* or TCS output */
118 storage_vmem_output
= 0x10, /* GS or TCS output stores using VMEM */
119 storage_scratch
= 0x20,
120 storage_vgpr_spill
= 0x40,
124 enum memory_semantics
: uint8_t {
126 /* for loads: don't move any access after this load to before this load (even other loads)
127 * for barriers: don't move any access after the barrier to before any
128 * atomics/control_barriers/sendmsg_gs_done before the barrier */
129 semantic_acquire
= 0x1,
130 /* for stores: don't move any access before this store to after this store
131 * for barriers: don't move any access before the barrier to after any
132 * atomics/control_barriers/sendmsg_gs_done after the barrier */
133 semantic_release
= 0x2,
135 /* the rest are for load/stores/atomics only */
136 /* cannot be DCE'd or CSE'd */
137 semantic_volatile
= 0x4,
138 /* does not interact with barriers and assumes this lane is the only lane
139 * accessing this memory */
140 semantic_private
= 0x8,
141 /* this operation can be reordered around operations of the same storage. says nothing about barriers */
142 semantic_can_reorder
= 0x10,
143 /* this is a atomic instruction (may only read or write memory) */
144 semantic_atomic
= 0x20,
145 /* this is instruction both reads and writes memory */
148 semantic_acqrel
= semantic_acquire
| semantic_release
,
149 semantic_atomicrmw
= semantic_volatile
| semantic_atomic
| semantic_rmw
,
152 enum sync_scope
: uint8_t {
153 scope_invocation
= 0,
156 scope_queuefamily
= 3,
160 struct memory_sync_info
{
161 memory_sync_info() : storage(storage_none
), semantics(semantic_none
), scope(scope_invocation
) {}
162 memory_sync_info(int storage
, int semantics
=0, sync_scope scope
=scope_invocation
)
163 : storage((storage_class
)storage
), semantics((memory_semantics
)semantics
), scope(scope
) {}
165 storage_class storage
:8;
166 memory_semantics semantics
:8;
169 bool operator == (const memory_sync_info
& rhs
) const {
170 return storage
== rhs
.storage
&&
171 semantics
== rhs
.semantics
&&
175 bool can_reorder() const {
176 if (semantics
& semantic_acqrel
)
178 /* Also check storage so that zero-initialized memory_sync_info can be
180 return (!storage
|| (semantics
& semantic_can_reorder
)) && !(semantics
& semantic_volatile
);
183 static_assert(sizeof(memory_sync_info
) == 3, "Unexpected padding");
193 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
194 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
195 fp_denorm_flush
= 0x0,
196 fp_denorm_keep
= 0x3,
200 /* matches encoding of the MODE register */
204 fp_round round16_64
:2;
206 unsigned denorm16_64
:2;
214 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
215 bool preserve_signed_zero_inf_nan32
:1;
216 bool preserve_signed_zero_inf_nan16_64
:1;
217 /* if false, optimizations which may remove denormal flushing can be done */
218 bool must_flush_denorms32
:1;
219 bool must_flush_denorms16_64
:1;
220 bool care_about_round32
:1;
221 bool care_about_round16_64
:1;
223 /* Returns true if instructions using the mode "other" can safely use the
224 * current one instead. */
225 bool canReplace(float_mode other
) const noexcept
{
226 return val
== other
.val
&&
227 (preserve_signed_zero_inf_nan32
|| !other
.preserve_signed_zero_inf_nan32
) &&
228 (preserve_signed_zero_inf_nan16_64
|| !other
.preserve_signed_zero_inf_nan16_64
) &&
229 (must_flush_denorms32
|| !other
.must_flush_denorms32
) &&
230 (must_flush_denorms16_64
|| !other
.must_flush_denorms16_64
) &&
231 (care_about_round32
|| !other
.care_about_round32
) &&
232 (care_about_round16_64
|| !other
.care_about_round16_64
);
236 constexpr Format
asVOP3(Format format
) {
237 return (Format
) ((uint32_t) Format::VOP3
| (uint32_t) format
);
240 constexpr Format
asSDWA(Format format
) {
241 assert(format
== Format::VOP1
|| format
== Format::VOP2
|| format
== Format::VOPC
);
242 return (Format
) ((uint32_t) Format::SDWA
| (uint32_t) format
);
270 /* byte-sized register class */
277 /* these are used for WWM and spills to vgpr */
278 v1_linear
= v1
| (1 << 6),
279 v2_linear
= v2
| (1 << 6),
282 RegClass() = default;
283 constexpr RegClass(RC rc
)
285 constexpr RegClass(RegType type
, unsigned size
)
286 : rc((RC
) ((type
== RegType::vgpr
? 1 << 5 : 0) | size
)) {}
288 constexpr operator RC() const { return rc
; }
289 explicit operator bool() = delete;
291 constexpr RegType
type() const { return rc
<= RC::s16
? RegType::sgpr
: RegType::vgpr
; }
292 constexpr bool is_subdword() const { return rc
& (1 << 7); }
293 constexpr unsigned bytes() const { return ((unsigned) rc
& 0x1F) * (is_subdword() ? 1 : 4); }
294 //TODO: use size() less in favor of bytes()
295 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
296 constexpr bool is_linear() const { return rc
<= RC::s16
|| rc
& (1 << 6); }
297 constexpr RegClass
as_linear() const { return RegClass((RC
) (rc
| (1 << 6))); }
298 constexpr RegClass
as_subdword() const { return RegClass((RC
) (rc
| 1 << 7)); }
300 static constexpr RegClass
get(RegType type
, unsigned bytes
) {
301 if (type
== RegType::sgpr
) {
302 return RegClass(type
, DIV_ROUND_UP(bytes
, 4u));
304 return bytes
% 4u ? RegClass(type
, bytes
).as_subdword() :
305 RegClass(type
, bytes
/ 4u);
313 /* transitional helper expressions */
314 static constexpr RegClass s1
{RegClass::s1
};
315 static constexpr RegClass s2
{RegClass::s2
};
316 static constexpr RegClass s3
{RegClass::s3
};
317 static constexpr RegClass s4
{RegClass::s4
};
318 static constexpr RegClass s8
{RegClass::s8
};
319 static constexpr RegClass s16
{RegClass::s16
};
320 static constexpr RegClass v1
{RegClass::v1
};
321 static constexpr RegClass v2
{RegClass::v2
};
322 static constexpr RegClass v3
{RegClass::v3
};
323 static constexpr RegClass v4
{RegClass::v4
};
324 static constexpr RegClass v5
{RegClass::v5
};
325 static constexpr RegClass v6
{RegClass::v6
};
326 static constexpr RegClass v7
{RegClass::v7
};
327 static constexpr RegClass v8
{RegClass::v8
};
328 static constexpr RegClass v1b
{RegClass::v1b
};
329 static constexpr RegClass v2b
{RegClass::v2b
};
330 static constexpr RegClass v3b
{RegClass::v3b
};
331 static constexpr RegClass v4b
{RegClass::v4b
};
332 static constexpr RegClass v6b
{RegClass::v6b
};
333 static constexpr RegClass v8b
{RegClass::v8b
};
337 * Each temporary virtual register has a
338 * register class (i.e. size and type)
342 Temp() noexcept
: id_(0), reg_class(0) {}
343 constexpr Temp(uint32_t id
, RegClass cls
) noexcept
344 : id_(id
), reg_class(uint8_t(cls
)) {}
346 constexpr uint32_t id() const noexcept
{ return id_
; }
347 constexpr RegClass
regClass() const noexcept
{ return (RegClass::RC
)reg_class
; }
349 constexpr unsigned bytes() const noexcept
{ return regClass().bytes(); }
350 constexpr unsigned size() const noexcept
{ return regClass().size(); }
351 constexpr RegType
type() const noexcept
{ return regClass().type(); }
352 constexpr bool is_linear() const noexcept
{ return regClass().is_linear(); }
354 constexpr bool operator <(Temp other
) const noexcept
{ return id() < other
.id(); }
355 constexpr bool operator==(Temp other
) const noexcept
{ return id() == other
.id(); }
356 constexpr bool operator!=(Temp other
) const noexcept
{ return id() != other
.id(); }
360 uint32_t reg_class
: 8;
365 * Represents the physical register for each
366 * Operand and Definition.
369 constexpr PhysReg() = default;
370 explicit constexpr PhysReg(unsigned r
) : reg_b(r
<< 2) {}
371 constexpr unsigned reg() const { return reg_b
>> 2; }
372 constexpr unsigned byte() const { return reg_b
& 0x3; }
373 constexpr operator unsigned() const { return reg(); }
374 constexpr bool operator==(PhysReg other
) const { return reg_b
== other
.reg_b
; }
375 constexpr bool operator!=(PhysReg other
) const { return reg_b
!= other
.reg_b
; }
376 constexpr bool operator <(PhysReg other
) const { return reg_b
< other
.reg_b
; }
377 constexpr PhysReg
advance(int bytes
) const { PhysReg res
= *this; res
.reg_b
+= bytes
; return res
; }
382 /* helper expressions for special registers */
383 static constexpr PhysReg m0
{124};
384 static constexpr PhysReg vcc
{106};
385 static constexpr PhysReg vcc_hi
{107};
386 static constexpr PhysReg tba
{108}; /* GFX6-GFX8 */
387 static constexpr PhysReg tma
{110}; /* GFX6-GFX8 */
388 static constexpr PhysReg ttmp0
{112};
389 static constexpr PhysReg ttmp1
{113};
390 static constexpr PhysReg ttmp2
{114};
391 static constexpr PhysReg ttmp3
{115};
392 static constexpr PhysReg ttmp4
{116};
393 static constexpr PhysReg ttmp5
{117};
394 static constexpr PhysReg ttmp6
{118};
395 static constexpr PhysReg ttmp7
{119};
396 static constexpr PhysReg ttmp8
{120};
397 static constexpr PhysReg ttmp9
{121};
398 static constexpr PhysReg ttmp10
{122};
399 static constexpr PhysReg ttmp11
{123};
400 static constexpr PhysReg sgpr_null
{125}; /* GFX10+ */
401 static constexpr PhysReg exec
{126};
402 static constexpr PhysReg exec_lo
{126};
403 static constexpr PhysReg exec_hi
{127};
404 static constexpr PhysReg vccz
{251};
405 static constexpr PhysReg execz
{252};
406 static constexpr PhysReg scc
{253};
410 * Initially, each Operand refers to either
411 * a temporary virtual register
412 * or to a constant value
413 * Temporary registers get mapped to physical register during RA
414 * Constant values are inlined into the instruction sequence.
420 : reg_(PhysReg
{128}), isTemp_(false), isFixed_(true), isConstant_(false),
421 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
422 isLateKill_(false) {}
424 explicit Operand(Temp r
) noexcept
431 setFixed(PhysReg
{128});
434 explicit Operand(uint8_t v
) noexcept
436 /* 8-bit constants are only used for copies and copies from any 8-bit
437 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
438 * to be inline constants. */
442 setFixed(PhysReg
{0u});
444 explicit Operand(uint16_t v
) noexcept
450 setFixed(PhysReg
{128u + v
});
451 else if (v
>= 0xFFF0) /* [-16 .. -1] */
452 setFixed(PhysReg
{192u + (0xFFFF - v
)});
453 else if (v
== 0x3800) /* 0.5 */
454 setFixed(PhysReg
{240});
455 else if (v
== 0xB800) /* -0.5 */
456 setFixed(PhysReg
{241});
457 else if (v
== 0x3C00) /* 1.0 */
458 setFixed(PhysReg
{242});
459 else if (v
== 0xBC00) /* -1.0 */
460 setFixed(PhysReg
{243});
461 else if (v
== 0x4000) /* 2.0 */
462 setFixed(PhysReg
{244});
463 else if (v
== 0xC000) /* -2.0 */
464 setFixed(PhysReg
{245});
465 else if (v
== 0x4400) /* 4.0 */
466 setFixed(PhysReg
{246});
467 else if (v
== 0xC400) /* -4.0 */
468 setFixed(PhysReg
{247});
469 else if (v
== 0x3118) /* 1/2 PI */
470 setFixed(PhysReg
{248});
471 else /* Literal Constant */
472 setFixed(PhysReg
{255});
474 explicit Operand(uint32_t v
, bool is64bit
= false) noexcept
478 constSize
= is64bit
? 3 : 2;
480 setFixed(PhysReg
{128 + v
});
481 else if (v
>= 0xFFFFFFF0) /* [-16 .. -1] */
482 setFixed(PhysReg
{192 - v
});
483 else if (v
== 0x3f000000) /* 0.5 */
484 setFixed(PhysReg
{240});
485 else if (v
== 0xbf000000) /* -0.5 */
486 setFixed(PhysReg
{241});
487 else if (v
== 0x3f800000) /* 1.0 */
488 setFixed(PhysReg
{242});
489 else if (v
== 0xbf800000) /* -1.0 */
490 setFixed(PhysReg
{243});
491 else if (v
== 0x40000000) /* 2.0 */
492 setFixed(PhysReg
{244});
493 else if (v
== 0xc0000000) /* -2.0 */
494 setFixed(PhysReg
{245});
495 else if (v
== 0x40800000) /* 4.0 */
496 setFixed(PhysReg
{246});
497 else if (v
== 0xc0800000) /* -4.0 */
498 setFixed(PhysReg
{247});
499 else { /* Literal Constant */
500 assert(!is64bit
&& "attempt to create a 64-bit literal constant");
501 setFixed(PhysReg
{255});
504 explicit Operand(uint64_t v
) noexcept
509 data_
.i
= (uint32_t) v
;
510 setFixed(PhysReg
{128 + (uint32_t) v
});
511 } else if (v
>= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
512 data_
.i
= (uint32_t) v
;
513 setFixed(PhysReg
{192 - (uint32_t) v
});
514 } else if (v
== 0x3FE0000000000000) { /* 0.5 */
515 data_
.i
= 0x3f000000;
516 setFixed(PhysReg
{240});
517 } else if (v
== 0xBFE0000000000000) { /* -0.5 */
518 data_
.i
= 0xbf000000;
519 setFixed(PhysReg
{241});
520 } else if (v
== 0x3FF0000000000000) { /* 1.0 */
521 data_
.i
= 0x3f800000;
522 setFixed(PhysReg
{242});
523 } else if (v
== 0xBFF0000000000000) { /* -1.0 */
524 data_
.i
= 0xbf800000;
525 setFixed(PhysReg
{243});
526 } else if (v
== 0x4000000000000000) { /* 2.0 */
527 data_
.i
= 0x40000000;
528 setFixed(PhysReg
{244});
529 } else if (v
== 0xC000000000000000) { /* -2.0 */
530 data_
.i
= 0xc0000000;
531 setFixed(PhysReg
{245});
532 } else if (v
== 0x4010000000000000) { /* 4.0 */
533 data_
.i
= 0x40800000;
534 setFixed(PhysReg
{246});
535 } else if (v
== 0xC010000000000000) { /* -4.0 */
536 data_
.i
= 0xc0800000;
537 setFixed(PhysReg
{247});
538 } else { /* Literal Constant: we don't know if it is a long or double.*/
540 assert(false && "attempt to create a 64-bit literal constant");
543 explicit Operand(RegClass type
) noexcept
546 data_
.temp
= Temp(0, type
);
547 setFixed(PhysReg
{128});
549 explicit Operand(PhysReg reg
, RegClass type
) noexcept
551 data_
.temp
= Temp(0, type
);
555 constexpr bool isTemp() const noexcept
560 constexpr void setTemp(Temp t
) noexcept
{
561 assert(!isConstant_
);
566 constexpr Temp
getTemp() const noexcept
571 constexpr uint32_t tempId() const noexcept
573 return data_
.temp
.id();
576 constexpr bool hasRegClass() const noexcept
578 return isTemp() || isUndefined();
581 constexpr RegClass
regClass() const noexcept
583 return data_
.temp
.regClass();
586 constexpr unsigned bytes() const noexcept
589 return 1 << constSize
;
591 return data_
.temp
.bytes();
594 constexpr unsigned size() const noexcept
597 return constSize
> 2 ? 2 : 1;
599 return data_
.temp
.size();
602 constexpr bool isFixed() const noexcept
607 constexpr PhysReg
physReg() const noexcept
612 constexpr void setFixed(PhysReg reg
) noexcept
614 isFixed_
= reg
!= unsigned(-1);
618 constexpr bool isConstant() const noexcept
623 constexpr bool isLiteral() const noexcept
625 return isConstant() && reg_
== 255;
628 constexpr bool isUndefined() const noexcept
633 constexpr uint32_t constantValue() const noexcept
638 constexpr bool constantEquals(uint32_t cmp
) const noexcept
640 return isConstant() && constantValue() == cmp
;
643 constexpr uint64_t constantValue64(bool signext
=false) const noexcept
645 if (constSize
== 3) {
648 else if (reg_
<= 208)
649 return 0xFFFFFFFFFFFFFFFF - (reg_
- 193);
653 return 0x3FE0000000000000;
655 return 0xBFE0000000000000;
657 return 0x3FF0000000000000;
659 return 0xBFF0000000000000;
661 return 0x4000000000000000;
663 return 0xC000000000000000;
665 return 0x4010000000000000;
667 return 0xC010000000000000;
669 } else if (constSize
== 1) {
670 return (signext
&& (data_
.i
& 0x8000u
) ? 0xffffffffffff0000ull
: 0ull) | data_
.i
;
671 } else if (constSize
== 0) {
672 return (signext
&& (data_
.i
& 0x80u
) ? 0xffffffffffffff00ull
: 0ull) | data_
.i
;
674 return (signext
&& (data_
.i
& 0x80000000u
) ? 0xffffffff00000000ull
: 0ull) | data_
.i
;
677 constexpr bool isOfType(RegType type
) const noexcept
679 return hasRegClass() && regClass().type() == type
;
682 /* Indicates that the killed operand's live range intersects with the
683 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
684 * not set by liveness analysis. */
685 constexpr void setLateKill(bool flag
) noexcept
690 constexpr bool isLateKill() const noexcept
695 constexpr void setKill(bool flag
) noexcept
702 constexpr bool isKill() const noexcept
704 return isKill_
|| isFirstKill();
707 constexpr void setFirstKill(bool flag
) noexcept
714 /* When there are multiple operands killing the same temporary,
715 * isFirstKill() is only returns true for the first one. */
716 constexpr bool isFirstKill() const noexcept
721 constexpr bool isKillBeforeDef() const noexcept
723 return isKill() && !isLateKill();
726 constexpr bool isFirstKillBeforeDef() const noexcept
728 return isFirstKill() && !isLateKill();
731 constexpr bool operator == (Operand other
) const noexcept
733 if (other
.size() != size())
735 if (isFixed() != other
.isFixed() || isKillBeforeDef() != other
.isKillBeforeDef())
737 if (isFixed() && other
.isFixed() && physReg() != other
.physReg())
740 return other
.isLiteral() && other
.constantValue() == constantValue();
741 else if (isConstant())
742 return other
.isConstant() && other
.physReg() == physReg();
743 else if (isUndefined())
744 return other
.isUndefined() && other
.regClass() == regClass();
746 return other
.isTemp() && other
.getTemp() == getTemp();
752 Temp temp
= Temp(0, s1
);
759 uint8_t isConstant_
:1;
762 uint8_t isFirstKill_
:1;
764 uint8_t isLateKill_
:1;
766 /* can't initialize bit-fields in c++11, so work around using a union */
767 uint16_t control_
= 0;
773 * Definitions are the results of Instructions
774 * and refer to temporary virtual registers
775 * which are later mapped to physical registers
777 class Definition final
780 constexpr Definition() : temp(Temp(0, s1
)), reg_(0), isFixed_(0), hasHint_(0),
781 isKill_(0), isPrecise_(0), isNUW_(0) {}
782 Definition(uint32_t index
, RegClass type
) noexcept
783 : temp(index
, type
) {}
784 explicit Definition(Temp tmp
) noexcept
786 Definition(PhysReg reg
, RegClass type
) noexcept
787 : temp(Temp(0, type
))
791 Definition(uint32_t tmpId
, PhysReg reg
, RegClass type
) noexcept
792 : temp(Temp(tmpId
, type
))
797 constexpr bool isTemp() const noexcept
802 constexpr Temp
getTemp() const noexcept
807 constexpr uint32_t tempId() const noexcept
812 constexpr void setTemp(Temp t
) noexcept
{
816 constexpr RegClass
regClass() const noexcept
818 return temp
.regClass();
821 constexpr unsigned bytes() const noexcept
826 constexpr unsigned size() const noexcept
831 constexpr bool isFixed() const noexcept
836 constexpr PhysReg
physReg() const noexcept
841 constexpr void setFixed(PhysReg reg
) noexcept
847 constexpr void setHint(PhysReg reg
) noexcept
853 constexpr bool hasHint() const noexcept
858 constexpr void setKill(bool flag
) noexcept
863 constexpr bool isKill() const noexcept
868 constexpr void setPrecise(bool precise
) noexcept
870 isPrecise_
= precise
;
873 constexpr bool isPrecise() const noexcept
878 /* No Unsigned Wrap */
879 constexpr void setNUW(bool nuw
) noexcept
884 constexpr bool isNUW() const noexcept
890 Temp temp
= Temp(0, s1
);
897 uint8_t isPrecise_
:1;
900 /* can't initialize bit-fields in c++11, so work around using a union */
901 uint8_t control_
= 0;
912 aco::span
<Operand
> operands
;
913 aco::span
<Definition
> definitions
;
915 constexpr bool isVALU() const noexcept
917 return ((uint16_t) format
& (uint16_t) Format::VOP1
) == (uint16_t) Format::VOP1
918 || ((uint16_t) format
& (uint16_t) Format::VOP2
) == (uint16_t) Format::VOP2
919 || ((uint16_t) format
& (uint16_t) Format::VOPC
) == (uint16_t) Format::VOPC
920 || ((uint16_t) format
& (uint16_t) Format::VOP3A
) == (uint16_t) Format::VOP3A
921 || ((uint16_t) format
& (uint16_t) Format::VOP3B
) == (uint16_t) Format::VOP3B
922 || format
== Format::VOP3P
;
925 constexpr bool isSALU() const noexcept
927 return format
== Format::SOP1
||
928 format
== Format::SOP2
||
929 format
== Format::SOPC
||
930 format
== Format::SOPK
||
931 format
== Format::SOPP
;
934 constexpr bool isVMEM() const noexcept
936 return format
== Format::MTBUF
||
937 format
== Format::MUBUF
||
938 format
== Format::MIMG
;
941 constexpr bool isDPP() const noexcept
943 return (uint16_t) format
& (uint16_t) Format::DPP
;
946 constexpr bool isVOP3() const noexcept
948 return ((uint16_t) format
& (uint16_t) Format::VOP3A
) ||
949 ((uint16_t) format
& (uint16_t) Format::VOP3B
);
952 constexpr bool isSDWA() const noexcept
954 return (uint16_t) format
& (uint16_t) Format::SDWA
;
957 constexpr bool isFlatOrGlobal() const noexcept
959 return format
== Format::FLAT
|| format
== Format::GLOBAL
;
962 constexpr bool usesModifiers() const noexcept
;
964 constexpr bool reads_exec() const noexcept
966 for (const Operand
& op
: operands
) {
967 if (op
.isFixed() && op
.physReg() == exec
)
973 static_assert(sizeof(Instruction
) == 16, "Unexpected padding");
975 struct SOPK_instruction
: public Instruction
{
979 static_assert(sizeof(SOPK_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
981 struct SOPP_instruction
: public Instruction
{
985 static_assert(sizeof(SOPP_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
987 struct SOPC_instruction
: public Instruction
{
989 static_assert(sizeof(SOPC_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
991 struct SOP1_instruction
: public Instruction
{
993 static_assert(sizeof(SOP1_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
995 struct SOP2_instruction
: public Instruction
{
997 static_assert(sizeof(SOP2_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
1000 * Scalar Memory Format:
1001 * For s_(buffer_)load_dword*:
1002 * Operand(0): SBASE - SGPR-pair which provides base address
1003 * Operand(1): Offset - immediate (un)signed offset or SGPR
1004 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1005 * Operand(n-1): SOffset - SGPR offset (Vega only)
1007 * Having no operands is also valid for instructions such as s_dcache_inv.
1010 struct SMEM_instruction
: public Instruction
{
1011 memory_sync_info sync
;
1012 bool glc
: 1; /* VI+: globally coherent */
1013 bool dlc
: 1; /* NAVI: device level coherent */
1014 bool nv
: 1; /* VEGA only: Non-volatile */
1015 bool disable_wqm
: 1;
1016 bool prevent_overflow
: 1; /* avoid overflow when combining additions */
1017 uint32_t padding
: 3;
1019 static_assert(sizeof(SMEM_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
1021 struct VOP1_instruction
: public Instruction
{
1023 static_assert(sizeof(VOP1_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
1025 struct VOP2_instruction
: public Instruction
{
1027 static_assert(sizeof(VOP2_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
1029 struct VOPC_instruction
: public Instruction
{
1031 static_assert(sizeof(VOPC_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
1033 struct VOP3A_instruction
: public Instruction
{
1039 uint32_t padding
: 9;
1041 static_assert(sizeof(VOP3A_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1043 struct VOP3P_instruction
: public Instruction
{
1046 uint8_t opsel_lo
: 3;
1047 uint8_t opsel_hi
: 3;
1049 uint32_t padding
: 9;
1051 static_assert(sizeof(VOP3P_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1054 * Data Parallel Primitives Format:
1055 * This format can be used for VOP1, VOP2 or VOPC instructions.
1056 * The swizzle applies to the src0 operand.
1059 struct DPP_instruction
: public Instruction
{
1063 uint8_t row_mask
: 4;
1064 uint8_t bank_mask
: 4;
1065 bool bound_ctrl
: 1;
1066 uint32_t padding
: 7;
1068 static_assert(sizeof(DPP_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1070 enum sdwa_sel
: uint8_t {
1074 sdwa_asuint
= 0x7 | 0x10,
1082 /* specific values */
1087 sdwa_uword0
= sdwa_isword
| 0,
1088 sdwa_uword1
= sdwa_isword
| 1,
1091 sdwa_sbyte0
= sdwa_ubyte0
| sdwa_sext
,
1092 sdwa_sbyte1
= sdwa_ubyte1
| sdwa_sext
,
1093 sdwa_sbyte2
= sdwa_ubyte2
| sdwa_sext
,
1094 sdwa_sbyte3
= sdwa_ubyte3
| sdwa_sext
,
1095 sdwa_sword0
= sdwa_uword0
| sdwa_sext
,
1096 sdwa_sword1
= sdwa_uword1
| sdwa_sext
,
1097 sdwa_sdword
= sdwa_udword
| sdwa_sext
,
1099 /* register-allocated */
1100 sdwa_ubyte
= 1 | sdwa_isra
,
1101 sdwa_uword
= 2 | sdwa_isra
,
1102 sdwa_sbyte
= sdwa_ubyte
| sdwa_sext
,
1103 sdwa_sword
= sdwa_uword
| sdwa_sext
,
1107 * Sub-Dword Addressing Format:
1108 * This format can be used for VOP1, VOP2 or VOPC instructions.
1110 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1111 * the definition doesn't have to be VCC on GFX9+.
1114 struct SDWA_instruction
: public Instruction
{
1115 /* these destination modifiers aren't available with VOPC except for
1121 bool dst_preserve
: 1;
1123 uint8_t omod
: 2; /* GFX9+ */
1124 uint32_t padding
: 4;
1126 static_assert(sizeof(SDWA_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1128 struct Interp_instruction
: public Instruction
{
1133 static_assert(sizeof(Interp_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
1136 * Local and Global Data Sharing instructions
1137 * Operand(0): ADDR - VGPR which supplies the address.
1138 * Operand(1): DATA0 - First data VGPR.
1139 * Operand(2): DATA1 - Second data VGPR.
1140 * Operand(n-1): M0 - LDS size.
1141 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1144 struct DS_instruction
: public Instruction
{
1145 memory_sync_info sync
;
1151 static_assert(sizeof(DS_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1154 * Vector Memory Untyped-buffer Instructions
1155 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1156 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1157 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1158 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1161 struct MUBUF_instruction
: public Instruction
{
1162 memory_sync_info sync
;
1163 bool offen
: 1; /* Supply an offset from VGPR (VADDR) */
1164 bool idxen
: 1; /* Supply an index from VGPR (VADDR) */
1165 bool addr64
: 1; /* SI, CIK: Address size is 64-bit */
1166 bool glc
: 1; /* globally coherent */
1167 bool dlc
: 1; /* NAVI: device level coherent */
1168 bool slc
: 1; /* system level coherent */
1169 bool tfe
: 1; /* texture fail enable */
1170 bool lds
: 1; /* Return read-data to LDS instead of VGPRs */
1171 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
1172 uint16_t offset
: 12; /* Unsigned byte offset - 12 bit */
1174 uint32_t padding1
: 18;
1176 static_assert(sizeof(MUBUF_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1179 * Vector Memory Typed-buffer Instructions
1180 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1181 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1182 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1183 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1186 struct MTBUF_instruction
: public Instruction
{
1187 memory_sync_info sync
;
1188 uint8_t dfmt
: 4; /* Data Format of data in memory buffer */
1189 uint8_t nfmt
: 3; /* Numeric format of data in memory */
1190 bool offen
: 1; /* Supply an offset from VGPR (VADDR) */
1191 bool idxen
: 1; /* Supply an index from VGPR (VADDR) */
1192 bool glc
: 1; /* globally coherent */
1193 bool dlc
: 1; /* NAVI: device level coherent */
1194 bool slc
: 1; /* system level coherent */
1195 bool tfe
: 1; /* texture fail enable */
1196 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
1197 uint32_t padding
: 10;
1198 uint16_t offset
; /* Unsigned byte offset - 12 bit */
1200 static_assert(sizeof(MTBUF_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1203 * Vector Memory Image Instructions
1204 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1205 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1206 * or VDATA - Vector GPR for write data.
1207 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1208 * Definition(0): VDATA - Vector GPR for read result.
1211 struct MIMG_instruction
: public Instruction
{
1212 memory_sync_info sync
;
1213 uint8_t dmask
; /* Data VGPR enable mask */
1214 uint8_t dim
: 3; /* NAVI: dimensionality */
1215 bool unrm
: 1; /* Force address to be un-normalized */
1216 bool dlc
: 1; /* NAVI: device level coherent */
1217 bool glc
: 1; /* globally coherent */
1218 bool slc
: 1; /* system level coherent */
1219 bool tfe
: 1; /* texture fail enable */
1220 bool da
: 1; /* declare an array */
1221 bool lwe
: 1; /* Force data to be un-normalized */
1222 bool r128
: 1; /* NAVI: Texture resource size */
1223 bool a16
: 1; /* VEGA, NAVI: Address components are 16-bits */
1224 bool d16
: 1; /* Convert 32-bit data to 16-bit data */
1225 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
1226 uint32_t padding
: 18;
1228 static_assert(sizeof(MIMG_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1231 * Flat/Scratch/Global Instructions
1234 * Operand(2) / Definition(0): DATA/VDST
1237 struct FLAT_instruction
: public Instruction
{
1238 memory_sync_info sync
;
1239 bool slc
: 1; /* system level coherent */
1240 bool glc
: 1; /* globally coherent */
1241 bool dlc
: 1; /* NAVI: device level coherent */
1244 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
1245 uint32_t padding0
: 2;
1246 uint16_t offset
; /* Vega/Navi only */
1249 static_assert(sizeof(FLAT_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1251 struct Export_instruction
: public Instruction
{
1252 uint8_t enabled_mask
;
1254 bool compressed
: 1;
1256 bool valid_mask
: 1;
1257 uint32_t padding
: 13;
1259 static_assert(sizeof(Export_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
1261 struct Pseudo_instruction
: public Instruction
{
1262 PhysReg scratch_sgpr
; /* might not be valid if it's not needed */
1266 static_assert(sizeof(Pseudo_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
/* NOTE(review): this struct is truncated in this copy — original lines
 * 1272-1274 (the end of the doc comment, the branch-target member array and
 * the closing brace) are missing; restore from upstream. */
1268 struct Pseudo_branch_instruction
: public Instruction
{
1269 /* target[0] is the block index of the branch target.
1270 * For conditional branches, target[1] contains the fall-through alternative.
1271 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1275 static_assert(sizeof(Pseudo_branch_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1277 struct Pseudo_barrier_instruction
: public Instruction
{
1278 memory_sync_info sync
;
1279 sync_scope exec_scope
;
1281 static_assert(sizeof(Pseudo_barrier_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
/* Operations implemented by the subgroup-reduction pseudo instructions.
 * Each operation comes in variants for the operand bit-width (8/16/32/64). */
1283 enum ReduceOp
: uint16_t {
/* integer add */
1284 iadd8
, iadd16
, iadd32
, iadd64
,
/* integer multiply */
1285 imul8
, imul16
, imul32
, imul64
,
/* float add/multiply */
1286 fadd16
, fadd32
, fadd64
,
1287 fmul16
, fmul32
, fmul64
,
/* signed min/max */
1288 imin8
, imin16
, imin32
, imin64
,
1289 imax8
, imax16
, imax32
, imax64
,
/* unsigned min/max */
1290 umin8
, umin16
, umin32
, umin64
,
1291 umax8
, umax16
, umax32
, umax64
,
/* float min/max */
1292 fmin16
, fmin32
, fmin64
,
1293 fmax16
, fmax32
, fmax64
,
/* bitwise and/or/xor */
1294 iand8
, iand16
, iand32
, iand64
,
1295 ior8
, ior16
, ior32
, ior64
,
1296 ixor8
, ixor16
, ixor32
, ixor64
,
/* NOTE(review): the tail of this enum (original lines 1297+, any trailing
 * enumerators and the closing brace) is missing in this copy. */
1300 * Subgroup Reduction Instructions, everything except for the data to be
1301 * reduced and the result as inserted by setup_reduce_temp().
1302 * Operand(0): data to be reduced
1303 * Operand(1): reduce temporary
1304 * Operand(2): vector temporary
1305 * Definition(0): result
1306 * Definition(1): scalar temporary
1307 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1308 * Definition(3): scc clobber
1309 * Definition(4): vcc clobber
1312 struct Pseudo_reduction_instruction
: public Instruction
{
1314 uint16_t cluster_size
; // must be 0 for scans
1316 static_assert(sizeof(Pseudo_reduction_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
/* NOTE(review): truncated copy — the opening of the doc comment (orig.
 * 1299), the reduction-operation member (orig. 1313) and the closing brace
 * (orig. 1315) are missing; restore from upstream. */
/* Deleter paired with create_instruction(): instruction storage comes from
 * calloc, so it must be released with free rather than delete. */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* Owning pointer type for instructions allocated by create_instruction(). */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1327 template<typename T
>
1328 T
* create_instruction(aco_opcode opcode
, Format format
, uint32_t num_operands
, uint32_t num_definitions
)
1330 std::size_t size
= sizeof(T
) + num_operands
* sizeof(Operand
) + num_definitions
* sizeof(Definition
);
1331 char *data
= (char*) calloc(1, size
);
1332 T
* inst
= (T
*) data
;
1334 inst
->opcode
= opcode
;
1335 inst
->format
= format
;
1337 uint16_t operands_offset
= data
+ sizeof(T
) - (char*)&inst
->operands
;
1338 inst
->operands
= aco::span
<Operand
>(operands_offset
, num_operands
);
1339 uint16_t definitions_offset
= (char*)inst
->operands
.end() - (char*)&inst
->definitions
;
1340 inst
->definitions
= aco::span
<Definition
>(definitions_offset
, num_definitions
);
1345 constexpr bool Instruction::usesModifiers() const noexcept
1347 if (isDPP() || isSDWA())
1350 if (format
== Format::VOP3P
) {
1351 const VOP3P_instruction
*vop3p
= static_cast<const VOP3P_instruction
*>(this);
1352 for (unsigned i
= 0; i
< operands
.size(); i
++) {
1353 if (vop3p
->neg_lo
[i
] || vop3p
->neg_hi
[i
])
1356 return vop3p
->opsel_lo
|| vop3p
->opsel_hi
|| vop3p
->clamp
;
1357 } else if (isVOP3()) {
1358 const VOP3A_instruction
*vop3
= static_cast<const VOP3A_instruction
*>(this);
1359 for (unsigned i
= 0; i
< operands
.size(); i
++) {
1360 if (vop3
->abs
[i
] || vop3
->neg
[i
])
1363 return vop3
->opsel
|| vop3
->clamp
|| vop3
->omod
;
1368 constexpr bool is_phi(Instruction
* instr
)
1370 return instr
->opcode
== aco_opcode::p_phi
|| instr
->opcode
== aco_opcode::p_linear_phi
;
1373 static inline bool is_phi(aco_ptr
<Instruction
>& instr
)
1375 return is_phi(instr
.get());
/* Returns the memory synchronization info attached to an instruction. */
1378 memory_sync_info
get_sync_info(const Instruction
* instr
);
/* Presumably: true if instr can be removed given the per-temp use counts in
 * `uses` — TODO confirm against the definition. */
1380 bool is_dead(const std::vector
<uint16_t>& uses
, Instruction
*instr
);
/* Hardware capability queries for the opsel and SDWA encodings. */
1382 bool can_use_opsel(chip_class chip
, aco_opcode op
, int idx
, bool high
);
1383 bool can_use_SDWA(chip_class chip
, const aco_ptr
<Instruction
>& instr
);
1384 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1385 aco_ptr
<Instruction
> convert_to_SDWA(chip_class chip
, aco_ptr
<Instruction
>& instr
);
/* NOTE(review): the enum header (orig. ~1386-1387) is missing in this copy;
 * these enumerators are bit flags OR'd together to classify a basic block. */
1388 /* uniform indicates that leaving this block,
1389 * all active lanes stay active */
1390 block_kind_uniform
= 1 << 0,
1391 block_kind_top_level
= 1 << 1,
/* loop structure markers */
1392 block_kind_loop_preheader
= 1 << 2,
1393 block_kind_loop_header
= 1 << 3,
1394 block_kind_loop_exit
= 1 << 4,
1395 block_kind_continue
= 1 << 5,
1396 block_kind_break
= 1 << 6,
1397 block_kind_continue_or_break
= 1 << 7,
1398 block_kind_discard
= 1 << 8,
/* divergent control-flow structure markers */
1399 block_kind_branch
= 1 << 9,
1400 block_kind_merge
= 1 << 10,
1401 block_kind_invert
= 1 << 11,
1402 block_kind_uses_discard_if
= 1 << 12,
1403 block_kind_needs_lowering
= 1 << 13,
1404 block_kind_uses_demote
= 1 << 14,
1405 block_kind_export_end
= 1 << 15,
/* NOTE(review): this copy of RegisterDemand is truncated — the vgpr/sgpr
 * member declarations (orig. 1413-1415), the bodies of the compound
 * assignment operators (orig. 1440-1466, partially) and several closing
 * braces are missing; restore from upstream before compiling. */
1409 struct RegisterDemand
{
/* A pair of register counts (vgpr, sgpr); default is zero of both. */
1410 constexpr RegisterDemand() = default;
1411 constexpr RegisterDemand(const int16_t v
, const int16_t s
) noexcept
1412 : vgpr
{v
}, sgpr
{s
} {}
/* Equality compares both components. */
1416 constexpr friend bool operator==(const RegisterDemand a
, const RegisterDemand b
) noexcept
{
1417 return a
.vgpr
== b
.vgpr
&& a
.sgpr
== b
.sgpr
;
/* True if either component exceeds the corresponding component of other. */
1420 constexpr bool exceeds(const RegisterDemand other
) const noexcept
{
1421 return vgpr
> other
.vgpr
|| sgpr
> other
.sgpr
;
/* Adding a Temp raises the demand of that temp's register file by its size. */
1424 constexpr RegisterDemand
operator+(const Temp t
) const noexcept
{
1425 if (t
.type() == RegType::sgpr
)
1426 return RegisterDemand( vgpr
, sgpr
+ t
.size() );
1428 return RegisterDemand( vgpr
+ t
.size(), sgpr
);
/* Component-wise sum and difference of two demands. */
1431 constexpr RegisterDemand
operator+(const RegisterDemand other
) const noexcept
{
1432 return RegisterDemand(vgpr
+ other
.vgpr
, sgpr
+ other
.sgpr
);
1435 constexpr RegisterDemand
operator-(const RegisterDemand other
) const noexcept
{
1436 return RegisterDemand(vgpr
- other
.vgpr
, sgpr
- other
.sgpr
);
/* Compound-assignment variants (bodies missing in this copy). */
1439 constexpr RegisterDemand
& operator+=(const RegisterDemand other
) noexcept
{
1445 constexpr RegisterDemand
& operator-=(const RegisterDemand other
) noexcept
{
1451 constexpr RegisterDemand
& operator+=(const Temp t
) noexcept
{
1452 if (t
.type() == RegType::sgpr
)
1459 constexpr RegisterDemand
& operator-=(const Temp t
) noexcept
{
1460 if (t
.type() == RegType::sgpr
)
/* Raise each component to at least the other demand's value (join). */
1467 constexpr void update(const RegisterDemand other
) noexcept
{
1468 vgpr
= std::max(vgpr
, other
.vgpr
);
1469 sgpr
= std::max(sgpr
, other
.sgpr
);
/* NOTE(review): this is the interior of the basic-block struct; its header
 * and leading members (orig. ~1470-1477) are missing in this copy. */
1478 unsigned offset
= 0;
/* Instructions of this block, in order. */
1479 std::vector
<aco_ptr
<Instruction
>> instructions
;
/* CFG edges: predecessor/successor block indices, kept separately for the
 * logical and the linear CFG. */
1480 std::vector
<unsigned> logical_preds
;
1481 std::vector
<unsigned> linear_preds
;
1482 std::vector
<unsigned> logical_succs
;
1483 std::vector
<unsigned> linear_succs
;
1484 RegisterDemand register_demand
= RegisterDemand();
1485 uint16_t loop_nest_depth
= 0;
/* Immediate dominators (block indices); -1 while not yet computed. */
1487 int logical_idom
= -1;
1488 int linear_idom
= -1;
1489 Temp live_out_exec
= Temp();
1491 /* this information is needed for predecessors to blocks with phis when
1492 * moving out of ssa */
1493 bool scc_live_out
= false;
1494 PhysReg scratch_sgpr
= PhysReg(); /* only needs to be valid if scc_live_out != false */
/* NOTE(review): `index` is initialized below but its declaration is not
 * visible in this copy — it presumably sits among the missing members. */
1496 Block(unsigned idx
) : index(idx
) {}
1497 Block() : index(0) {}
/* A shader stage descriptor: the low bits name the software (API) shader
 * types merged into this program, the high bits name the hardware stage
 * they execute on. Program::stage holds one of the combinations below. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* GFX6-8 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
/* NOTE(review): truncated copy of the per-compilation statistics enum — its
 * header and some enumerators (orig. 1544-1550 and 1557-1558, presumably
 * including the num_statistics terminator used by Program::statistics) are
 * missing; restore from upstream. */
1547 statistic_instructions
,
1551 statistic_vmem_clauses
,
1552 statistic_smem_clauses
,
1553 statistic_vmem_score
,
1554 statistic_smem_score
,
1555 statistic_sgpr_presched
,
1556 statistic_vgpr_presched
,
/* NOTE(review): this copy of `class Program` is truncated — access
 * specifiers, the debug-callback wrapper (orig. 1602-1609), method braces
 * and the class's closing brace are among the missing original lines;
 * restore from upstream before compiling. */
1560 class Program final
{
/* FP behavior that will be applied to newly created blocks. */
1562 float_mode next_fp_mode
;
/* All basic blocks of the program. */
1563 std::vector
<Block
> blocks
;
1564 RegisterDemand max_reg_demand
= RegisterDemand();
1565 uint16_t num_waves
= 0;
1566 uint16_t max_waves
= 0; /* maximum number of waves, regardless of register usage */
/* Driver-provided target and shader descriptions. */
1567 ac_shader_config
* config
;
1568 struct radv_shader_info
*info
;
1569 enum chip_class chip_class
;
1570 enum radeon_family family
;
1573 Stage stage
; /* Stage */
1574 bool needs_exact
= false; /* there exists an instruction with disable_wqm = true */
1575 bool needs_wqm
= false; /* there exists a p_wqm instruction */
1576 bool wb_smem_l1_on_end
= false;
1578 std::vector
<uint8_t> constant_data
;
1579 Temp private_segment_buffer
;
1580 Temp scratch_offset
;
/* Register/LDS budgets and allocation granularity for the target. */
1582 uint16_t min_waves
= 0;
1583 uint16_t lds_alloc_granule
;
1584 uint32_t lds_limit
; /* in bytes */
1585 bool has_16bank_lds
;
1586 uint16_t vgpr_limit
;
1587 uint16_t sgpr_limit
;
1588 uint16_t physical_sgprs
;
1589 uint16_t sgpr_alloc_granule
; /* minus one. must be power of two */
1590 uint16_t vgpr_alloc_granule
; /* minus one. must be power of two */
1591 unsigned workgroup_size
; /* if known; otherwise UINT_MAX */
1593 bool xnack_enabled
= false;
1594 bool sram_ecc_enabled
= false;
1595 bool has_fast_fma32
= false;
1597 bool needs_vcc
= false;
1598 bool needs_flat_scr
= false;
/* When set, passes accumulate counters into the statistics array. */
1600 bool collect_statistics
= false;
1601 uint32_t statistics
[num_statistics
];
/* Debug-message callback (declared inside a wrapper struct upstream). */
1604 void (*func
)(void *private_data
,
1605 enum radv_compiler_debug_level level
,
1606 const char *message
);
/* Hands out a fresh temp id; ids are capped at 2^24 - 1. */
1610 uint32_t allocateId()
1612 assert(allocationID
<= 16777215);
1613 return allocationID
++;
/* Returns the next id without consuming it. */
1616 uint32_t peekAllocationId()
1618 return allocationID
;
1621 void setAllocationId(uint32_t id
)
/* Appends a new empty block and stamps it with the current fp mode. */
1626 Block
* create_and_insert_block() {
1627 blocks
.emplace_back(blocks
.size());
1628 blocks
.back().fp_mode
= next_fp_mode
;
1629 return &blocks
.back();
/* Moves an existing block to the end of the block list, re-indexing it. */
1632 Block
* insert_block(Block
&& block
) {
1633 block
.index
= blocks
.size();
1634 block
.fp_mode
= next_fp_mode
;
1635 blocks
.emplace_back(std::move(block
));
1636 return &blocks
.back();
/* Next temp id to hand out; starts at 1. */
1640 uint32_t allocationID
= 1;
/* NOTE(review): truncated copy — the TempHash struct header (orig. 1643),
 * the body of its operator() (orig. 1645-1647) and the header of the `live`
 * result struct (orig. ~1650) are missing; restore from upstream. */
1644 std::size_t operator()(Temp t
) const {
/* Hash functor over Temp, used by TempSet below. */
1648 using TempSet
= std::unordered_set
<Temp
, TempHash
>;
/* Result of liveness analysis (fields of the `live` struct). */
1651 /* live temps out per block */
1652 std::vector
<TempSet
> live_out
;
1653 /* register demand (sgpr/vgpr) per instruction per block */
1654 std::vector
<std::vector
<RegisterDemand
>> register_demand
;
/* Initializes a Program with target and shader information. */
1659 void init_program(Program
*program
, Stage stage
, struct radv_shader_info
*info
,
1660 enum chip_class chip_class
, enum radeon_family family
,
1661 ac_shader_config
*config
);
/* Instruction selection entry points: build ACO IR from NIR shaders. */
1663 void select_program(Program
*program
,
1664 unsigned shader_count
,
1665 struct nir_shader
*const *shaders
,
1666 ac_shader_config
* config
,
1667 struct radv_shader_args
*args
);
1668 void select_gs_copy_shader(Program
*program
, struct nir_shader
*gs_shader
,
1669 ac_shader_config
* config
,
1670 struct radv_shader_args
*args
);
1671 void select_trap_handler_shader(Program
*program
, struct nir_shader
*shader
,
1672 ac_shader_config
* config
,
1673 struct radv_shader_args
*args
);
/* Compiler passes, roughly in pipeline order. */
1675 void lower_wqm(Program
* program
, live
& live_vars
,
1676 const struct radv_nir_compiler_options
*options
);
1677 void lower_phis(Program
* program
);
1678 void calc_min_waves(Program
* program
);
1679 void update_vgpr_sgpr_demand(Program
* program
, const RegisterDemand new_demand
);
/* Computes per-block live-out sets and per-instruction register demand. */
1680 live
live_var_analysis(Program
* program
, const struct radv_nir_compiler_options
*options
);
1681 std::vector
<uint16_t> dead_code_analysis(Program
*program
);
1682 void dominator_tree(Program
* program
);
1683 void insert_exec_mask(Program
*program
);
1684 void value_numbering(Program
* program
);
1685 void optimize(Program
* program
);
1686 void setup_reduce_temp(Program
* program
);
1687 void lower_to_cssa(Program
* program
, live
& live_vars
, const struct radv_nir_compiler_options
*options
);
1688 void register_allocation(Program
*program
, std::vector
<TempSet
>& live_out_per_block
);
1689 void ssa_elimination(Program
* program
);
1690 void lower_to_hw_instr(Program
* program
);
1691 void schedule_program(Program
* program
, live
& live_vars
);
1692 void spill(Program
* program
, live
& live_vars
, const struct radv_nir_compiler_options
*options
);
1693 void insert_wait_states(Program
* program
);
1694 void insert_NOPs(Program
* program
);
/* Encodes the program to machine code; returns the size used. */
1695 unsigned emit_program(Program
* program
, std::vector
<uint32_t>& code
);
1696 void print_asm(Program
*program
, std::vector
<uint32_t>& binary
,
1697 unsigned exec_size
, std::ostream
& out
);
/* Validation helpers used by the debug flags. */
1698 bool validate_ir(Program
* program
);
1699 bool validate_ra(Program
* program
, const struct radv_nir_compiler_options
*options
);
/* NOTE(review): the preprocessor conditional around this perfwarn
 * declaration (orig. 1700, 1702) is missing in this copy. */
1701 void perfwarn(Program
*program
, bool cond
, const char *msg
, Instruction
*instr
=NULL
);
/* Release-build stub; the matching #else/#endif lines (orig. 1702, 1704-1705)
 * are missing in this copy. */
1703 #define perfwarn(program, cond, msg, ...) do {} while(0)
/* Statistics collection hooks (active when Program::collect_statistics). */
1706 void collect_presched_stats(Program
*program
);
1707 void collect_preasm_stats(Program
*program
);
1708 void collect_postasm_stats(Program
*program
, const std::vector
<uint32_t>& code
);
/* Textual IR dumping for debugging. */
1710 void aco_print_instr(const Instruction
*instr
, FILE *output
);
1711 void aco_print_program(const Program
*program
, FILE *output
);
/* printf-style diagnostics; use the macros below so file/line are filled in. */
1713 void _aco_perfwarn(Program
*program
, const char *file
, unsigned line
,
1714 const char *fmt
, ...);
1715 void _aco_err(Program
*program
, const char *file
, unsigned line
,
1716 const char *fmt
, ...);
1718 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
1719 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
1721 /* utilities for dealing with register demand */
1722 RegisterDemand
get_live_changes(aco_ptr
<Instruction
>& instr
);
1723 RegisterDemand
get_temp_registers(aco_ptr
<Instruction
>& instr
);
1724 RegisterDemand
get_demand_before(RegisterDemand demand
, aco_ptr
<Instruction
>& instr
, aco_ptr
<Instruction
>& instr_before
);
1726 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1727 uint16_t get_extra_sgprs(Program
*program
);
1729 /* get the sgpr/vgpr allocation size required to address a number of sgprs/vgprs */
1730 uint16_t get_sgpr_alloc(Program
*program
, uint16_t addressable_sgprs
);
1731 uint16_t get_vgpr_alloc(Program
*program
, uint16_t addressable_vgprs
);
1733 /* return number of addressable sgprs/vgprs for max_waves */
1734 uint16_t get_addr_sgpr_from_waves(Program
*program
, uint16_t max_waves
);
1735 uint16_t get_addr_vgpr_from_waves(Program
*program
, uint16_t max_waves
);
/* NOTE(review): the header of this opcode-info struct (orig. 1737) is
 * missing in this copy. All tables below are indexed by aco_opcode. */
/* Per-opcode hardware opcode numbers for each chip generation. */
1738 const int16_t opcode_gfx7
[static_cast<int>(aco_opcode::num_opcodes
)];
1739 const int16_t opcode_gfx9
[static_cast<int>(aco_opcode::num_opcodes
)];
1740 const int16_t opcode_gfx10
[static_cast<int>(aco_opcode::num_opcodes
)];
/* Per-opcode capability/classification bitsets. */
1741 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> can_use_input_modifiers
;
1742 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> can_use_output_modifiers
;
1743 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> is_atomic
;
/* Per-opcode mnemonic and encoding format. */
1744 const char *name
[static_cast<int>(aco_opcode::num_opcodes
)];
1745 const aco::Format format
[static_cast<int>(aco_opcode::num_opcodes
)];
1746 /* sizes used for input/output modifiers and constants */
1747 const unsigned operand_size
[static_cast<int>(aco_opcode::num_opcodes
)];
1748 const unsigned definition_size
[static_cast<int>(aco_opcode::num_opcodes
)];
/* The single global instance of these tables. */
1751 extern const Info instr_info
;
1755 #endif /* ACO_IR_H */