2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
39 struct radv_nir_compiler_options
;
40 struct radv_shader_info
;
44 extern uint64_t debug_flags
;
48 DEBUG_VALIDATE_RA
= 0x2,
53 * Representation of the instruction's microcode encoding format
54 * Note: Some Vector ALU Formats can be combined, such that:
55 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
56 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
57 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 * (*) The same is applicable for VOP1 and VOPC instructions.
61 enum class Format
: std::uint16_t {
62 /* Pseudo Instruction Format */
64 /* Scalar ALU & Control Formats */
70 /* Scalar Memory Format */
74 /* Vector Memory Buffer Formats */
77 /* Vector Memory Image Format */
88 PSEUDO_REDUCTION
= 18,
90 /* Vector ALU Formats */
98 /* Vector Parameter Interpolation Format */
104 enum barrier_interaction
{
106 barrier_buffer
= 0x1,
108 barrier_atomic
= 0x4,
109 barrier_shared
= 0x8,
113 constexpr Format
asVOP3(Format format
) {
114 return (Format
) ((uint32_t) Format::VOP3
| (uint32_t) format
);
142 /* these are used for WWM and spills to vgpr */
143 v1_linear
= v1
| (1 << 6),
144 v2_linear
= v2
| (1 << 6),
147 RegClass() = default;
148 constexpr RegClass(RC rc
)
150 constexpr RegClass(RegType type
, unsigned size
)
151 : rc((RC
) ((type
== RegType::vgpr
? 1 << 5 : 0) | size
)) {}
153 constexpr operator RC() const { return rc
; }
154 explicit operator bool() = delete;
156 constexpr RegType
type() const { return rc
<= RC::s16
? RegType::sgpr
: RegType::vgpr
; }
157 constexpr unsigned size() const { return (unsigned) rc
& 0x1F; }
158 constexpr bool is_linear() const { return rc
<= RC::s16
|| rc
& (1 << 6); }
159 constexpr RegClass
as_linear() const { return RegClass((RC
) (rc
| (1 << 6))); }
165 /* transitional helper expressions */
166 static constexpr RegClass s1
{RegClass::s1
};
167 static constexpr RegClass s2
{RegClass::s2
};
168 static constexpr RegClass s3
{RegClass::s3
};
169 static constexpr RegClass s4
{RegClass::s4
};
170 static constexpr RegClass s8
{RegClass::s8
};
171 static constexpr RegClass s16
{RegClass::s16
};
172 static constexpr RegClass v1
{RegClass::v1
};
173 static constexpr RegClass v2
{RegClass::v2
};
174 static constexpr RegClass v3
{RegClass::v3
};
175 static constexpr RegClass v4
{RegClass::v4
};
176 static constexpr RegClass v5
{RegClass::v5
};
177 static constexpr RegClass v6
{RegClass::v6
};
178 static constexpr RegClass v7
{RegClass::v7
};
179 static constexpr RegClass v8
{RegClass::v8
};
183 * Each temporary virtual register has a
184 * register class (i.e. size and type)
189 constexpr Temp(uint32_t id
, RegClass cls
) noexcept
190 : id_(id
), reg_class(cls
) {}
192 constexpr uint32_t id() const noexcept
{ return id_
; }
193 constexpr RegClass
regClass() const noexcept
{ return reg_class
; }
195 constexpr unsigned size() const noexcept
{ return reg_class
.size(); }
196 constexpr RegType
type() const noexcept
{ return reg_class
.type(); }
197 constexpr bool is_linear() const noexcept
{ return reg_class
.is_linear(); }
199 constexpr bool operator <(Temp other
) const noexcept
{ return id() < other
.id(); }
200 constexpr bool operator==(Temp other
) const noexcept
{ return id() == other
.id(); }
201 constexpr bool operator!=(Temp other
) const noexcept
{ return id() != other
.id(); }
210 * Represents the physical register for each
211 * Operand and Definition.
214 constexpr PhysReg() = default;
215 explicit constexpr PhysReg(unsigned r
) : reg(r
) {}
216 constexpr operator unsigned() const { return reg
; }
221 /* helper expressions for special registers */
222 static constexpr PhysReg m0
{124};
223 static constexpr PhysReg vcc
{106};
224 static constexpr PhysReg sgpr_null
{125}; /* GFX10+ */
225 static constexpr PhysReg exec
{126};
226 static constexpr PhysReg exec_lo
{126};
227 static constexpr PhysReg exec_hi
{127};
228 static constexpr PhysReg scc
{253};
232 * Initially, each Operand refers to either
233 * a temporary virtual register
234 * or to a constant value
235 * Temporary registers get mapped to physical registers during RA
236 * Constant values are inlined into the instruction sequence.
242 : reg_(PhysReg
{128}), isTemp_(false), isFixed_(true), isConstant_(false),
243 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
245 explicit Operand(Temp r
) noexcept
252 setFixed(PhysReg
{128});
255 explicit Operand(uint32_t v
) noexcept
260 setFixed(PhysReg
{128 + v
});
261 else if (v
>= 0xFFFFFFF0) /* [-16 .. -1] */
262 setFixed(PhysReg
{192 - v
});
263 else if (v
== 0x3f000000) /* 0.5 */
264 setFixed(PhysReg
{240});
265 else if (v
== 0xbf000000) /* -0.5 */
266 setFixed(PhysReg
{241});
267 else if (v
== 0x3f800000) /* 1.0 */
268 setFixed(PhysReg
{242});
269 else if (v
== 0xbf800000) /* -1.0 */
270 setFixed(PhysReg
{243});
271 else if (v
== 0x40000000) /* 2.0 */
272 setFixed(PhysReg
{244});
273 else if (v
== 0xc0000000) /* -2.0 */
274 setFixed(PhysReg
{245});
275 else if (v
== 0x40800000) /* 4.0 */
276 setFixed(PhysReg
{246});
277 else if (v
== 0xc0800000) /* -4.0 */
278 setFixed(PhysReg
{247});
279 else if (v
== 0x3e22f983) /* 1/(2*PI) */
280 setFixed(PhysReg
{248});
281 else /* Literal Constant */
282 setFixed(PhysReg
{255});
284 explicit Operand(uint64_t v
) noexcept
287 is64BitConst_
= true;
289 setFixed(PhysReg
{128 + (uint32_t) v
});
290 else if (v
>= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
291 setFixed(PhysReg
{192 - (uint32_t) v
});
292 else if (v
== 0x3FE0000000000000) /* 0.5 */
293 setFixed(PhysReg
{240});
294 else if (v
== 0xBFE0000000000000) /* -0.5 */
295 setFixed(PhysReg
{241});
296 else if (v
== 0x3FF0000000000000) /* 1.0 */
297 setFixed(PhysReg
{242});
298 else if (v
== 0xBFF0000000000000) /* -1.0 */
299 setFixed(PhysReg
{243});
300 else if (v
== 0x4000000000000000) /* 2.0 */
301 setFixed(PhysReg
{244});
302 else if (v
== 0xC000000000000000) /* -2.0 */
303 setFixed(PhysReg
{245});
304 else if (v
== 0x4010000000000000) /* 4.0 */
305 setFixed(PhysReg
{246});
306 else if (v
== 0xC010000000000000) /* -4.0 */
307 setFixed(PhysReg
{247});
308 else if (v
== 0x3fc45f306dc9c882) /* 1/(2*PI) */
309 setFixed(PhysReg
{248});
310 else { /* Literal Constant: we don't know if it is a long or double.*/
312 assert(false && "attempt to create a 64-bit literal constant");
315 explicit Operand(RegClass type
) noexcept
318 data_
.temp
= Temp(0, type
);
319 setFixed(PhysReg
{128});
321 explicit Operand(PhysReg reg
, RegClass type
) noexcept
323 data_
.temp
= Temp(0, type
);
327 constexpr bool isTemp() const noexcept
332 constexpr void setTemp(Temp t
) noexcept
{
333 assert(!isConstant_
);
338 constexpr Temp
getTemp() const noexcept
343 constexpr uint32_t tempId() const noexcept
345 return data_
.temp
.id();
348 constexpr bool hasRegClass() const noexcept
350 return isTemp() || isUndefined();
353 constexpr RegClass
regClass() const noexcept
355 return data_
.temp
.regClass();
358 constexpr unsigned size() const noexcept
361 return is64BitConst_
? 2 : 1;
363 return data_
.temp
.size();
366 constexpr bool isFixed() const noexcept
371 constexpr PhysReg
physReg() const noexcept
376 constexpr void setFixed(PhysReg reg
) noexcept
378 isFixed_
= reg
!= unsigned(-1);
382 constexpr bool isConstant() const noexcept
387 constexpr bool isLiteral() const noexcept
389 return isConstant() && reg_
== 255;
392 constexpr bool isUndefined() const noexcept
397 constexpr uint32_t constantValue() const noexcept
402 constexpr bool constantEquals(uint32_t cmp
) const noexcept
404 return isConstant() && constantValue() == cmp
;
407 constexpr void setKill(bool flag
) noexcept
414 constexpr bool isKill() const noexcept
416 return isKill_
|| isFirstKill();
419 constexpr void setFirstKill(bool flag
) noexcept
426 /* When there are multiple operands killing the same temporary,
427 * isFirstKill() only returns true for the first one. */
428 constexpr bool isFirstKill() const noexcept
437 Temp temp
= Temp(0, s1
);
444 uint8_t isConstant_
:1;
447 uint8_t isFirstKill_
:1;
448 uint8_t is64BitConst_
:1;
450 /* can't initialize bit-fields in c++11, so work around using a union */
451 uint8_t control_
= 0;
457 * Definitions are the results of Instructions
458 * and refer to temporary virtual registers
459 * which are later mapped to physical registers
461 class Definition final
464 constexpr Definition() : temp(Temp(0, s1
)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
465 Definition(uint32_t index
, RegClass type
) noexcept
466 : temp(index
, type
) {}
467 explicit Definition(Temp tmp
) noexcept
469 Definition(PhysReg reg
, RegClass type
) noexcept
470 : temp(Temp(0, type
))
474 Definition(uint32_t tmpId
, PhysReg reg
, RegClass type
) noexcept
475 : temp(Temp(tmpId
, type
))
480 constexpr bool isTemp() const noexcept
485 constexpr Temp
getTemp() const noexcept
490 constexpr uint32_t tempId() const noexcept
495 constexpr void setTemp(Temp t
) noexcept
{
499 constexpr RegClass
regClass() const noexcept
501 return temp
.regClass();
504 constexpr unsigned size() const noexcept
509 constexpr bool isFixed() const noexcept
514 constexpr PhysReg
physReg() const noexcept
519 constexpr void setFixed(PhysReg reg
) noexcept
525 constexpr void setHint(PhysReg reg
) noexcept
531 constexpr bool hasHint() const noexcept
536 constexpr void setKill(bool flag
) noexcept
541 constexpr bool isKill() const noexcept
547 Temp temp
= Temp(0, s1
);
555 /* can't initialize bit-fields in c++11, so work around using a union */
556 uint8_t control_
= 0;
567 aco::span
<Operand
> operands
;
568 aco::span
<Definition
> definitions
;
570 constexpr bool isVALU() const noexcept
572 return ((uint16_t) format
& (uint16_t) Format::VOP1
) == (uint16_t) Format::VOP1
573 || ((uint16_t) format
& (uint16_t) Format::VOP2
) == (uint16_t) Format::VOP2
574 || ((uint16_t) format
& (uint16_t) Format::VOPC
) == (uint16_t) Format::VOPC
575 || ((uint16_t) format
& (uint16_t) Format::VOP3A
) == (uint16_t) Format::VOP3A
576 || ((uint16_t) format
& (uint16_t) Format::VOP3B
) == (uint16_t) Format::VOP3B
577 || ((uint16_t) format
& (uint16_t) Format::VOP3P
) == (uint16_t) Format::VOP3P
;
580 constexpr bool isSALU() const noexcept
582 return format
== Format::SOP1
||
583 format
== Format::SOP2
||
584 format
== Format::SOPC
||
585 format
== Format::SOPK
||
586 format
== Format::SOPP
;
589 constexpr bool isVMEM() const noexcept
591 return format
== Format::MTBUF
||
592 format
== Format::MUBUF
||
593 format
== Format::MIMG
;
596 constexpr bool isDPP() const noexcept
598 return (uint16_t) format
& (uint16_t) Format::DPP
;
601 constexpr bool isVOP3() const noexcept
603 return ((uint16_t) format
& (uint16_t) Format::VOP3A
) ||
604 ((uint16_t) format
& (uint16_t) Format::VOP3B
) ||
605 format
== Format::VOP3P
;
608 constexpr bool isSDWA() const noexcept
610 return (uint16_t) format
& (uint16_t) Format::SDWA
;
613 constexpr bool isFlatOrGlobal() const noexcept
615 return format
== Format::FLAT
|| format
== Format::GLOBAL
;
619 struct SOPK_instruction
: public Instruction
{
623 struct SOPP_instruction
: public Instruction
{
628 struct SOPC_instruction
: public Instruction
{
631 struct SOP1_instruction
: public Instruction
{
634 struct SOP2_instruction
: public Instruction
{
638 * Scalar Memory Format:
639 * For s_(buffer_)load_dword*:
640 * Operand(0): SBASE - SGPR-pair which provides base address
641 * Operand(1): Offset - immediate (un)signed offset or SGPR
642 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
643 * Operand(n-1): SOffset - SGPR offset (Vega only)
645 * Having no operands is also valid for instructions such as s_dcache_inv.
648 struct SMEM_instruction
: public Instruction
{
649 bool glc
; /* VI+: globally coherent */
650 bool dlc
; /* NAVI: device level coherent */
651 bool nv
; /* VEGA only: Non-volatile */
654 barrier_interaction barrier
;
657 struct VOP1_instruction
: public Instruction
{
660 struct VOP2_instruction
: public Instruction
{
663 struct VOPC_instruction
: public Instruction
{
666 struct VOP3A_instruction
: public Instruction
{
675 * Data Parallel Primitives Format:
676 * This format can be used for VOP1, VOP2 or VOPC instructions.
677 * The swizzle applies to the src0 operand.
680 struct DPP_instruction
: public Instruction
{
689 struct Interp_instruction
: public Instruction
{
695 * Local and Global Data Sharing instructions
696 * Operand(0): ADDR - VGPR which supplies the address.
697 * Operand(1): DATA0 - First data VGPR.
698 * Operand(2): DATA1 - Second data VGPR.
699 * Operand(n-1): M0 - LDS size.
700 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
703 struct DS_instruction
: public Instruction
{
710 * Vector Memory Untyped-buffer Instructions
711 * Operand(0): VADDR - Address source. Can carry an index and/or offset
712 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
713 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
714 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
717 struct MUBUF_instruction
: public Instruction
{
718 unsigned offset
; /* Unsigned byte offset - 12 bit */
719 bool offen
; /* Supply an offset from VGPR (VADDR) */
720 bool idxen
; /* Supply an index from VGPR (VADDR) */
721 bool glc
; /* globally coherent */
722 bool dlc
; /* NAVI: device level coherent */
723 bool slc
; /* system level coherent */
724 bool tfe
; /* texture fail enable */
725 bool lds
; /* Return read-data to LDS instead of VGPRs */
726 bool disable_wqm
; /* Require an exec mask without helper invocations */
728 barrier_interaction barrier
;
732 * Vector Memory Typed-buffer Instructions
733 * Operand(0): VADDR - Address source. Can carry an index and/or offset
734 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
735 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
736 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
739 struct MTBUF_instruction
: public Instruction
{
740 uint8_t dfmt
: 4; /* Data Format of data in memory buffer */
741 uint8_t nfmt
: 3; /* Numeric format of data in memory */
742 unsigned offset
; /* Unsigned byte offset - 12 bit */
743 bool offen
; /* Supply an offset from VGPR (VADDR) */
744 bool idxen
; /* Supply an index from VGPR (VADDR) */
745 bool glc
; /* globally coherent */
746 bool dlc
; /* NAVI: device level coherent */
747 bool slc
; /* system level coherent */
748 bool tfe
; /* texture fail enable */
749 bool disable_wqm
; /* Require an exec mask without helper invocations */
751 barrier_interaction barrier
;
755 * Vector Memory Image Instructions
756 * Operand(0): VADDR - Address source. Can carry an offset or an index.
757 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
758 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
759 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
762 struct MIMG_instruction
: public Instruction
{
763 unsigned dmask
; /* Data VGPR enable mask */
764 unsigned dim
; /* NAVI: dimensionality */
765 bool unrm
; /* Force address to be un-normalized */
766 bool dlc
; /* NAVI: device level coherent */
767 bool glc
; /* globally coherent */
768 bool slc
; /* system level coherent */
769 bool tfe
; /* texture fail enable */
770 bool da
; /* declare an array */
771 bool lwe
; /* Force data to be un-normalized */
772 bool r128
; /* NAVI: Texture resource size */
773 bool a16
; /* VEGA, NAVI: Address components are 16-bits */
774 bool d16
; /* Convert 32-bit data to 16-bit data */
775 bool disable_wqm
; /* Require an exec mask without helper invocations */
777 barrier_interaction barrier
;
781 * Flat/Scratch/Global Instructions
784 * Operand(2) / Definition(0): DATA/VDST
787 struct FLAT_instruction
: public Instruction
{
788 uint16_t offset
; /* Vega only */
789 bool slc
; /* system level coherent */
790 bool glc
; /* globally coherent */
791 bool dlc
; /* NAVI: device level coherent */
796 struct Export_instruction
: public Instruction
{
797 unsigned enabled_mask
;
804 struct Pseudo_instruction
: public Instruction
{
806 PhysReg scratch_sgpr
; /* might not be valid if it's not needed */
809 struct Pseudo_branch_instruction
: public Instruction
{
810 /* target[0] is the block index of the branch target.
811 * For conditional branches, target[1] contains the fall-through alternative.
812 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
817 struct Pseudo_barrier_instruction
: public Instruction
{
834 gfx10_wave64_bpermute
838 * Subgroup Reduction Instructions, everything except for the data to be
839 * reduced and the result is inserted by setup_reduce_temp().
840 * Operand(0): data to be reduced
841 * Operand(1): reduce temporary
842 * Operand(2): vector temporary
843 * Definition(0): result
844 * Definition(1): scalar temporary
845 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
846 * Definition(3): scc clobber
847 * Definition(4): vcc clobber
850 struct Pseudo_reduction_instruction
: public Instruction
{
852 unsigned cluster_size
; // must be 0 for scans
855 struct instr_deleter_functor
{
856 void operator()(void* p
) {
862 using aco_ptr
= std::unique_ptr
<T
, instr_deleter_functor
>;
865 T
* create_instruction(aco_opcode opcode
, Format format
, uint32_t num_operands
, uint32_t num_definitions
)
867 std::size_t size
= sizeof(T
) + num_operands
* sizeof(Operand
) + num_definitions
* sizeof(Definition
);
868 char *data
= (char*) calloc(1, size
);
871 inst
->opcode
= opcode
;
872 inst
->format
= format
;
874 inst
->operands
= aco::span
<Operand
>((Operand
*)(data
+ sizeof(T
)), num_operands
);
875 inst
->definitions
= aco::span
<Definition
>((Definition
*)inst
->operands
.end(), num_definitions
);
880 constexpr bool is_phi(Instruction
* instr
)
882 return instr
->opcode
== aco_opcode::p_phi
|| instr
->opcode
== aco_opcode::p_linear_phi
;
885 static inline bool is_phi(aco_ptr
<Instruction
>& instr
)
887 return is_phi(instr
.get());
890 constexpr barrier_interaction
get_barrier_interaction(Instruction
* instr
)
892 switch (instr
->format
) {
894 return static_cast<SMEM_instruction
*>(instr
)->barrier
;
896 return static_cast<MUBUF_instruction
*>(instr
)->barrier
;
898 return static_cast<MIMG_instruction
*>(instr
)->barrier
;
901 return barrier_buffer
;
903 return barrier_shared
;
910 /* uniform indicates that leaving this block,
911 * all active lanes stay active */
912 block_kind_uniform
= 1 << 0,
913 block_kind_top_level
= 1 << 1,
914 block_kind_loop_preheader
= 1 << 2,
915 block_kind_loop_header
= 1 << 3,
916 block_kind_loop_exit
= 1 << 4,
917 block_kind_continue
= 1 << 5,
918 block_kind_break
= 1 << 6,
919 block_kind_continue_or_break
= 1 << 7,
920 block_kind_discard
= 1 << 8,
921 block_kind_branch
= 1 << 9,
922 block_kind_merge
= 1 << 10,
923 block_kind_invert
= 1 << 11,
924 block_kind_uses_discard_if
= 1 << 12,
925 block_kind_needs_lowering
= 1 << 13,
926 block_kind_uses_demote
= 1 << 14,
930 struct RegisterDemand
{
931 constexpr RegisterDemand() = default;
932 constexpr RegisterDemand(const int16_t v
, const int16_t s
) noexcept
933 : vgpr
{v
}, sgpr
{s
} {}
937 constexpr friend bool operator==(const RegisterDemand a
, const RegisterDemand b
) noexcept
{
938 return a
.vgpr
== b
.vgpr
&& a
.sgpr
== b
.sgpr
;
941 constexpr bool exceeds(const RegisterDemand other
) const noexcept
{
942 return vgpr
> other
.vgpr
|| sgpr
> other
.sgpr
;
945 constexpr RegisterDemand
operator+(const Temp t
) const noexcept
{
946 if (t
.type() == RegType::sgpr
)
947 return RegisterDemand( vgpr
, sgpr
+ t
.size() );
949 return RegisterDemand( vgpr
+ t
.size(), sgpr
);
952 constexpr RegisterDemand
operator+(const RegisterDemand other
) const noexcept
{
953 return RegisterDemand(vgpr
+ other
.vgpr
, sgpr
+ other
.sgpr
);
956 constexpr RegisterDemand
operator-(const RegisterDemand other
) const noexcept
{
957 return RegisterDemand(vgpr
- other
.vgpr
, sgpr
- other
.sgpr
);
960 constexpr RegisterDemand
& operator+=(const RegisterDemand other
) noexcept
{
966 constexpr RegisterDemand
& operator-=(const RegisterDemand other
) noexcept
{
972 constexpr RegisterDemand
& operator+=(const Temp t
) noexcept
{
973 if (t
.type() == RegType::sgpr
)
980 constexpr RegisterDemand
& operator-=(const Temp t
) noexcept
{
981 if (t
.type() == RegType::sgpr
)
988 constexpr void update(const RegisterDemand other
) noexcept
{
989 vgpr
= std::max(vgpr
, other
.vgpr
);
990 sgpr
= std::max(sgpr
, other
.sgpr
);
999 std::vector
<aco_ptr
<Instruction
>> instructions
;
1000 std::vector
<unsigned> logical_preds
;
1001 std::vector
<unsigned> linear_preds
;
1002 std::vector
<unsigned> logical_succs
;
1003 std::vector
<unsigned> linear_succs
;
1004 RegisterDemand register_demand
= RegisterDemand();
1005 uint16_t loop_nest_depth
= 0;
1007 int logical_idom
= -1;
1008 int linear_idom
= -1;
1009 Temp live_out_exec
= Temp();
1011 /* this information is needed for predecessors to blocks with phis when
1012 * moving out of ssa */
1013 bool scc_live_out
= false;
1014 PhysReg scratch_sgpr
= PhysReg(); /* only needs to be valid if scc_live_out != false */
1016 Block(unsigned idx
) : index(idx
) {}
1017 Block() : index(0) {}
1020 using Stage
= uint16_t;
1022 /* software stages */
1023 static constexpr Stage sw_vs
= 1 << 0;
1024 static constexpr Stage sw_gs
= 1 << 1;
1025 static constexpr Stage sw_tcs
= 1 << 2;
1026 static constexpr Stage sw_tes
= 1 << 3;
1027 static constexpr Stage sw_fs
= 1 << 4;
1028 static constexpr Stage sw_cs
= 1 << 5;
1029 static constexpr Stage sw_mask
= 0x3f;
1031 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1032 static constexpr Stage hw_vs
= 1 << 6;
1033 static constexpr Stage hw_es
= 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1034 static constexpr Stage hw_gs
= 1 << 8;
1035 static constexpr Stage hw_ls
= 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1036 static constexpr Stage hw_hs
= 1 << 10;
1037 static constexpr Stage hw_fs
= 1 << 11;
1038 static constexpr Stage hw_cs
= 1 << 12;
1039 static constexpr Stage hw_mask
= 0x7f << 6;
1041 /* possible settings of Program::stage */
1042 static constexpr Stage vertex_vs
= sw_vs
| hw_vs
;
1043 static constexpr Stage fragment_fs
= sw_fs
| hw_fs
;
1044 static constexpr Stage compute_cs
= sw_cs
| hw_cs
;
1045 static constexpr Stage tess_eval_vs
= sw_tes
| hw_vs
;
1047 static constexpr Stage ngg_vertex_gs
= sw_vs
| hw_gs
;
1048 static constexpr Stage ngg_vertex_geometry_gs
= sw_vs
| sw_gs
| hw_gs
;
1049 static constexpr Stage ngg_tess_eval_geometry_gs
= sw_tes
| sw_gs
| hw_gs
;
1050 static constexpr Stage ngg_vertex_tess_control_hs
= sw_vs
| sw_tcs
| hw_hs
;
1051 /* GFX9 (and GFX10 if NGG isn't used) */
1052 static constexpr Stage vertex_geometry_gs
= sw_vs
| sw_gs
| hw_gs
;
1053 static constexpr Stage vertex_tess_control_hs
= sw_vs
| sw_tcs
| hw_hs
;
1054 static constexpr Stage tess_eval_geometry_gs
= sw_tes
| sw_gs
| hw_gs
;
1056 static constexpr Stage vertex_ls
= sw_vs
| hw_ls
; /* vertex before tessellation control */
1057 static constexpr Stage vertex_es
= sw_vs
| hw_es
; /* vertex before geometry */
1058 static constexpr Stage tess_control_hs
= sw_tcs
| hw_hs
;
1059 static constexpr Stage tess_eval_es
= sw_tes
| hw_gs
; /* tessellation evaluation before geometry */
1060 static constexpr Stage geometry_gs
= sw_gs
| hw_gs
;
1062 class Program final
{
1064 std::vector
<Block
> blocks
;
1065 RegisterDemand max_reg_demand
= RegisterDemand();
1066 uint16_t num_waves
= 0;
1067 uint16_t max_waves
= 0; /* maximum number of waves, regardless of register usage */
1068 ac_shader_config
* config
;
1069 struct radv_shader_info
*info
;
1070 enum chip_class chip_class
;
1071 enum radeon_family family
;
1073 Stage stage
; /* Stage */
1074 bool needs_exact
= false; /* there exists an instruction with disable_wqm = true */
1075 bool needs_wqm
= false; /* there exists a p_wqm instruction */
1076 bool wb_smem_l1_on_end
= false;
1078 std::vector
<uint8_t> constant_data
;
1079 Temp private_segment_buffer
;
1080 Temp scratch_offset
;
1082 uint16_t lds_alloc_granule
;
1083 uint32_t lds_limit
; /* in bytes */
1084 uint16_t vgpr_limit
;
1085 uint16_t sgpr_limit
;
1086 uint16_t physical_sgprs
;
1087 uint16_t sgpr_alloc_granule
; /* minus one. must be power of two */
1089 bool needs_vcc
= false;
1090 bool needs_xnack_mask
= false;
1091 bool needs_flat_scr
= false;
1093 uint32_t allocateId()
1095 assert(allocationID
<= 16777215);
1096 return allocationID
++;
1099 uint32_t peekAllocationId()
1101 return allocationID
;
1104 void setAllocationId(uint32_t id
)
1109 Block
* create_and_insert_block() {
1110 blocks
.emplace_back(blocks
.size());
1111 return &blocks
.back();
1114 Block
* insert_block(Block
&& block
) {
1115 block
.index
= blocks
.size();
1116 blocks
.emplace_back(std::move(block
));
1117 return &blocks
.back();
1121 uint32_t allocationID
= 1;
1125 /* live temps out per block */
1126 std::vector
<std::set
<Temp
>> live_out
;
1127 /* register demand (sgpr/vgpr) per instruction per block */
1128 std::vector
<std::vector
<RegisterDemand
>> register_demand
;
1131 void select_program(Program
*program
,
1132 unsigned shader_count
,
1133 struct nir_shader
*const *shaders
,
1134 ac_shader_config
* config
,
1135 struct radv_shader_info
*info
,
1136 struct radv_nir_compiler_options
*options
);
1138 void lower_wqm(Program
* program
, live
& live_vars
,
1139 const struct radv_nir_compiler_options
*options
);
1140 void lower_bool_phis(Program
* program
);
1141 void update_vgpr_sgpr_demand(Program
* program
, const RegisterDemand new_demand
);
1142 live
live_var_analysis(Program
* program
, const struct radv_nir_compiler_options
*options
);
1143 std::vector
<uint16_t> dead_code_analysis(Program
*program
);
1144 void dominator_tree(Program
* program
);
1145 void insert_exec_mask(Program
*program
);
1146 void value_numbering(Program
* program
);
1147 void optimize(Program
* program
);
1148 void setup_reduce_temp(Program
* program
);
1149 void lower_to_cssa(Program
* program
, live
& live_vars
, const struct radv_nir_compiler_options
*options
);
1150 void register_allocation(Program
*program
, std::vector
<std::set
<Temp
>> live_out_per_block
);
1151 void ssa_elimination(Program
* program
);
1152 void lower_to_hw_instr(Program
* program
);
1153 void schedule_program(Program
* program
, live
& live_vars
);
1154 void spill(Program
* program
, live
& live_vars
, const struct radv_nir_compiler_options
*options
);
1155 void insert_wait_states(Program
* program
);
1156 void insert_NOPs(Program
* program
);
1157 unsigned emit_program(Program
* program
, std::vector
<uint32_t>& code
);
1158 void print_asm(Program
*program
, std::vector
<uint32_t>& binary
,
1159 unsigned exec_size
, std::ostream
& out
);
1160 void validate(Program
* program
, FILE *output
);
1161 bool validate_ra(Program
* program
, const struct radv_nir_compiler_options
*options
, FILE *output
);
1163 void perfwarn(bool cond
, const char *msg
, Instruction
*instr
=NULL
);
1165 #define perfwarn(program, cond, msg, ...)
1168 void aco_print_instr(Instruction
*instr
, FILE *output
);
1169 void aco_print_program(Program
*program
, FILE *output
);
1171 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1172 uint16_t get_extra_sgprs(Program
*program
);
1174 /* get the number of sgprs that must be allocated to address a given number of sgprs */
1175 uint16_t get_sgpr_alloc(Program
*program
, uint16_t addressable_sgprs
);
1177 /* return number of addressable SGPRs for max_waves */
1178 uint16_t get_addr_sgpr_from_waves(Program
*program
, uint16_t max_waves
);
1181 const int16_t opcode_gfx9
[static_cast<int>(aco_opcode::num_opcodes
)];
1182 const int16_t opcode_gfx10
[static_cast<int>(aco_opcode::num_opcodes
)];
1183 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> can_use_input_modifiers
;
1184 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> can_use_output_modifiers
;
1185 const char *name
[static_cast<int>(aco_opcode::num_opcodes
)];
1186 const aco::Format format
[static_cast<int>(aco_opcode::num_opcodes
)];
1189 extern const Info instr_info
;
1193 #endif /* ACO_IR_H */