2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
39 struct radv_nir_compiler_options
;
40 struct radv_shader_args
;
41 struct radv_shader_info
;
45 extern uint64_t debug_flags
;
49 DEBUG_VALIDATE_RA
= 0x2,
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
60 * (*) The same is applicable for VOP1 and VOPC instructions.
62 enum class Format
: std::uint16_t {
63 /* Pseudo Instruction Format */
65 /* Scalar ALU & Control Formats */
71 /* Scalar Memory Format */
75 /* Vector Memory Buffer Formats */
78 /* Vector Memory Image Format */
89 PSEUDO_REDUCTION
= 18,
91 /* Vector ALU Formats */
99 /* Vector Parameter Interpolation Format */
105 enum barrier_interaction
: uint8_t {
107 barrier_buffer
= 0x1,
109 barrier_atomic
= 0x4,
110 barrier_shared
= 0x8,
111 /* used for geometry shaders to ensure vertex data writes are before the
112 * GS_DONE s_sendmsg. */
113 barrier_gs_data
= 0x10,
114 /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
115 barrier_gs_sendmsg
= 0x20,
127 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
128 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
129 fp_denorm_flush
= 0x0,
130 fp_denorm_keep
= 0x3,
134 /* matches encoding of the MODE register */
138 fp_round round16_64
:2;
140 unsigned denorm16_64
:2;
144 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
145 bool preserve_signed_zero_inf_nan32
:1;
146 bool preserve_signed_zero_inf_nan16_64
:1;
147 /* if false, optimizations which may remove denormal flushing can be done */
148 bool must_flush_denorms32
:1;
149 bool must_flush_denorms16_64
:1;
150 bool care_about_round32
:1;
151 bool care_about_round16_64
:1;
153 /* Returns true if instructions using the mode "other" can safely use the
154 * current one instead. */
155 bool canReplace(float_mode other
) const noexcept
{
156 return val
== other
.val
&&
157 (preserve_signed_zero_inf_nan32
|| !other
.preserve_signed_zero_inf_nan32
) &&
158 (preserve_signed_zero_inf_nan16_64
|| !other
.preserve_signed_zero_inf_nan16_64
) &&
159 (must_flush_denorms32
|| !other
.must_flush_denorms32
) &&
160 (must_flush_denorms16_64
|| !other
.must_flush_denorms16_64
) &&
161 (care_about_round32
|| !other
.care_about_round32
) &&
162 (care_about_round16_64
|| !other
.care_about_round16_64
);
166 constexpr Format
asVOP3(Format format
) {
167 return (Format
) ((uint32_t) Format::VOP3
| (uint32_t) format
);
195 /* these are used for WWM and spills to vgpr */
196 v1_linear
= v1
| (1 << 6),
197 v2_linear
= v2
| (1 << 6),
200 RegClass() = default;
201 constexpr RegClass(RC rc
)
203 constexpr RegClass(RegType type
, unsigned size
)
204 : rc((RC
) ((type
== RegType::vgpr
? 1 << 5 : 0) | size
)) {}
206 constexpr operator RC() const { return rc
; }
207 explicit operator bool() = delete;
209 constexpr RegType
type() const { return rc
<= RC::s16
? RegType::sgpr
: RegType::vgpr
; }
210 constexpr unsigned size() const { return (unsigned) rc
& 0x1F; }
211 constexpr bool is_linear() const { return rc
<= RC::s16
|| rc
& (1 << 6); }
212 constexpr RegClass
as_linear() const { return RegClass((RC
) (rc
| (1 << 6))); }
218 /* transitional helper expressions */
219 static constexpr RegClass s1
{RegClass::s1
};
220 static constexpr RegClass s2
{RegClass::s2
};
221 static constexpr RegClass s3
{RegClass::s3
};
222 static constexpr RegClass s4
{RegClass::s4
};
223 static constexpr RegClass s8
{RegClass::s8
};
224 static constexpr RegClass s16
{RegClass::s16
};
225 static constexpr RegClass v1
{RegClass::v1
};
226 static constexpr RegClass v2
{RegClass::v2
};
227 static constexpr RegClass v3
{RegClass::v3
};
228 static constexpr RegClass v4
{RegClass::v4
};
229 static constexpr RegClass v5
{RegClass::v5
};
230 static constexpr RegClass v6
{RegClass::v6
};
231 static constexpr RegClass v7
{RegClass::v7
};
232 static constexpr RegClass v8
{RegClass::v8
};
236 * Each temporary virtual register has a
237 * register class (i.e. size and type)
242 constexpr Temp(uint32_t id
, RegClass cls
) noexcept
243 : id_(id
), reg_class(cls
) {}
245 constexpr uint32_t id() const noexcept
{ return id_
; }
246 constexpr RegClass
regClass() const noexcept
{ return reg_class
; }
248 constexpr unsigned size() const noexcept
{ return reg_class
.size(); }
249 constexpr RegType
type() const noexcept
{ return reg_class
.type(); }
250 constexpr bool is_linear() const noexcept
{ return reg_class
.is_linear(); }
252 constexpr bool operator <(Temp other
) const noexcept
{ return id() < other
.id(); }
253 constexpr bool operator==(Temp other
) const noexcept
{ return id() == other
.id(); }
254 constexpr bool operator!=(Temp other
) const noexcept
{ return id() != other
.id(); }
263 * Represents the physical register for each
264 * Operand and Definition.
267 constexpr PhysReg() = default;
268 explicit constexpr PhysReg(unsigned r
) : reg(r
) {}
269 constexpr operator unsigned() const { return reg
; }
274 /* helper expressions for special registers */
275 static constexpr PhysReg m0
{124};
276 static constexpr PhysReg vcc
{106};
277 static constexpr PhysReg vcc_hi
{107};
278 static constexpr PhysReg sgpr_null
{125}; /* GFX10+ */
279 static constexpr PhysReg exec
{126};
280 static constexpr PhysReg exec_lo
{126};
281 static constexpr PhysReg exec_hi
{127};
282 static constexpr PhysReg vccz
{251};
283 static constexpr PhysReg execz
{252};
284 static constexpr PhysReg scc
{253};
288 * Initially, each Operand refers to either
289 * a temporary virtual register
290 * or to a constant value
291 * Temporary registers get mapped to physical register during RA
292 * Constant values are inlined into the instruction sequence.
298 : reg_(PhysReg
{128}), isTemp_(false), isFixed_(true), isConstant_(false),
299 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
301 explicit Operand(Temp r
) noexcept
308 setFixed(PhysReg
{128});
311 explicit Operand(uint32_t v
, bool is64bit
= false) noexcept
315 is64BitConst_
= is64bit
;
317 setFixed(PhysReg
{128 + v
});
318 else if (v
>= 0xFFFFFFF0) /* [-16 .. -1] */
319 setFixed(PhysReg
{192 - v
});
320 else if (v
== 0x3f000000) /* 0.5 */
321 setFixed(PhysReg
{240});
322 else if (v
== 0xbf000000) /* -0.5 */
323 setFixed(PhysReg
{241});
324 else if (v
== 0x3f800000) /* 1.0 */
325 setFixed(PhysReg
{242});
326 else if (v
== 0xbf800000) /* -1.0 */
327 setFixed(PhysReg
{243});
328 else if (v
== 0x40000000) /* 2.0 */
329 setFixed(PhysReg
{244});
330 else if (v
== 0xc0000000) /* -2.0 */
331 setFixed(PhysReg
{245});
332 else if (v
== 0x40800000) /* 4.0 */
333 setFixed(PhysReg
{246});
334 else if (v
== 0xc0800000) /* -4.0 */
335 setFixed(PhysReg
{247});
336 else { /* Literal Constant */
337 assert(!is64bit
&& "attempt to create a 64-bit literal constant");
338 setFixed(PhysReg
{255});
341 explicit Operand(uint64_t v
) noexcept
344 is64BitConst_
= true;
346 data_
.i
= (uint32_t) v
;
347 setFixed(PhysReg
{128 + (uint32_t) v
});
348 } else if (v
>= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
349 data_
.i
= (uint32_t) v
;
350 setFixed(PhysReg
{192 - (uint32_t) v
});
351 } else if (v
== 0x3FE0000000000000) { /* 0.5 */
352 data_
.i
= 0x3f000000;
353 setFixed(PhysReg
{240});
354 } else if (v
== 0xBFE0000000000000) { /* -0.5 */
355 data_
.i
= 0xbf000000;
356 setFixed(PhysReg
{241});
357 } else if (v
== 0x3FF0000000000000) { /* 1.0 */
358 data_
.i
= 0x3f800000;
359 setFixed(PhysReg
{242});
360 } else if (v
== 0xBFF0000000000000) { /* -1.0 */
361 data_
.i
= 0xbf800000;
362 setFixed(PhysReg
{243});
363 } else if (v
== 0x4000000000000000) { /* 2.0 */
364 data_
.i
= 0x40000000;
365 setFixed(PhysReg
{244});
366 } else if (v
== 0xC000000000000000) { /* -2.0 */
367 data_
.i
= 0xc0000000;
368 setFixed(PhysReg
{245});
369 } else if (v
== 0x4010000000000000) { /* 4.0 */
370 data_
.i
= 0x40800000;
371 setFixed(PhysReg
{246});
372 } else if (v
== 0xC010000000000000) { /* -4.0 */
373 data_
.i
= 0xc0800000;
374 setFixed(PhysReg
{247});
375 } else { /* Literal Constant: we don't know if it is a long or double.*/
377 assert(false && "attempt to create a 64-bit literal constant");
380 explicit Operand(RegClass type
) noexcept
383 data_
.temp
= Temp(0, type
);
384 setFixed(PhysReg
{128});
386 explicit Operand(PhysReg reg
, RegClass type
) noexcept
388 data_
.temp
= Temp(0, type
);
392 constexpr bool isTemp() const noexcept
397 constexpr void setTemp(Temp t
) noexcept
{
398 assert(!isConstant_
);
403 constexpr Temp
getTemp() const noexcept
408 constexpr uint32_t tempId() const noexcept
410 return data_
.temp
.id();
413 constexpr bool hasRegClass() const noexcept
415 return isTemp() || isUndefined();
418 constexpr RegClass
regClass() const noexcept
420 return data_
.temp
.regClass();
423 constexpr unsigned size() const noexcept
426 return is64BitConst_
? 2 : 1;
428 return data_
.temp
.size();
431 constexpr bool isFixed() const noexcept
436 constexpr PhysReg
physReg() const noexcept
441 constexpr void setFixed(PhysReg reg
) noexcept
443 isFixed_
= reg
!= unsigned(-1);
447 constexpr bool isConstant() const noexcept
452 constexpr bool isLiteral() const noexcept
454 return isConstant() && reg_
== 255;
457 constexpr bool isUndefined() const noexcept
462 constexpr uint32_t constantValue() const noexcept
467 constexpr bool constantEquals(uint32_t cmp
) const noexcept
469 return isConstant() && constantValue() == cmp
;
472 constexpr void setKill(bool flag
) noexcept
479 constexpr bool isKill() const noexcept
481 return isKill_
|| isFirstKill();
484 constexpr void setFirstKill(bool flag
) noexcept
491 /* When there are multiple operands killing the same temporary,
492 * isFirstKill() only returns true for the first one. */
493 constexpr bool isFirstKill() const noexcept
502 Temp temp
= Temp(0, s1
);
509 uint8_t isConstant_
:1;
512 uint8_t isFirstKill_
:1;
513 uint8_t is64BitConst_
:1;
515 /* can't initialize bit-fields in c++11, so work around using a union */
516 uint8_t control_
= 0;
522 * Definitions are the results of Instructions
523 * and refer to temporary virtual registers
524 * which are later mapped to physical registers
526 class Definition final
529 constexpr Definition() : temp(Temp(0, s1
)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
530 Definition(uint32_t index
, RegClass type
) noexcept
531 : temp(index
, type
) {}
532 explicit Definition(Temp tmp
) noexcept
534 Definition(PhysReg reg
, RegClass type
) noexcept
535 : temp(Temp(0, type
))
539 Definition(uint32_t tmpId
, PhysReg reg
, RegClass type
) noexcept
540 : temp(Temp(tmpId
, type
))
545 constexpr bool isTemp() const noexcept
550 constexpr Temp
getTemp() const noexcept
555 constexpr uint32_t tempId() const noexcept
560 constexpr void setTemp(Temp t
) noexcept
{
564 constexpr RegClass
regClass() const noexcept
566 return temp
.regClass();
569 constexpr unsigned size() const noexcept
574 constexpr bool isFixed() const noexcept
579 constexpr PhysReg
physReg() const noexcept
584 constexpr void setFixed(PhysReg reg
) noexcept
590 constexpr void setHint(PhysReg reg
) noexcept
596 constexpr bool hasHint() const noexcept
601 constexpr void setKill(bool flag
) noexcept
606 constexpr bool isKill() const noexcept
612 Temp temp
= Temp(0, s1
);
620 /* can't initialize bit-fields in c++11, so work around using a union */
621 uint8_t control_
= 0;
632 aco::span
<Operand
> operands
;
633 aco::span
<Definition
> definitions
;
635 constexpr bool isVALU() const noexcept
637 return ((uint16_t) format
& (uint16_t) Format::VOP1
) == (uint16_t) Format::VOP1
638 || ((uint16_t) format
& (uint16_t) Format::VOP2
) == (uint16_t) Format::VOP2
639 || ((uint16_t) format
& (uint16_t) Format::VOPC
) == (uint16_t) Format::VOPC
640 || ((uint16_t) format
& (uint16_t) Format::VOP3A
) == (uint16_t) Format::VOP3A
641 || ((uint16_t) format
& (uint16_t) Format::VOP3B
) == (uint16_t) Format::VOP3B
642 || ((uint16_t) format
& (uint16_t) Format::VOP3P
) == (uint16_t) Format::VOP3P
;
645 constexpr bool isSALU() const noexcept
647 return format
== Format::SOP1
||
648 format
== Format::SOP2
||
649 format
== Format::SOPC
||
650 format
== Format::SOPK
||
651 format
== Format::SOPP
;
654 constexpr bool isVMEM() const noexcept
656 return format
== Format::MTBUF
||
657 format
== Format::MUBUF
||
658 format
== Format::MIMG
;
661 constexpr bool isDPP() const noexcept
663 return (uint16_t) format
& (uint16_t) Format::DPP
;
666 constexpr bool isVOP3() const noexcept
668 return ((uint16_t) format
& (uint16_t) Format::VOP3A
) ||
669 ((uint16_t) format
& (uint16_t) Format::VOP3B
) ||
670 format
== Format::VOP3P
;
673 constexpr bool isSDWA() const noexcept
675 return (uint16_t) format
& (uint16_t) Format::SDWA
;
678 constexpr bool isFlatOrGlobal() const noexcept
680 return format
== Format::FLAT
|| format
== Format::GLOBAL
;
683 constexpr bool usesModifiers() const noexcept
;
685 constexpr bool reads_exec() const noexcept
687 for (const Operand
& op
: operands
) {
688 if (op
.isFixed() && op
.physReg() == exec
)
695 struct SOPK_instruction
: public Instruction
{
699 struct SOPP_instruction
: public Instruction
{
704 struct SOPC_instruction
: public Instruction
{
707 struct SOP1_instruction
: public Instruction
{
710 struct SOP2_instruction
: public Instruction
{
714 * Scalar Memory Format:
715 * For s_(buffer_)load_dword*:
716 * Operand(0): SBASE - SGPR-pair which provides base address
717 * Operand(1): Offset - immediate (un)signed offset or SGPR
718 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
719 * Operand(n-1): SOffset - SGPR offset (Vega only)
721 * Having no operands is also valid for instructions such as s_dcache_inv.
724 struct SMEM_instruction
: public Instruction
{
725 bool glc
: 1; /* VI+: globally coherent */
726 bool dlc
: 1; /* NAVI: device level coherent */
727 bool nv
: 1; /* VEGA only: Non-volatile */
728 bool can_reorder
: 1;
729 bool disable_wqm
: 1;
730 barrier_interaction barrier
;
733 struct VOP1_instruction
: public Instruction
{
736 struct VOP2_instruction
: public Instruction
{
739 struct VOPC_instruction
: public Instruction
{
742 struct VOP3A_instruction
: public Instruction
{
751 * Data Parallel Primitives Format:
752 * This format can be used for VOP1, VOP2 or VOPC instructions.
753 * The swizzle applies to the src0 operand.
756 struct DPP_instruction
: public Instruction
{
760 uint8_t row_mask
: 4;
761 uint8_t bank_mask
: 4;
765 struct Interp_instruction
: public Instruction
{
771 * Local and Global Data Sharing instructions
772 * Operand(0): ADDR - VGPR which supplies the address.
773 * Operand(1): DATA0 - First data VGPR.
774 * Operand(2): DATA1 - Second data VGPR.
775 * Operand(n-1): M0 - LDS size.
776 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
779 struct DS_instruction
: public Instruction
{
786 * Vector Memory Untyped-buffer Instructions
787 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
788 * Operand(1): VADDR - Address source. Can carry an index and/or offset
789 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
790 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
793 struct MUBUF_instruction
: public Instruction
{
794 uint16_t offset
: 12; /* Unsigned byte offset - 12 bit */
795 bool offen
: 1; /* Supply an offset from VGPR (VADDR) */
796 bool idxen
: 1; /* Supply an index from VGPR (VADDR) */
797 bool addr64
: 1; /* SI, CIK: Address size is 64-bit */
798 bool glc
: 1; /* globally coherent */
799 bool dlc
: 1; /* NAVI: device level coherent */
800 bool slc
: 1; /* system level coherent */
801 bool tfe
: 1; /* texture fail enable */
802 bool lds
: 1; /* Return read-data to LDS instead of VGPRs */
803 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
804 bool can_reorder
: 1;
805 barrier_interaction barrier
;
809 * Vector Memory Typed-buffer Instructions
810 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
811 * Operand(1): VADDR - Address source. Can carry an index and/or offset
812 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
813 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
816 struct MTBUF_instruction
: public Instruction
{
817 uint16_t offset
; /* Unsigned byte offset - 12 bit */
818 uint8_t dfmt
: 4; /* Data Format of data in memory buffer */
819 uint8_t nfmt
: 3; /* Numeric format of data in memory */
820 bool offen
: 1; /* Supply an offset from VGPR (VADDR) */
821 bool idxen
: 1; /* Supply an index from VGPR (VADDR) */
822 bool glc
: 1; /* globally coherent */
823 bool dlc
: 1; /* NAVI: device level coherent */
824 bool slc
: 1; /* system level coherent */
825 bool tfe
: 1; /* texture fail enable */
826 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
827 bool can_reorder
: 1;
828 barrier_interaction barrier
;
832 * Vector Memory Image Instructions
833 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
834 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
835 * or VDATA - Vector GPR for write data.
836 * Operand(2): VADDR - Address source. Can carry an offset or an index.
837 * Definition(0): VDATA - Vector GPR for read result.
840 struct MIMG_instruction
: public Instruction
{
841 uint8_t dmask
; /* Data VGPR enable mask */
842 uint8_t dim
: 3; /* NAVI: dimensionality */
843 bool unrm
: 1; /* Force address to be un-normalized */
844 bool dlc
: 1; /* NAVI: device level coherent */
845 bool glc
: 1; /* globally coherent */
846 bool slc
: 1; /* system level coherent */
847 bool tfe
: 1; /* texture fail enable */
848 bool da
: 1; /* declare an array */
849 bool lwe
: 1; /* LOD warning enable */
850 bool r128
: 1; /* NAVI: Texture resource size */
851 bool a16
: 1; /* VEGA, NAVI: Address components are 16-bits */
852 bool d16
: 1; /* Convert 32-bit data to 16-bit data */
853 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
854 bool can_reorder
: 1;
855 barrier_interaction barrier
;
859 * Flat/Scratch/Global Instructions
862 * Operand(2) / Definition(0): DATA/VDST
865 struct FLAT_instruction
: public Instruction
{
866 uint16_t offset
; /* Vega/Navi only */
867 bool slc
: 1; /* system level coherent */
868 bool glc
: 1; /* globally coherent */
869 bool dlc
: 1; /* NAVI: device level coherent */
872 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
873 bool can_reorder
: 1;
874 barrier_interaction barrier
;
877 struct Export_instruction
: public Instruction
{
878 uint8_t enabled_mask
;
885 struct Pseudo_instruction
: public Instruction
{
887 PhysReg scratch_sgpr
; /* might not be valid if it's not needed */
890 struct Pseudo_branch_instruction
: public Instruction
{
891 /* target[0] is the block index of the branch target.
892 * For conditional branches, target[1] contains the fall-through alternative.
893 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
898 struct Pseudo_barrier_instruction
: public Instruction
{
915 gfx10_wave64_bpermute
919 * Subgroup Reduction Instructions, everything except for the data to be
920 * reduced and the result as inserted by setup_reduce_temp().
921 * Operand(0): data to be reduced
922 * Operand(1): reduce temporary
923 * Operand(2): vector temporary
924 * Definition(0): result
925 * Definition(1): scalar temporary
926 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
927 * Definition(3): scc clobber
928 * Definition(4): vcc clobber
931 struct Pseudo_reduction_instruction
: public Instruction
{
933 unsigned cluster_size
; // must be 0 for scans
936 struct instr_deleter_functor
{
937 void operator()(void* p
) {
943 using aco_ptr
= std::unique_ptr
<T
, instr_deleter_functor
>;
946 T
* create_instruction(aco_opcode opcode
, Format format
, uint32_t num_operands
, uint32_t num_definitions
)
948 std::size_t size
= sizeof(T
) + num_operands
* sizeof(Operand
) + num_definitions
* sizeof(Definition
);
949 char *data
= (char*) calloc(1, size
);
952 inst
->opcode
= opcode
;
953 inst
->format
= format
;
955 uint16_t operands_offset
= data
+ sizeof(T
) - (char*)&inst
->operands
;
956 inst
->operands
= aco::span
<Operand
>(operands_offset
, num_operands
);
957 uint16_t definitions_offset
= (char*)inst
->operands
.end() - (char*)&inst
->definitions
;
958 inst
->definitions
= aco::span
<Definition
>(definitions_offset
, num_definitions
);
963 constexpr bool Instruction::usesModifiers() const noexcept
965 if (isDPP() || isSDWA())
969 const VOP3A_instruction
*vop3
= static_cast<const VOP3A_instruction
*>(this);
970 for (unsigned i
= 0; i
< operands
.size(); i
++) {
971 if (vop3
->abs
[i
] || vop3
->neg
[i
])
974 return vop3
->opsel
|| vop3
->clamp
|| vop3
->omod
;
977 constexpr bool is_phi(Instruction
* instr
)
979 return instr
->opcode
== aco_opcode::p_phi
|| instr
->opcode
== aco_opcode::p_linear_phi
;
982 static inline bool is_phi(aco_ptr
<Instruction
>& instr
)
984 return is_phi(instr
.get());
987 barrier_interaction
get_barrier_interaction(Instruction
* instr
);
989 bool is_dead(const std::vector
<uint16_t>& uses
, Instruction
*instr
);
992 /* uniform indicates that leaving this block,
993 * all active lanes stay active */
994 block_kind_uniform
= 1 << 0,
995 block_kind_top_level
= 1 << 1,
996 block_kind_loop_preheader
= 1 << 2,
997 block_kind_loop_header
= 1 << 3,
998 block_kind_loop_exit
= 1 << 4,
999 block_kind_continue
= 1 << 5,
1000 block_kind_break
= 1 << 6,
1001 block_kind_continue_or_break
= 1 << 7,
1002 block_kind_discard
= 1 << 8,
1003 block_kind_branch
= 1 << 9,
1004 block_kind_merge
= 1 << 10,
1005 block_kind_invert
= 1 << 11,
1006 block_kind_uses_discard_if
= 1 << 12,
1007 block_kind_needs_lowering
= 1 << 13,
1008 block_kind_uses_demote
= 1 << 14,
1009 block_kind_export_end
= 1 << 15,
1013 struct RegisterDemand
{
1014 constexpr RegisterDemand() = default;
1015 constexpr RegisterDemand(const int16_t v
, const int16_t s
) noexcept
1016 : vgpr
{v
}, sgpr
{s
} {}
1020 constexpr friend bool operator==(const RegisterDemand a
, const RegisterDemand b
) noexcept
{
1021 return a
.vgpr
== b
.vgpr
&& a
.sgpr
== b
.sgpr
;
1024 constexpr bool exceeds(const RegisterDemand other
) const noexcept
{
1025 return vgpr
> other
.vgpr
|| sgpr
> other
.sgpr
;
1028 constexpr RegisterDemand
operator+(const Temp t
) const noexcept
{
1029 if (t
.type() == RegType::sgpr
)
1030 return RegisterDemand( vgpr
, sgpr
+ t
.size() );
1032 return RegisterDemand( vgpr
+ t
.size(), sgpr
);
1035 constexpr RegisterDemand
operator+(const RegisterDemand other
) const noexcept
{
1036 return RegisterDemand(vgpr
+ other
.vgpr
, sgpr
+ other
.sgpr
);
1039 constexpr RegisterDemand
operator-(const RegisterDemand other
) const noexcept
{
1040 return RegisterDemand(vgpr
- other
.vgpr
, sgpr
- other
.sgpr
);
1043 constexpr RegisterDemand
& operator+=(const RegisterDemand other
) noexcept
{
1049 constexpr RegisterDemand
& operator-=(const RegisterDemand other
) noexcept
{
1055 constexpr RegisterDemand
& operator+=(const Temp t
) noexcept
{
1056 if (t
.type() == RegType::sgpr
)
1063 constexpr RegisterDemand
& operator-=(const Temp t
) noexcept
{
1064 if (t
.type() == RegType::sgpr
)
1071 constexpr void update(const RegisterDemand other
) noexcept
{
1072 vgpr
= std::max(vgpr
, other
.vgpr
);
1073 sgpr
= std::max(sgpr
, other
.sgpr
);
1082 unsigned offset
= 0;
1083 std::vector
<aco_ptr
<Instruction
>> instructions
;
1084 std::vector
<unsigned> logical_preds
;
1085 std::vector
<unsigned> linear_preds
;
1086 std::vector
<unsigned> logical_succs
;
1087 std::vector
<unsigned> linear_succs
;
1088 RegisterDemand register_demand
= RegisterDemand();
1089 uint16_t loop_nest_depth
= 0;
1091 int logical_idom
= -1;
1092 int linear_idom
= -1;
1093 Temp live_out_exec
= Temp();
1095 /* this information is needed for predecessors to blocks with phis when
1096 * moving out of ssa */
1097 bool scc_live_out
= false;
1098 PhysReg scratch_sgpr
= PhysReg(); /* only needs to be valid if scc_live_out != false */
1100 Block(unsigned idx
) : index(idx
) {}
1101 Block() : index(0) {}
/* A stage is a bit set: software-stage bits in the low seven bits, and one
 * hardware-stage bit above them. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs      = 0x0001;
static constexpr Stage sw_gs      = 0x0002;
static constexpr Stage sw_tcs     = 0x0004;
static constexpr Stage sw_tes     = 0x0008;
static constexpr Stage sw_fs      = 0x0010;
static constexpr Stage sw_cs      = 0x0020;
static constexpr Stage sw_gs_copy = 0x0040;
static constexpr Stage sw_mask    = 0x007f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs   = 0x0080;
static constexpr Stage hw_es   = 0x0100; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs   = 0x0200;
static constexpr Stage hw_ls   = 0x0400; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs   = 0x0800;
static constexpr Stage hw_fs   = 0x1000;
static constexpr Stage hw_cs   = 0x2000;
static constexpr Stage hw_mask = 0x3f80; /* 0x7f << 7 */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs    = sw_vs | hw_vs;
static constexpr Stage fragment_fs  = sw_fs | hw_fs;
static constexpr Stage compute_cs   = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs   = sw_gs_copy | hw_vs;

static constexpr Stage ngg_vertex_gs              = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs     = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs  = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs     = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs  = sw_tes | sw_gs | hw_gs;

static constexpr Stage vertex_ls       = sw_vs | hw_ls;  /* vertex before tessellation control */
static constexpr Stage vertex_es       = sw_vs | hw_es;  /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es    = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs     = sw_gs | hw_gs;
1148 class Program final
{
1150 float_mode next_fp_mode
;
1151 std::vector
<Block
> blocks
;
1152 RegisterDemand max_reg_demand
= RegisterDemand();
1153 uint16_t num_waves
= 0;
1154 uint16_t max_waves
= 0; /* maximum number of waves, regardless of register usage */
1155 ac_shader_config
* config
;
1156 struct radv_shader_info
*info
;
1157 enum chip_class chip_class
;
1158 enum radeon_family family
;
1161 Stage stage
; /* Stage */
1162 bool needs_exact
= false; /* there exists an instruction with disable_wqm = true */
1163 bool needs_wqm
= false; /* there exists a p_wqm instruction */
1164 bool wb_smem_l1_on_end
= false;
1166 std::vector
<uint8_t> constant_data
;
1167 Temp private_segment_buffer
;
1168 Temp scratch_offset
;
1170 uint16_t min_waves
= 0;
1171 uint16_t lds_alloc_granule
;
1172 uint32_t lds_limit
; /* in bytes */
1173 uint16_t vgpr_limit
;
1174 uint16_t sgpr_limit
;
1175 uint16_t physical_sgprs
;
1176 uint16_t sgpr_alloc_granule
; /* minus one. must be power of two */
1177 uint16_t vgpr_alloc_granule
; /* minus one. must be power of two */
1179 bool needs_vcc
= false;
1180 bool needs_xnack_mask
= false;
1181 bool needs_flat_scr
= false;
1183 uint32_t allocateId()
1185 assert(allocationID
<= 16777215);
1186 return allocationID
++;
1189 uint32_t peekAllocationId()
1191 return allocationID
;
1194 void setAllocationId(uint32_t id
)
1199 Block
* create_and_insert_block() {
1200 blocks
.emplace_back(blocks
.size());
1201 blocks
.back().fp_mode
= next_fp_mode
;
1202 return &blocks
.back();
1205 Block
* insert_block(Block
&& block
) {
1206 block
.index
= blocks
.size();
1207 block
.fp_mode
= next_fp_mode
;
1208 blocks
.emplace_back(std::move(block
));
1209 return &blocks
.back();
1213 uint32_t allocationID
= 1;
1217 /* live temps out per block */
1218 std::vector
<std::set
<Temp
>> live_out
;
1219 /* register demand (sgpr/vgpr) per instruction per block */
1220 std::vector
<std::vector
<RegisterDemand
>> register_demand
;
1223 void select_program(Program
*program
,
1224 unsigned shader_count
,
1225 struct nir_shader
*const *shaders
,
1226 ac_shader_config
* config
,
1227 struct radv_shader_args
*args
);
1228 void select_gs_copy_shader(Program
*program
, struct nir_shader
*gs_shader
,
1229 ac_shader_config
* config
,
1230 struct radv_shader_args
*args
);
1232 void lower_wqm(Program
* program
, live
& live_vars
,
1233 const struct radv_nir_compiler_options
*options
);
1234 void lower_bool_phis(Program
* program
);
1235 void calc_min_waves(Program
* program
);
1236 void update_vgpr_sgpr_demand(Program
* program
, const RegisterDemand new_demand
);
1237 live
live_var_analysis(Program
* program
, const struct radv_nir_compiler_options
*options
);
1238 std::vector
<uint16_t> dead_code_analysis(Program
*program
);
1239 void dominator_tree(Program
* program
);
1240 void insert_exec_mask(Program
*program
);
1241 void value_numbering(Program
* program
);
1242 void optimize(Program
* program
);
1243 void setup_reduce_temp(Program
* program
);
1244 void lower_to_cssa(Program
* program
, live
& live_vars
, const struct radv_nir_compiler_options
*options
);
1245 void register_allocation(Program
*program
, std::vector
<std::set
<Temp
>> live_out_per_block
);
1246 void ssa_elimination(Program
* program
);
1247 void lower_to_hw_instr(Program
* program
);
1248 void schedule_program(Program
* program
, live
& live_vars
);
1249 void spill(Program
* program
, live
& live_vars
, const struct radv_nir_compiler_options
*options
);
1250 void insert_wait_states(Program
* program
);
1251 void insert_NOPs(Program
* program
);
1252 unsigned emit_program(Program
* program
, std::vector
<uint32_t>& code
);
1253 void print_asm(Program
*program
, std::vector
<uint32_t>& binary
,
1254 unsigned exec_size
, std::ostream
& out
);
1255 void validate(Program
* program
, FILE *output
);
1256 bool validate_ra(Program
* program
, const struct radv_nir_compiler_options
*options
, FILE *output
);
1258 void perfwarn(bool cond
, const char *msg
, Instruction
*instr
=NULL
);
/* No-op stand-in for perfwarn(). It is fully variadic so that every argument
 * list accepted by the perfwarn(cond, msg, instr) function declaration,
 * including the two-argument form that relies on the defaulted "instr", also
 * expands cleanly through the macro. */
#define perfwarn(...) do {} while(0)
1263 void aco_print_instr(Instruction
*instr
, FILE *output
);
1264 void aco_print_program(Program
*program
, FILE *output
);
1266 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1267 uint16_t get_extra_sgprs(Program
*program
);
1269 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1270 uint16_t get_sgpr_alloc(Program
*program
, uint16_t addressable_sgprs
);
1271 uint16_t get_vgpr_alloc(Program
*program
, uint16_t addressable_vgprs
);
1273 /* return number of addressable sgprs/vgprs for max_waves */
1274 uint16_t get_addr_sgpr_from_waves(Program
*program
, uint16_t max_waves
);
1275 uint16_t get_addr_vgpr_from_waves(Program
*program
, uint16_t max_waves
);
1278 const int16_t opcode_gfx7
[static_cast<int>(aco_opcode::num_opcodes
)];
1279 const int16_t opcode_gfx9
[static_cast<int>(aco_opcode::num_opcodes
)];
1280 const int16_t opcode_gfx10
[static_cast<int>(aco_opcode::num_opcodes
)];
1281 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> can_use_input_modifiers
;
1282 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> can_use_output_modifiers
;
1283 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> is_atomic
;
1284 const char *name
[static_cast<int>(aco_opcode::num_opcodes
)];
1285 const aco::Format format
[static_cast<int>(aco_opcode::num_opcodes
)];
1288 extern const Info instr_info
;
1292 #endif /* ACO_IR_H */