X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_ir.h;h=fb0c9beb2082166dbf712708a6b448cbd078fbc8;hb=331794495ed0e8bbd87cafedfa9ef334bb43b0b7;hp=370aa5a03c28f5d6975a4a019038883b0b04a75c;hpb=680b086db194a5ed90031bcfe28b71e23051b6bb;p=mesa.git diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 370aa5a03c2..fb0c9beb208 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -37,6 +38,7 @@ #include "aco_util.h" struct radv_nir_compiler_options; +struct radv_shader_args; struct radv_shader_info; namespace aco { @@ -101,13 +103,20 @@ enum class Format : std::uint16_t { SDWA = 1 << 15, }; -enum barrier_interaction { +enum barrier_interaction : uint8_t { barrier_none = 0, barrier_buffer = 0x1, barrier_image = 0x2, barrier_atomic = 0x4, barrier_shared = 0x8, - barrier_count = 4, + /* used for geometry shaders to ensure vertex data writes are before the + * GS_DONE s_sendmsg. */ + barrier_gs_data = 0x10, + /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */ + barrier_gs_sendmsg = 0x20, + /* used by barriers. created by s_barrier */ + barrier_barrier = 0x40, + barrier_count = 6, }; enum fp_round { @@ -161,6 +170,11 @@ constexpr Format asVOP3(Format format) { return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format); }; +constexpr Format asSDWA(Format format) { + assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC); + return (Format) ((uint32_t) Format::SDWA | (uint32_t) format); +} + enum class RegType { none = 0, sgpr, @@ -186,6 +200,13 @@ struct RegClass { v6 = 6 | (1 << 5), v7 = 7 | (1 << 5), v8 = 8 | (1 << 5), + /* byte-sized register class */ + v1b = v1 | (1 << 7), + v2b = v2 | (1 << 7), + v3b = v3 | (1 << 7), + v4b = v4 | (1 << 7), + v6b = v6 | (1 << 7), + v8b = v8 | (1 << 7), /* these are used for WWM and spills to vgpr */ v1_linear = v1 | (1 << 6), v2_linear = v2 | (1 << 6), @@ -201,9 +222,13 @@ struct RegClass { explicit operator bool() = delete; constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; } - constexpr unsigned size() const { return (unsigned) rc & 0x1F; } + constexpr bool is_subdword() const { return rc & (1 << 7); } + constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); } + //TODO: use size() less in favor of bytes() + constexpr unsigned size() const { return (bytes() + 3) >> 2; } constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); } constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); } + constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); } private: RC rc; @@ -224,6 +249,12 @@ static constexpr RegClass v5{RegClass::v5}; static constexpr RegClass v6{RegClass::v6}; static constexpr RegClass v7{RegClass::v7}; static constexpr RegClass v8{RegClass::v8}; +static constexpr RegClass v1b{RegClass::v1b}; +static constexpr RegClass v2b{RegClass::v2b}; +static constexpr RegClass v3b{RegClass::v3b}; +static constexpr RegClass v4b{RegClass::v4b}; +static constexpr RegClass v6b{RegClass::v6b}; +static constexpr RegClass v8b{RegClass::v8b}; /** * Temp Class @@ -232,24 +263,25 @@ static constexpr RegClass v8{RegClass::v8}; * and SSA id. */ struct Temp { - Temp() = default; + Temp() noexcept : id_(0), reg_class(0) {} constexpr Temp(uint32_t id, RegClass cls) noexcept - : id_(id), reg_class(cls) {} + : id_(id), reg_class(uint8_t(cls)) {} constexpr uint32_t id() const noexcept { return id_; } - constexpr RegClass regClass() const noexcept { return reg_class; } + constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; } - constexpr unsigned size() const noexcept { return reg_class.size(); } - constexpr RegType type() const noexcept { return reg_class.type(); } - constexpr bool is_linear() const noexcept { return reg_class.is_linear(); } + constexpr unsigned bytes() const noexcept { return regClass().bytes(); } + constexpr unsigned size() const noexcept { return regClass().size(); } + constexpr RegType type() const noexcept { return regClass().type(); } + constexpr bool is_linear() const noexcept { return regClass().is_linear(); } constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); } constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); } constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); } private: - uint32_t id_:24; - RegClass reg_class; + uint32_t id_: 24; + uint32_t reg_class : 8; }; /** @@ -259,19 +291,27 @@ private: */ struct PhysReg { constexpr PhysReg() = default; - explicit constexpr PhysReg(unsigned r) : reg(r) {} - constexpr operator unsigned() const { return reg; } - - uint16_t reg = 0; + explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {} + constexpr unsigned reg() const { return reg_b >> 2; } + constexpr unsigned byte() const { return reg_b & 0x3; } + constexpr operator unsigned() const { return reg(); } + constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; } + constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; } + constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; } + + uint16_t reg_b = 0; }; /* helper expressions for special registers */ static constexpr PhysReg m0{124}; static constexpr PhysReg vcc{106}; +static constexpr PhysReg vcc_hi{107}; static constexpr PhysReg sgpr_null{125}; /* GFX10+ */ static constexpr PhysReg exec{126}; static constexpr PhysReg exec_lo{126}; static constexpr PhysReg exec_hi{127}; +static constexpr PhysReg vccz{251}; +static constexpr PhysReg execz{252}; static constexpr PhysReg scc{253}; /** @@ -287,7 +327,8 @@ class Operand final public: constexpr Operand() : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), - isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {} + isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false), + isLateKill_(false) {} explicit Operand(Temp r) noexcept { @@ -299,10 +340,11 @@ public: setFixed(PhysReg{128}); } }; - explicit Operand(uint32_t v) noexcept + explicit Operand(uint32_t v, bool is64bit = false) noexcept { data_.i = v; isConstant_ = true; + is64BitConst_ = is64bit; if (v <= 64) setFixed(PhysReg{128 + v}); else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */ @@ -323,38 +365,46 @@ public: setFixed(PhysReg{246}); else if (v == 0xc0800000) /* -4.0 */ setFixed(PhysReg{247}); - else if (v == 0x3e22f983) /* 1/(2*PI) */ - setFixed(PhysReg{248}); - else /* Literal Constant */ + else { /* Literal Constant */ + assert(!is64bit && "attempt to create a 64-bit literal constant"); setFixed(PhysReg{255}); + } }; explicit Operand(uint64_t v) noexcept { isConstant_ = true; is64BitConst_ = true; - if (v <= 64) + if (v <= 64) { + data_.i = (uint32_t) v; setFixed(PhysReg{128 + (uint32_t) v}); - else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */ + } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */ + data_.i = (uint32_t) v; setFixed(PhysReg{192 - (uint32_t) v}); - else if (v == 0x3FE0000000000000) /* 0.5 */ + } else if (v == 0x3FE0000000000000) { /* 0.5 */ + data_.i = 0x3f000000; setFixed(PhysReg{240}); - else if (v == 0xBFE0000000000000) /* -0.5 */ + } else if (v == 0xBFE0000000000000) { /* -0.5 */ + data_.i = 0xbf000000; setFixed(PhysReg{241}); - else if (v == 0x3FF0000000000000) /* 1.0 */ + } else if (v == 0x3FF0000000000000) { /* 1.0 */ + data_.i = 0x3f800000; setFixed(PhysReg{242}); - else if (v == 0xBFF0000000000000) /* -1.0 */ + } else if (v == 0xBFF0000000000000) { /* -1.0 */ + data_.i = 0xbf800000; setFixed(PhysReg{243}); - else if (v == 0x4000000000000000) /* 2.0 */ + } else if (v == 0x4000000000000000) { /* 2.0 */ + data_.i = 0x40000000; setFixed(PhysReg{244}); - else if (v == 0xC000000000000000) /* -2.0 */ + } else if (v == 0xC000000000000000) { /* -2.0 */ + data_.i = 0xc0000000; setFixed(PhysReg{245}); - else if (v == 0x4010000000000000) /* 4.0 */ + } else if (v == 0x4010000000000000) { /* 4.0 */ + data_.i = 0x40800000; setFixed(PhysReg{246}); - else if (v == 0xC010000000000000) /* -4.0 */ + } else if (v == 0xC010000000000000) { /* -4.0 */ + data_.i = 0xc0800000; setFixed(PhysReg{247}); - else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */ - setFixed(PhysReg{248}); - else { /* Literal Constant: we don't know if it is a long or double.*/ + } else { /* Literal Constant: we don't know if it is a long or double.*/ isConstant_ = 0; assert(false && "attempt to create a 64-bit literal constant"); } @@ -402,6 +452,14 @@ public: return data_.temp.regClass(); } + constexpr unsigned bytes() const noexcept + { + if (isConstant()) + return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants + else + return data_.temp.bytes(); + } + constexpr unsigned size() const noexcept { if (isConstant()) @@ -451,6 +509,49 @@ public: return isConstant() && constantValue() == cmp; } + constexpr uint64_t constantValue64(bool signext=false) const noexcept + { + if (is64BitConst_) { + if (reg_ <= 192) + return reg_ - 128; + else if (reg_ <= 208) + return 0xFFFFFFFFFFFFFFFF - (reg_ - 193); + + switch (reg_) { + case 240: + return 0x3FE0000000000000; + case 241: + return 0xBFE0000000000000; + case 242: + return 0x3FF0000000000000; + case 243: + return 0xBFF0000000000000; + case 244: + return 0x4000000000000000; + case 245: + return 0xC000000000000000; + case 246: + return 0x4010000000000000; + case 247: + return 0xC010000000000000; + } + } + return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i; + } + + /* Indicates that the killed operand's live range intersects with the + * instruction's definitions. Unlike isKill() and isFirstKill(), this is + * not set by liveness analysis. */ + constexpr void setLateKill(bool flag) noexcept + { + isLateKill_ = flag; + } + + constexpr bool isLateKill() const noexcept + { + return isLateKill_; + } + constexpr void setKill(bool flag) noexcept { isKill_ = flag; @@ -477,6 +578,33 @@ public: return isFirstKill_; } + constexpr bool isKillBeforeDef() const noexcept + { + return isKill() && !isLateKill(); + } + + constexpr bool isFirstKillBeforeDef() const noexcept + { + return isFirstKill() && !isLateKill(); + } + + constexpr bool operator == (Operand other) const noexcept + { + if (other.size() != size()) + return false; + if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef()) + return false; + if (isFixed() && other.isFixed() && physReg() != other.physReg()) + return false; + if (isLiteral()) + return other.isLiteral() && other.constantValue() == constantValue(); + else if (isConstant()) + return other.isConstant() && other.physReg() == physReg(); + else if (isUndefined()) + return other.isUndefined() && other.regClass() == regClass(); + else + return other.isTemp() && other.getTemp() == getTemp(); + } private: union { uint32_t i; @@ -493,6 +621,7 @@ private: uint8_t isUndef_:1; uint8_t isFirstKill_:1; uint8_t is64BitConst_:1; + uint8_t isLateKill_:1; }; /* can't initialize bit-fields in c++11, so work around using a union */ uint8_t control_ = 0; @@ -548,6 +677,11 @@ public: return temp.regClass(); } + constexpr unsigned bytes() const noexcept + { + return temp.bytes(); + } + constexpr unsigned size() const noexcept { return temp.size(); @@ -673,24 +807,31 @@ struct Instruction { return false; } }; +static_assert(sizeof(Instruction) == 16); struct SOPK_instruction : public Instruction { uint16_t imm; + uint16_t padding; }; +static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4); struct SOPP_instruction : public Instruction { uint32_t imm; int block; }; +static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8); struct SOPC_instruction : public Instruction { }; +static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0); struct SOP1_instruction : public Instruction { }; +static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0); struct SOP2_instruction : public Instruction { }; +static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0); /** * Scalar Memory Format: @@ -704,30 +845,37 @@ struct SOP2_instruction : public Instruction { * */ struct SMEM_instruction : public Instruction { - bool glc; /* VI+: globally coherent */ - bool dlc; /* NAVI: device level coherent */ - bool nv; /* VEGA only: Non-volatile */ - bool can_reorder; - bool disable_wqm; barrier_interaction barrier; + bool glc : 1; /* VI+: globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool nv : 1; /* VEGA only: Non-volatile */ + bool can_reorder : 1; + bool disable_wqm : 1; + uint32_t padding: 19; }; +static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4); struct VOP1_instruction : public Instruction { }; +static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0); struct VOP2_instruction : public Instruction { }; +static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0); struct VOPC_instruction : public Instruction { }; +static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0); struct VOP3A_instruction : public Instruction { bool abs[3]; - bool opsel[4]; - bool clamp; - unsigned omod; bool neg[3]; + uint8_t opsel : 4; + uint8_t omod : 2; + bool clamp : 1; + uint32_t padding : 9; }; +static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8); /** * Data Parallel Primitives Format: @@ -736,18 +884,80 @@ struct VOP3A_instruction : public Instruction { * */ struct DPP_instruction : public Instruction { - uint16_t dpp_ctrl; - uint8_t row_mask; - uint8_t bank_mask; bool abs[2]; bool neg[2]; - bool bound_ctrl; + uint16_t dpp_ctrl; + uint8_t row_mask : 4; + uint8_t bank_mask : 4; + bool bound_ctrl : 1; + uint32_t padding : 7; +}; +static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8); + +enum sdwa_sel : uint8_t { + /* masks */ + sdwa_wordnum = 0x1, + sdwa_bytenum = 0x3, + sdwa_asuint = 0x7 | 0x10, + sdwa_rasize = 0x3, + + /* flags */ + sdwa_isword = 0x4, + sdwa_sext = 0x8, + sdwa_isra = 0x10, + + /* specific values */ + sdwa_ubyte0 = 0, + sdwa_ubyte1 = 1, + sdwa_ubyte2 = 2, + sdwa_ubyte3 = 3, + sdwa_uword0 = sdwa_isword | 0, + sdwa_uword1 = sdwa_isword | 1, + sdwa_udword = 6, + + sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext, + sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext, + sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext, + sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext, + sdwa_sword0 = sdwa_uword0 | sdwa_sext, + sdwa_sword1 = sdwa_uword1 | sdwa_sext, + sdwa_sdword = sdwa_udword | sdwa_sext, + + /* register-allocated */ + sdwa_ubyte = 1 | sdwa_isra, + sdwa_uword = 2 | sdwa_isra, + sdwa_sbyte = sdwa_ubyte | sdwa_sext, + sdwa_sword = sdwa_uword | sdwa_sext, +}; + +/** + * Sub-Dword Addressing Format: + * This format can be used for VOP1, VOP2 or VOPC instructions. + * + * omod and SGPR/constant operands are only available on GFX9+. For VOPC, + * the definition doesn't have to be VCC on GFX9+. + * + */ +struct SDWA_instruction : public Instruction { + /* these destination modifiers aren't available with VOPC except for + * clamp on GFX8 */ + uint8_t sel[2]; + uint8_t dst_sel; + bool neg[2]; + bool abs[2]; + bool dst_preserve : 1; + bool clamp : 1; + uint8_t omod : 2; /* GFX9+ */ + uint32_t padding : 4; }; +static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8); struct Interp_instruction : public Instruction { - unsigned attribute; - unsigned component; + uint8_t attribute; + uint8_t component; + uint16_t padding; }; +static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4); /** * Local and Global Data Sharing instructions @@ -763,77 +973,86 @@ struct DS_instruction : public Instruction { int8_t offset1; bool gds; }; +static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4); /** * Vector Memory Untyped-buffer Instructions - * Operand(0): VADDR - Address source. Can carry an index and/or offset - * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(1): VADDR - Address source. Can carry an index and/or offset * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data * */ struct MUBUF_instruction : public Instruction { - unsigned offset; /* Unsigned byte offset - 12 bit */ - bool offen; /* Supply an offset from VGPR (VADDR) */ - bool idxen; /* Supply an index from VGPR (VADDR) */ - bool glc; /* globally coherent */ - bool dlc; /* NAVI: device level coherent */ - bool slc; /* system level coherent */ - bool tfe; /* texture fail enable */ - bool lds; /* Return read-data to LDS instead of VGPRs */ - bool disable_wqm; /* Require an exec mask without helper invocations */ - bool can_reorder; + uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ + bool offen : 1; /* Supply an offset from VGPR (VADDR) */ + bool idxen : 1; /* Supply an index from VGPR (VADDR) */ + bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ + bool glc : 1; /* globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool lds : 1; /* Return read-data to LDS instead of VGPRs */ + bool disable_wqm : 1; /* Require an exec mask without helper invocations */ + bool can_reorder : 1; + uint8_t padding : 2; barrier_interaction barrier; }; +static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4); /** * Vector Memory Typed-buffer Instructions - * Operand(0): VADDR - Address source. Can carry an index and/or offset - * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(1): VADDR - Address source. Can carry an index and/or offset * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data * */ struct MTBUF_instruction : public Instruction { + uint16_t offset; /* Unsigned byte offset - 12 bit */ + barrier_interaction barrier; uint8_t dfmt : 4; /* Data Format of data in memory buffer */ uint8_t nfmt : 3; /* Numeric format of data in memory */ - unsigned offset; /* Unsigned byte offset - 12 bit */ - bool offen; /* Supply an offset from VGPR (VADDR) */ - bool idxen; /* Supply an index from VGPR (VADDR) */ - bool glc; /* globally coherent */ - bool dlc; /* NAVI: device level coherent */ - bool slc; /* system level coherent */ - bool tfe; /* texture fail enable */ - bool disable_wqm; /* Require an exec mask without helper invocations */ - bool can_reorder; - barrier_interaction barrier; + bool offen : 1; /* Supply an offset from VGPR (VADDR) */ + bool idxen : 1; /* Supply an index from VGPR (VADDR) */ + bool glc : 1; /* globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool disable_wqm : 1; /* Require an exec mask without helper invocations */ + bool can_reorder : 1; + uint32_t padding : 25; }; +static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8); /** * Vector Memory Image Instructions - * Operand(0): VADDR - Address source. Can carry an offset or an index. - * Operand(1): SRSRC - Scalar GPR that specifies the resource constant. - * Operand(2): SSAMP - Scalar GPR that specifies sampler constant. - * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result. + * Operand(0) SRSRC - Scalar GPR that specifies the resource constant. + * Operand(1): SSAMP - Scalar GPR that specifies sampler constant. + * or VDATA - Vector GPR for write data. + * Operand(2): VADDR - Address source. Can carry an offset or an index. + * Definition(0): VDATA - Vector GPR for read result. * */ struct MIMG_instruction : public Instruction { - unsigned dmask; /* Data VGPR enable mask */ - unsigned dim; /* NAVI: dimensionality */ - bool unrm; /* Force address to be un-normalized */ - bool dlc; /* NAVI: device level coherent */ - bool glc; /* globally coherent */ - bool slc; /* system level coherent */ - bool tfe; /* texture fail enable */ - bool da; /* declare an array */ - bool lwe; /* Force data to be un-normalized */ - bool r128; /* NAVI: Texture resource size */ - bool a16; /* VEGA, NAVI: Address components are 16-bits */ - bool d16; /* Convert 32-bit data to 16-bit data */ - bool disable_wqm; /* Require an exec mask without helper invocations */ - bool can_reorder; + uint8_t dmask; /* Data VGPR enable mask */ + uint8_t dim : 3; /* NAVI: dimensionality */ + bool unrm : 1; /* Force address to be un-normalized */ + bool dlc : 1; /* NAVI: device level coherent */ + bool glc : 1; /* globally coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool da : 1; /* declare an array */ + bool lwe : 1; /* Force data to be un-normalized */ + bool r128 : 1; /* NAVI: Texture resource size */ + bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ + bool d16 : 1; /* Convert 32-bit data to 16-bit data */ + bool disable_wqm : 1; /* Require an exec mask without helper invocations */ + bool can_reorder : 1; + uint8_t padding : 1; barrier_interaction barrier; }; +static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4); /** * Flat/Scratch/Global Instructions @@ -843,26 +1062,35 @@ struct MIMG_instruction : public Instruction { * */ struct FLAT_instruction : public Instruction { - uint16_t offset; /* Vega only */ - bool slc; /* system level coherent */ - bool glc; /* globally coherent */ - bool dlc; /* NAVI: device level coherent */ - bool lds; - bool nv; + uint16_t offset; /* Vega/Navi only */ + bool slc : 1; /* system level coherent */ + bool glc : 1; /* globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool lds : 1; + bool nv : 1; + bool disable_wqm : 1; /* Require an exec mask without helper invocations */ + bool can_reorder : 1; + uint8_t padding : 1; + barrier_interaction barrier; }; +static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4); struct Export_instruction : public Instruction { - unsigned enabled_mask; - unsigned dest; - bool compressed; - bool done; - bool valid_mask; + uint8_t enabled_mask; + uint8_t dest; + bool compressed : 1; + bool done : 1; + bool valid_mask : 1; + uint32_t padding : 13; }; +static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4); struct Pseudo_instruction : public Instruction { - bool tmp_in_scc; PhysReg scratch_sgpr; /* might not be valid if it's not needed */ + bool tmp_in_scc; + uint8_t padding; }; +static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4); struct Pseudo_branch_instruction : public Instruction { /* target[0] is the block index of the branch target. @@ -871,11 +1099,13 @@ struct Pseudo_branch_instruction : public Instruction { */ uint32_t target[2]; }; +static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8); struct Pseudo_barrier_instruction : public Instruction { }; +static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0); -enum ReduceOp { +enum ReduceOp : uint16_t { iadd32, iadd64, imul32, imul64, fadd32, fadd64, @@ -907,8 +1137,9 @@ enum ReduceOp { */ struct Pseudo_reduction_instruction : public Instruction { ReduceOp reduce_op; - unsigned cluster_size; // must be 0 for scans + uint16_t cluster_size; // must be 0 for scans }; +static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4); struct instr_deleter_functor { void operator()(void* p) { @@ -929,8 +1160,10 @@ T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, u inst->opcode = opcode; inst->format = format; - inst->operands = aco::span((Operand*)(data + sizeof(T)), num_operands); - inst->definitions = aco::span((Definition*)inst->operands.end(), num_definitions); + uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands; + inst->operands = aco::span(operands_offset, num_operands); + uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions; + inst->definitions = aco::span(definitions_offset, num_definitions); return inst; } @@ -943,10 +1176,10 @@ constexpr bool Instruction::usesModifiers() const noexcept return false; const VOP3A_instruction *vop3 = static_cast(this); for (unsigned i = 0; i < operands.size(); i++) { - if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i]) + if (vop3->abs[i] || vop3->neg[i]) return true; } - return vop3->opsel[3] || vop3->clamp || vop3->omod; + return vop3->opsel || vop3->clamp || vop3->omod; } constexpr bool is_phi(Instruction* instr) @@ -959,24 +1192,9 @@ static inline bool is_phi(aco_ptr& instr) return is_phi(instr.get()); } -constexpr barrier_interaction get_barrier_interaction(Instruction* instr) -{ - switch (instr->format) { - case Format::SMEM: - return static_cast(instr)->barrier; - case Format::MUBUF: - return static_cast(instr)->barrier; - case Format::MIMG: - return static_cast(instr)->barrier; - case Format::FLAT: - case Format::GLOBAL: - return barrier_buffer; - case Format::DS: - return barrier_shared; - default: - return barrier_none; - } -} +barrier_interaction get_barrier_interaction(Instruction* instr); + +bool is_dead(const std::vector& uses, Instruction *instr); enum block_kind { /* uniform indicates that leaving this block, @@ -996,6 +1214,7 @@ enum block_kind { block_kind_uses_discard_if = 1 << 12, block_kind_needs_lowering = 1 << 13, block_kind_uses_demote = 1 << 14, + block_kind_export_end = 1 << 15, }; @@ -1099,28 +1318,31 @@ static constexpr Stage sw_tcs = 1 << 2; static constexpr Stage sw_tes = 1 << 3; static constexpr Stage sw_fs = 1 << 4; static constexpr Stage sw_cs = 1 << 5; -static constexpr Stage sw_mask = 0x3f; +static constexpr Stage sw_gs_copy = 1 << 6; +static constexpr Stage sw_mask = 0x7f; /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */ -static constexpr Stage hw_vs = 1 << 6; -static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */ -static constexpr Stage hw_gs = 1 << 8; -static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */ -static constexpr Stage hw_hs = 1 << 10; -static constexpr Stage hw_fs = 1 << 11; -static constexpr Stage hw_cs = 1 << 12; -static constexpr Stage hw_mask = 0x7f << 6; +static constexpr Stage hw_vs = 1 << 7; +static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */ +static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */ +static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */ +static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */ +static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */ +static constexpr Stage hw_fs = 1 << 13; +static constexpr Stage hw_cs = 1 << 14; +static constexpr Stage hw_mask = 0xff << 7; /* possible settings of Program::stage */ static constexpr Stage vertex_vs = sw_vs | hw_vs; static constexpr Stage fragment_fs = sw_fs | hw_fs; static constexpr Stage compute_cs = sw_cs | hw_cs; static constexpr Stage tess_eval_vs = sw_tes | hw_vs; +static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs; /* GFX10/NGG */ -static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs; -static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs; -static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs; -static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs; +static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs; +static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs; +static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs; +static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs; /* GFX9 (and GFX10 if NGG isn't used) */ static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs; static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs; @@ -1129,9 +1351,24 @@ static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs; static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */ static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */ static constexpr Stage tess_control_hs = sw_tcs | hw_hs; -static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tesselation evaluation before geometry */ +static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */ static constexpr Stage geometry_gs = sw_gs | hw_gs; +enum statistic { + statistic_hash, + statistic_instructions, + statistic_copies, + statistic_branches, + statistic_cycles, + statistic_vmem_clauses, + statistic_smem_clauses, + statistic_vmem_score, + statistic_smem_score, + statistic_sgpr_presched, + statistic_vgpr_presched, + num_statistics +}; + class Program final { public: float_mode next_fp_mode; @@ -1144,6 +1381,7 @@ public: enum chip_class chip_class; enum radeon_family family; unsigned wave_size; + RegClass lane_mask; Stage stage; /* Stage */ bool needs_exact = false; /* there exists an instruction with disable_wqm = true */ bool needs_wqm = false; /* there exists a p_wqm instruction */ @@ -1153,17 +1391,25 @@ public: Temp private_segment_buffer; Temp scratch_offset; + uint16_t min_waves = 0; uint16_t lds_alloc_granule; uint32_t lds_limit; /* in bytes */ + bool has_16bank_lds; uint16_t vgpr_limit; uint16_t sgpr_limit; uint16_t physical_sgprs; uint16_t sgpr_alloc_granule; /* minus one. must be power of two */ + uint16_t vgpr_alloc_granule; /* minus one. must be power of two */ + unsigned workgroup_size; /* if known; otherwise UINT_MAX */ + + bool xnack_enabled = false; bool needs_vcc = false; - bool needs_xnack_mask = false; bool needs_flat_scr = false; + bool collect_statistics = false; + uint32_t statistics[num_statistics]; + uint32_t allocateId() { assert(allocationID <= 16777215); @@ -1197,9 +1443,16 @@ private: uint32_t allocationID = 1; }; +struct TempHash { + std::size_t operator()(Temp t) const { + return t.id(); + } +}; +using TempSet = std::unordered_set; + struct live { /* live temps out per block */ - std::vector> live_out; + std::vector live_out; /* register demand (sgpr/vgpr) per instruction per block */ std::vector> register_demand; }; @@ -1208,12 +1461,15 @@ void select_program(Program *program, unsigned shader_count, struct nir_shader *const *shaders, ac_shader_config* config, - struct radv_shader_info *info, - const struct radv_nir_compiler_options *options); + struct radv_shader_args *args); +void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, + ac_shader_config* config, + struct radv_shader_args *args); void lower_wqm(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); -void lower_bool_phis(Program* program); +void lower_phis(Program* program); +void calc_min_waves(Program* program); void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options); std::vector dead_code_analysis(Program *program); @@ -1223,7 +1479,7 @@ void value_numbering(Program* program); void optimize(Program* program); void setup_reduce_temp(Program* program); void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); -void register_allocation(Program *program, std::vector> live_out_per_block); +void register_allocation(Program *program, std::vector& live_out_per_block); void ssa_elimination(Program* program); void lower_to_hw_instr(Program* program); void schedule_program(Program* program, live& live_vars); @@ -1238,26 +1494,39 @@ bool validate_ra(Program* program, const struct radv_nir_compiler_options *optio #ifndef NDEBUG void perfwarn(bool cond, const char *msg, Instruction *instr=NULL); #else -#define perfwarn(program, cond, msg, ...) +#define perfwarn(program, cond, msg, ...) do {} while(0) #endif +void collect_presched_stats(Program *program); +void collect_preasm_stats(Program *program); +void collect_postasm_stats(Program *program, const std::vector& code); + void aco_print_instr(Instruction *instr, FILE *output); void aco_print_program(Program *program, FILE *output); +/* utilities for dealing with register demand */ +RegisterDemand get_live_changes(aco_ptr& instr); +RegisterDemand get_temp_registers(aco_ptr& instr); +RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr& instr, aco_ptr& instr_before); + /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */ uint16_t get_extra_sgprs(Program *program); -/* get number of sgprs allocated required to address a number of sgprs */ +/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */ uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs); +uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs); -/* return number of addressable SGPRs for max_waves */ +/* return number of addressable sgprs/vgprs for max_waves */ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves); +uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves); typedef struct { + const int16_t opcode_gfx7[static_cast(aco_opcode::num_opcodes)]; const int16_t opcode_gfx9[static_cast(aco_opcode::num_opcodes)]; const int16_t opcode_gfx10[static_cast(aco_opcode::num_opcodes)]; const std::bitset(aco_opcode::num_opcodes)> can_use_input_modifiers; const std::bitset(aco_opcode::num_opcodes)> can_use_output_modifiers; + const std::bitset(aco_opcode::num_opcodes)> is_atomic; const char *name[static_cast(aco_opcode::num_opcodes)]; const aco::Format format[static_cast(aco_opcode::num_opcodes)]; } Info;