aco: don't move memory accesses to before control barriers
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
/* Global bitmask of the DEBUG_* flags below (set elsewhere, not visible here). */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,    /* enable IR validation */
   DEBUG_VALIDATE_RA = 0x2, /* enable register-assignment validation */
   DEBUG_PERFWARN = 0x4,    /* report performance warnings */
};
53
54 /**
55 * Representation of the instruction's microcode encoding format
56 * Note: Some Vector ALU Formats can be combined, such that:
57 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
58 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
59 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
60 *
61 * (*) The same is applicable for VOP1 and VOPC instructions.
62 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* the VALU formats below occupy single bits so they can be OR'd together,
    * as described in the comment above this enum */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11, /* alias of VOP3: the A/B variants share one encoding bit */
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kind(s) of memory an instruction accesses. */
enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8, /* NOTE(review): only 7 flag bits are defined above — confirm whether 8 reserves an extra slot */
};
117
/* Ordering/visibility semantics attached to memory instructions and barriers. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
145
/* Synchronization scopes, ordered from a single lane up to the whole device. */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
153
154 struct memory_sync_info {
155 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
156 memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
157 : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
158
159 storage_class storage:8;
160 memory_semantics semantics:8;
161 sync_scope scope:8;
162
163 bool operator == (const memory_sync_info& rhs) const {
164 return storage == rhs.storage &&
165 semantics == rhs.semantics &&
166 scope == rhs.scope;
167 }
168
169 bool can_reorder() const {
170 if (semantics & semantic_acqrel)
171 return false;
172 /* Also check storage so that zero-initialized memory_sync_info can be
173 * reordered. */
174 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
175 }
176 };
177 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
178
/* Float rounding modes (encoding matches the hardware MODE register fields,
 * see float_mode below). */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round toward +infinity */
   fp_round_ni = 2, /* round toward -infinity */
   fp_round_tz = 3, /* round toward zero */
};
185
/* Denormal handling (encoding matches the hardware MODE register fields). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0, /* flush denormals to zero */
   fp_denorm_keep = 0x3,  /* preserve denormals */
};
192
/* Mirror of the hardware MODE register fields that affect floating-point
 * behavior, plus flags restricting which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;     /* rounding mode for 32-bit FP ops */
         fp_round round16_64:2;  /* rounding mode for 16/64-bit FP ops */
         unsigned denorm32:2;    /* fp_denorm value for 32-bit FP ops */
         unsigned denorm16_64:2; /* fp_denorm value for 16/64-bit FP ops */
      };
      struct {
         uint8_t round:4;  /* both rounding fields as one nibble */
         uint8_t denorm:4; /* both denorm fields as one nibble */
      };
      uint8_t val = 0; /* the raw MODE byte */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead. */
   bool canReplace(float_mode other) const noexcept {
      /* requires identical MODE bits, and this mode must be at least as
       * restrictive as "other" in every optimization flag */
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
229
230 constexpr Format asVOP3(Format format) {
231 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
232 };
233
234 constexpr Format asSDWA(Format format) {
235 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
236 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
237 }
238
/* Which register file a register class lives in. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar GPR */
   vgpr,        /* vector GPR */
   linear_vgpr, /* VGPR with a linear live range (used for WWM and VGPR spills,
                 * see RegClass::v*_linear) */
};
245
/* Register class: size + register-file type, packed into one byte:
 * bits 0-4 = size (in dwords, or in bytes when bit 7 is set),
 * bit 5 = vgpr (otherwise sgpr), bit 6 = linear live range,
 * bit 7 = sub-dword (byte-sized) class. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   /* no boolean conversion */
   explicit operator bool() = delete;

   /* all sgpr classes sort below s16; anything above has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* low 5 bits hold the size; unit is bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up for sub-dword classes */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* sgprs are always linear; vgprs only if bit 6 is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   /* Smallest register class of the given type that holds "bytes" bytes. */
   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
306
/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
/* NOTE(review): RegClass::s6 has no shorthand here — confirm whether one is needed. */
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
328
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* ordering and equality compare the SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   /* packed into 4 bytes: 24-bit SSA id + 8-bit register class */
   uint32_t id_: 24;
   uint32_t reg_class : 8;
};
356
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition. Stored with byte granularity:
 * reg_b = register index * 4 + byte offset.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   /* dword-granular register index */
   constexpr unsigned reg() const { return reg_b / 4u; }
   /* byte offset (0-3) within the register */
   constexpr unsigned byte() const { return reg_b % 4u; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return !(*this == other); }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* Register location "bytes" bytes further on (may cross register boundaries). */
   constexpr PhysReg advance(int bytes) const {
      PhysReg moved = *this;
      moved.reg_b += bytes;
      return moved;
   }

   uint16_t reg_b = 0;
};
375
/* helper expressions for special registers (values are the hardware
 * scalar-source encodings) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126}; /* same encoding as exec */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
387
388 /**
389 * Operand Class
390 * Initially, each Operand refers to either
391 * a temporary virtual register
392 * or to a constant value
393 * Temporary registers get mapped to physical register during RA
394 * Constant values are inlined into the instruction sequence.
395 */
396 class Operand final
397 {
398 public:
399 constexpr Operand()
400 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
401 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
402 isLateKill_(false) {}
403
404 explicit Operand(Temp r) noexcept
405 {
406 data_.temp = r;
407 if (r.id()) {
408 isTemp_ = true;
409 } else {
410 isUndef_ = true;
411 setFixed(PhysReg{128});
412 }
413 };
414 explicit Operand(uint8_t v) noexcept
415 {
416 /* 8-bit constants are only used for copies and copies from any 8-bit
417 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
418 * to be inline constants. */
419 data_.i = v;
420 isConstant_ = true;
421 constSize = 0;
422 setFixed(PhysReg{0u});
423 };
424 explicit Operand(uint16_t v) noexcept
425 {
426 data_.i = v;
427 isConstant_ = true;
428 constSize = 1;
429 if (v <= 64)
430 setFixed(PhysReg{128u + v});
431 else if (v >= 0xFFF0) /* [-16 .. -1] */
432 setFixed(PhysReg{192u + (0xFFFF - v)});
433 else if (v == 0x3800) /* 0.5 */
434 setFixed(PhysReg{240});
435 else if (v == 0xB800) /* -0.5 */
436 setFixed(PhysReg{241});
437 else if (v == 0x3C00) /* 1.0 */
438 setFixed(PhysReg{242});
439 else if (v == 0xBC00) /* -1.0 */
440 setFixed(PhysReg{243});
441 else if (v == 0x4000) /* 2.0 */
442 setFixed(PhysReg{244});
443 else if (v == 0xC000) /* -2.0 */
444 setFixed(PhysReg{245});
445 else if (v == 0x4400) /* 4.0 */
446 setFixed(PhysReg{246});
447 else if (v == 0xC400) /* -4.0 */
448 setFixed(PhysReg{247});
449 else if (v == 0x3118) /* 1/2 PI */
450 setFixed(PhysReg{248});
451 else /* Literal Constant */
452 setFixed(PhysReg{255});
453 };
454 explicit Operand(uint32_t v, bool is64bit = false) noexcept
455 {
456 data_.i = v;
457 isConstant_ = true;
458 constSize = is64bit ? 3 : 2;
459 if (v <= 64)
460 setFixed(PhysReg{128 + v});
461 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
462 setFixed(PhysReg{192 - v});
463 else if (v == 0x3f000000) /* 0.5 */
464 setFixed(PhysReg{240});
465 else if (v == 0xbf000000) /* -0.5 */
466 setFixed(PhysReg{241});
467 else if (v == 0x3f800000) /* 1.0 */
468 setFixed(PhysReg{242});
469 else if (v == 0xbf800000) /* -1.0 */
470 setFixed(PhysReg{243});
471 else if (v == 0x40000000) /* 2.0 */
472 setFixed(PhysReg{244});
473 else if (v == 0xc0000000) /* -2.0 */
474 setFixed(PhysReg{245});
475 else if (v == 0x40800000) /* 4.0 */
476 setFixed(PhysReg{246});
477 else if (v == 0xc0800000) /* -4.0 */
478 setFixed(PhysReg{247});
479 else { /* Literal Constant */
480 assert(!is64bit && "attempt to create a 64-bit literal constant");
481 setFixed(PhysReg{255});
482 }
483 };
484 explicit Operand(uint64_t v) noexcept
485 {
486 isConstant_ = true;
487 constSize = 3;
488 if (v <= 64) {
489 data_.i = (uint32_t) v;
490 setFixed(PhysReg{128 + (uint32_t) v});
491 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
492 data_.i = (uint32_t) v;
493 setFixed(PhysReg{192 - (uint32_t) v});
494 } else if (v == 0x3FE0000000000000) { /* 0.5 */
495 data_.i = 0x3f000000;
496 setFixed(PhysReg{240});
497 } else if (v == 0xBFE0000000000000) { /* -0.5 */
498 data_.i = 0xbf000000;
499 setFixed(PhysReg{241});
500 } else if (v == 0x3FF0000000000000) { /* 1.0 */
501 data_.i = 0x3f800000;
502 setFixed(PhysReg{242});
503 } else if (v == 0xBFF0000000000000) { /* -1.0 */
504 data_.i = 0xbf800000;
505 setFixed(PhysReg{243});
506 } else if (v == 0x4000000000000000) { /* 2.0 */
507 data_.i = 0x40000000;
508 setFixed(PhysReg{244});
509 } else if (v == 0xC000000000000000) { /* -2.0 */
510 data_.i = 0xc0000000;
511 setFixed(PhysReg{245});
512 } else if (v == 0x4010000000000000) { /* 4.0 */
513 data_.i = 0x40800000;
514 setFixed(PhysReg{246});
515 } else if (v == 0xC010000000000000) { /* -4.0 */
516 data_.i = 0xc0800000;
517 setFixed(PhysReg{247});
518 } else { /* Literal Constant: we don't know if it is a long or double.*/
519 isConstant_ = 0;
520 assert(false && "attempt to create a 64-bit literal constant");
521 }
522 };
523 explicit Operand(RegClass type) noexcept
524 {
525 isUndef_ = true;
526 data_.temp = Temp(0, type);
527 setFixed(PhysReg{128});
528 };
529 explicit Operand(PhysReg reg, RegClass type) noexcept
530 {
531 data_.temp = Temp(0, type);
532 setFixed(reg);
533 }
534
535 constexpr bool isTemp() const noexcept
536 {
537 return isTemp_;
538 }
539
540 constexpr void setTemp(Temp t) noexcept {
541 assert(!isConstant_);
542 isTemp_ = true;
543 data_.temp = t;
544 }
545
546 constexpr Temp getTemp() const noexcept
547 {
548 return data_.temp;
549 }
550
551 constexpr uint32_t tempId() const noexcept
552 {
553 return data_.temp.id();
554 }
555
556 constexpr bool hasRegClass() const noexcept
557 {
558 return isTemp() || isUndefined();
559 }
560
561 constexpr RegClass regClass() const noexcept
562 {
563 return data_.temp.regClass();
564 }
565
566 constexpr unsigned bytes() const noexcept
567 {
568 if (isConstant())
569 return 1 << constSize;
570 else
571 return data_.temp.bytes();
572 }
573
574 constexpr unsigned size() const noexcept
575 {
576 if (isConstant())
577 return constSize > 2 ? 2 : 1;
578 else
579 return data_.temp.size();
580 }
581
582 constexpr bool isFixed() const noexcept
583 {
584 return isFixed_;
585 }
586
587 constexpr PhysReg physReg() const noexcept
588 {
589 return reg_;
590 }
591
592 constexpr void setFixed(PhysReg reg) noexcept
593 {
594 isFixed_ = reg != unsigned(-1);
595 reg_ = reg;
596 }
597
598 constexpr bool isConstant() const noexcept
599 {
600 return isConstant_;
601 }
602
603 constexpr bool isLiteral() const noexcept
604 {
605 return isConstant() && reg_ == 255;
606 }
607
608 constexpr bool isUndefined() const noexcept
609 {
610 return isUndef_;
611 }
612
613 constexpr uint32_t constantValue() const noexcept
614 {
615 return data_.i;
616 }
617
618 constexpr bool constantEquals(uint32_t cmp) const noexcept
619 {
620 return isConstant() && constantValue() == cmp;
621 }
622
623 constexpr uint64_t constantValue64(bool signext=false) const noexcept
624 {
625 if (constSize == 3) {
626 if (reg_ <= 192)
627 return reg_ - 128;
628 else if (reg_ <= 208)
629 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
630
631 switch (reg_) {
632 case 240:
633 return 0x3FE0000000000000;
634 case 241:
635 return 0xBFE0000000000000;
636 case 242:
637 return 0x3FF0000000000000;
638 case 243:
639 return 0xBFF0000000000000;
640 case 244:
641 return 0x4000000000000000;
642 case 245:
643 return 0xC000000000000000;
644 case 246:
645 return 0x4010000000000000;
646 case 247:
647 return 0xC010000000000000;
648 }
649 } else if (constSize == 1) {
650 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
651 } else if (constSize == 0) {
652 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
653 }
654 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
655 }
656
657 constexpr bool isOfType(RegType type) const noexcept
658 {
659 return hasRegClass() && regClass().type() == type;
660 }
661
662 /* Indicates that the killed operand's live range intersects with the
663 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
664 * not set by liveness analysis. */
665 constexpr void setLateKill(bool flag) noexcept
666 {
667 isLateKill_ = flag;
668 }
669
670 constexpr bool isLateKill() const noexcept
671 {
672 return isLateKill_;
673 }
674
675 constexpr void setKill(bool flag) noexcept
676 {
677 isKill_ = flag;
678 if (!flag)
679 setFirstKill(false);
680 }
681
682 constexpr bool isKill() const noexcept
683 {
684 return isKill_ || isFirstKill();
685 }
686
687 constexpr void setFirstKill(bool flag) noexcept
688 {
689 isFirstKill_ = flag;
690 if (flag)
691 setKill(flag);
692 }
693
694 /* When there are multiple operands killing the same temporary,
695 * isFirstKill() is only returns true for the first one. */
696 constexpr bool isFirstKill() const noexcept
697 {
698 return isFirstKill_;
699 }
700
701 constexpr bool isKillBeforeDef() const noexcept
702 {
703 return isKill() && !isLateKill();
704 }
705
706 constexpr bool isFirstKillBeforeDef() const noexcept
707 {
708 return isFirstKill() && !isLateKill();
709 }
710
711 constexpr bool operator == (Operand other) const noexcept
712 {
713 if (other.size() != size())
714 return false;
715 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
716 return false;
717 if (isFixed() && other.isFixed() && physReg() != other.physReg())
718 return false;
719 if (isLiteral())
720 return other.isLiteral() && other.constantValue() == constantValue();
721 else if (isConstant())
722 return other.isConstant() && other.physReg() == physReg();
723 else if (isUndefined())
724 return other.isUndefined() && other.regClass() == regClass();
725 else
726 return other.isTemp() && other.getTemp() == getTemp();
727 }
728 private:
729 union {
730 uint32_t i;
731 float f;
732 Temp temp = Temp(0, s1);
733 } data_;
734 PhysReg reg_;
735 union {
736 struct {
737 uint8_t isTemp_:1;
738 uint8_t isFixed_:1;
739 uint8_t isConstant_:1;
740 uint8_t isKill_:1;
741 uint8_t isUndef_:1;
742 uint8_t isFirstKill_:1;
743 uint8_t constSize:2;
744 uint8_t isLateKill_:1;
745 };
746 /* can't initialize bit-fields in c++11, so work around using a union */
747 uint16_t control_ = 0;
748 };
749 };
750
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
                            isKill_(0), isPrecise_(0), isNUW_(0) {}
   /* Definition of a new temporary with the given SSA id and register class. */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* Definition fixed to a physical register (temporary id 0). */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   /* Definition of temporary "tmpId" already assigned to a physical register. */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* True unless this is a placeholder definition (temporary id 0). */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Suggest (but don't require) a physical register for RA.
    * Note: shares reg_ with setFixed(); only the flags distinguish them. */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   /* presumably set by liveness analysis, mirroring Operand kill flags — confirm */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

   /* "precise": value-changing FP optimizations must not be applied
    * (NOTE(review): inferred from the name — confirm against users) */
   constexpr void setPrecise(bool precise) noexcept
   {
      isPrecise_ = precise;
   }

   constexpr bool isPrecise() const noexcept
   {
      return isPrecise_;
   }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept
   {
      isNUW_ = nuw;
   }

   constexpr bool isNUW() const noexcept
   {
      return isNUW_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
         uint8_t isPrecise_:1;
         uint8_t isNUW_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
884
885 struct Block;
886
/* Base of all IR instructions: opcode, encoding format and the spans of
 * operands/definitions (storage lives outside this struct). */
struct Instruction {
   aco_opcode opcode;
   Format format;       /* encoding format; VALU format bits are combinable */
   uint32_t pass_flags; /* presumably per-pass scratch bits — confirm with users */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Any vector-ALU encoding: VOP1/VOP2/VOPC (including their VOP3/DPP/SDWA
    * promoted forms, since those keep the base format bit) or VOP3P. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   /* Any scalar-ALU encoding. */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* Buffer/image vector memory; does NOT include FLAT/GLOBAL/SCRATCH. */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   /* defined elsewhere in the file */
   constexpr bool usesModifiers() const noexcept;

   /* True if any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
954
/* Scalar ALU with 16-bit immediate (SIMM16 field). */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
960
/* Scalar program-control instruction (branches, waitcnt, sendmsg, ...). */
struct SOPP_instruction : public Instruction {
   uint32_t imm; /* instruction immediate (meaning depends on the opcode) */
   int block;    /* branch target block index; presumably unused/-1 for
                  * non-branch opcodes — confirm with users */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
966
/* SOPC/SOP1/SOP2 carry no fields beyond the base Instruction. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
978
979 /**
980 * Scalar Memory Format:
981 * For s_(buffer_)load_dword*:
982 * Operand(0): SBASE - SGPR-pair which provides base address
983 * Operand(1): Offset - immediate (un)signed offset or SGPR
984 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
985 * Operand(n-1): SOffset - SGPR offset (Vega only)
986 *
987 * Having no operands is also valid for instructions such as s_dcache_inv.
988 *
989 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync; /* reordering/synchronization info for this access */
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint32_t padding: 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1000
/* Plain VOP1/VOP2/VOPC encodings carry no extra fields; modifiers live in
 * the VOP3A/DPP/SDWA variants. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1012
/* VALU instruction in the VOP3 encoding, with per-source input modifiers
 * and output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];       /* take absolute value of source 0..2 */
   bool neg[3];       /* negate source 0..2 */
   uint8_t opsel : 4; /* operand-select bits (see ISA docs) */
   uint8_t omod : 2;  /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1022
/* Packed-math VALU instruction with separate modifiers for the low and
 * high 16-bit halves of each source. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];       /* negate the low half of source 0..2 */
   bool neg_hi[3];       /* negate the high half of source 0..2 */
   uint8_t opsel_lo : 3; /* half-selection bits for the low result */
   uint8_t opsel_hi : 3; /* half-selection bits for the high result */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1032
1033 /**
1034 * Data Parallel Primitives Format:
1035 * This format can be used for VOP1, VOP2 or VOPC instructions.
1036 * The swizzle applies to the src0 operand.
1037 *
1038 */
struct DPP_instruction : public Instruction {
   bool abs[2];          /* take absolute value of source 0/1 */
   bool neg[2];          /* negate source 0/1 */
   uint16_t dpp_ctrl;    /* DPP swizzle control pattern (see ISA docs) */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1049
/* Selects which bytes/words of a dword an SDWA instruction reads/writes.
 * Low bits index the byte/word; flag bits encode size, sign extension and
 * whether the selection is decided by register allocation. */
enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
1085
1086 /**
1087 * Sub-Dword Addressing Format:
1088 * This format can be used for VOP1, VOP2 or VOPC instructions.
1089 *
1090 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1091 * the definition doesn't have to be VCC on GFX9+.
1092 *
1093 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];  /* source byte/word selection (sdwa_sel values) */
   uint8_t dst_sel; /* destination byte/word selection (sdwa_sel value) */
   bool neg[2];     /* negate source 0/1 */
   bool abs[2];     /* take absolute value of source 0/1 */
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1107
/* VINTRP: fragment-shader input interpolation instructions */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* which input attribute to interpolate */
   uint8_t component; /* which component of the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1114
1115 /**
1116 * Local and Global Data Sharing instructions
1117 * Operand(0): ADDR - VGPR which supplies the address.
1118 * Operand(1): DATA0 - First data VGPR.
1119 * Operand(2): DATA1 - Second data VGPR.
1120 * Operand(n-1): M0 - LDS size.
1121 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1122 *
1123 */
struct DS_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   bool gds; /* operate on GDS instead of LDS */
   int16_t offset0; /* first immediate byte offset */
   int8_t offset1; /* second immediate byte offset (two-address forms) */
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1132
1133 /**
1134 * Vector Memory Untyped-buffer Instructions
1135 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1136 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1137 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1138 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1139 *
1140 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool swizzled : 1; /* NOTE(review): presumably whether the access uses the resource's swizzled addressing — confirm */
   uint32_t padding1 : 18;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1157
1158 /**
1159 * Vector Memory Typed-buffer Instructions
1160 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1161 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1162 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1163 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1164 *
1165 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 10;
   uint16_t offset; /* Unsigned byte offset - 12 bit (stored in 16 bits; only 12 are encodable) */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1181
1182 /**
1183 * Vector Memory Image Instructions
1184 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1185 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1186 * or VDATA - Vector GPR for write data.
1187 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1188 * Definition(0): VDATA - Vector GPR for read result.
1189 *
1190 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable (previous comment duplicated unrm's; see the ISA LWE bit) */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 18;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1209
1210 /**
1211 * Flat/Scratch/Global Instructions
1212 * Operand(0): ADDR
1213 * Operand(1): SADDR
1214 * Operand(2) / Definition(0): DATA/VDST
1215 *
1216 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization/ordering semantics */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* return read-data to LDS instead of VGPRs */
   bool nv : 1; /* the encoding's NV bit — TODO: document semantics */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1230
/* Export instructions (position, parameter, color/MRT exports) */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which components of the operands are written (EN) */
   uint8_t dest; /* export target (TGT) */
   bool compressed : 1; /* data is 16-bit compressed (two components per operand) */
   bool done : 1; /* this is the shader's final export of this kind (DONE) */
   bool valid_mask : 1;
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1240
/* Generic pseudo instructions (lowered before assembly by lower_to_hw_instr) */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* NOTE(review): presumably whether lowering needs a temporary while scc holds a live value — confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1247
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1256
/* Barrier pseudo instruction: combines memory ordering with an optional
 * execution barrier */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync; /* which memory accesses are ordered, and their scope */
   sync_scope exec_scope; /* scope of execution synchronization — TODO confirm exact semantics */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1262
/* Operation performed by a Pseudo_reduction_instruction.
 * The numeric suffix is the bit width of the participating values. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1278
1279 /**
1280 * Subgroup Reduction Instructions, everything except for the data to be
1281 * reduced and the result as inserted by setup_reduce_temp().
1282 * Operand(0): data to be reduced
1283 * Operand(1): reduce temporary
1284 * Operand(2): vector temporary
1285 * Definition(0): result
1286 * Definition(1): scalar temporary
1287 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1288 * Definition(3): scc clobber
1289 * Definition(4): vcc clobber
1290 *
1291 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op; /* which arithmetic/logical operation to reduce with */
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1297
/* Deleter used by aco_ptr: instructions are allocated with calloc (see
 * create_instruction), so their storage must be released with free().
 * Note that no destructor is invoked. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};
1303
/* Owning pointer for instructions. The custom deleter calls free() because
 * instructions come from calloc (create_instruction), not new. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1306
/* Allocates an instruction of format-struct T together with its operand and
 * definition arrays in a single contiguous, zero-initialized chunk:
 * [ T | Operand * num_operands | Definition * num_definitions ].
 * The caller owns the result (usually wrapped in an aco_ptr). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* NOTE(review): the calloc result is not NULL-checked; allocation failure
    * would crash on the stores below */
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* the spans store small offsets relative to their own address instead of
    * absolute pointers: operands start right after T, definitions right
    * after the operands */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1324
/* Returns true if the instruction uses any input/output modifiers
 * (DPP/SDWA encodings always count; VOP3P/VOP3 only if a modifier field is
 * actually set). Used to tell when an instruction can't be treated as the
 * plain opcode semantics alone. */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (format == Format::VOP3P) {
      const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
            return true;
      }
      return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
   } else if (isVOP3()) {
      const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3->abs[i] || vop3->neg[i])
            return true;
      }
      return vop3->opsel || vop3->clamp || vop3->omod;
   }
   return false;
}
1347
1348 constexpr bool is_phi(Instruction* instr)
1349 {
1350 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1351 }
1352
1353 static inline bool is_phi(aco_ptr<Instruction>& instr)
1354 {
1355 return is_phi(instr.get());
1356 }
1357
/* returns the memory synchronization semantics attached to the instruction
 * (the sync field of memory-access formats; a default for others) */
memory_sync_info get_sync_info(const Instruction* instr);

/* whether the instruction can be removed given the use counts in "uses" */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);

/* queries/conversions for sub-dword operand encodings */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1366
/* bit flags describing a basic block; stored OR'd together in Block::kind */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1387
1388
1389 struct RegisterDemand {
1390 constexpr RegisterDemand() = default;
1391 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1392 : vgpr{v}, sgpr{s} {}
1393 int16_t vgpr = 0;
1394 int16_t sgpr = 0;
1395
1396 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1397 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1398 }
1399
1400 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1401 return vgpr > other.vgpr || sgpr > other.sgpr;
1402 }
1403
1404 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1405 if (t.type() == RegType::sgpr)
1406 return RegisterDemand( vgpr, sgpr + t.size() );
1407 else
1408 return RegisterDemand( vgpr + t.size(), sgpr );
1409 }
1410
1411 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1412 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1413 }
1414
1415 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1416 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1417 }
1418
1419 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1420 vgpr += other.vgpr;
1421 sgpr += other.sgpr;
1422 return *this;
1423 }
1424
1425 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1426 vgpr -= other.vgpr;
1427 sgpr -= other.sgpr;
1428 return *this;
1429 }
1430
1431 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1432 if (t.type() == RegType::sgpr)
1433 sgpr += t.size();
1434 else
1435 vgpr += t.size();
1436 return *this;
1437 }
1438
1439 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1440 if (t.type() == RegType::sgpr)
1441 sgpr -= t.size();
1442 else
1443 vgpr -= t.size();
1444 return *this;
1445 }
1446
1447 constexpr void update(const RegisterDemand other) noexcept {
1448 vgpr = std::max(vgpr, other.vgpr);
1449 sgpr = std::max(sgpr, other.sgpr);
1450 }
1451
1452 };
1453
1454 /* CFG */
1455 struct Block {
1456 float_mode fp_mode;
1457 unsigned index;
1458 unsigned offset = 0;
1459 std::vector<aco_ptr<Instruction>> instructions;
1460 std::vector<unsigned> logical_preds;
1461 std::vector<unsigned> linear_preds;
1462 std::vector<unsigned> logical_succs;
1463 std::vector<unsigned> linear_succs;
1464 RegisterDemand register_demand = RegisterDemand();
1465 uint16_t loop_nest_depth = 0;
1466 uint16_t kind = 0;
1467 int logical_idom = -1;
1468 int linear_idom = -1;
1469 Temp live_out_exec = Temp();
1470
1471 /* this information is needed for predecessors to blocks with phis when
1472 * moving out of ssa */
1473 bool scc_live_out = false;
1474 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1475
1476 Block(unsigned idx) : index(idx) {}
1477 Block() : index(0) {}
1478 };
1479
/* Bitmask describing which API (software) shader stages a program contains
 * and which hardware stage it runs as. Merged shaders OR several sw bits
 * together (e.g. vertex_geometry_gs below). */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1524
/* indices into Program::statistics */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1539
/* A complete shader program: the CFG plus all per-program compilation state. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to newly inserted blocks */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* register class of an exec-mask-sized value (depends on wave_size) */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics]; /* indexed by enum statistic */

   /* hands out a fresh temporary id; ids must fit in 24 bits */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* the id the next allocateId() call would return */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* appends a new empty block; it inherits next_fp_mode */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* appends an existing block, re-indexing it; it inherits next_fp_mode */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* ids start at 1 */
};
1615
1616 struct TempHash {
1617 std::size_t operator()(Temp t) const {
1618 return t.id();
1619 }
1620 };
1621 using TempSet = std::unordered_set<Temp, TempHash>;
1622
/* result of live_var_analysis() */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1629
void init();

void init_program(Program *program, Stage stage, struct radv_shader_info *info,
                  enum chip_class chip_class, enum radeon_family family,
                  ac_shader_config *config);

/* instruction selection: NIR -> ACO IR */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* compiler pass entry points; each is implemented in the correspondingly
 * named source file */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
bool validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
/* NOTE(review): the release-build macro below takes a leading "program"
 * argument that this declaration does not; call sites can satisfy both only
 * because the macro is variadic — confirm the intended signature */
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

/* shader statistics collection (see enum statistic) */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1698
/* Static per-opcode information tables, each indexed by aco_opcode. */
typedef struct {
   /* hardware encodings of each opcode per generation */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* the single global instance, generated from the opcode tables */
extern const Info instr_info;
1714
1715 }
1716
1717 #endif /* ACO_IR_H */
1718