aco: add ACO_DEBUG=force-waitcnt to emit wait-states
mesa.git: src/amd/compiler/aco_ir.h

/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#ifndef ACO_IR_H
#define ACO_IR_H

#include <vector>
#include <set>
#include <unordered_set>
#include <bitset>
#include <memory>

#include "nir.h"
#include "ac_binary.h"
#include "amd_family.h"
#include "aco_opcodes.h"
#include "aco_util.h"

#include "vulkan/radv_shader.h"

struct radv_nir_compiler_options;
struct radv_shader_args;
struct radv_shader_info;

namespace aco {

extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
};
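
/* Usage sketch: debug_flags is populated from the ACO_DEBUG environment
 * variable (presumably during init()), so ACO_DEBUG=force-waitcnt sets
 * DEBUG_FORCE_WAITCNT and a pass such as insert_wait_states() can test:
 *
 *    if (debug_flags & DEBUG_FORCE_WAITCNT) {
 *       // conservatively wait on all outstanding memory operations
 *    }
 */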

/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};

enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8,
};

enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for loads/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * Says nothing about barriers. */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};

enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};

struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
      : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}

   storage_class storage:8;
   memory_semantics semantics:8;
   sync_scope scope:8;

   bool operator == (const memory_sync_info& rhs) const {
      return storage == rhs.storage &&
             semantics == rhs.semantics &&
             scope == rhs.scope;
   }

   bool can_reorder() const {
      if (semantics & semantic_acqrel)
         return false;
      /* Also check storage so that zero-initialized memory_sync_info can be
       * reordered. */
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};
static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
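
/* Usage sketch: a buffer atomic would typically carry
 *
 *    memory_sync_info sync(storage_buffer, semantic_atomicrmw, scope_device);
 *    assert(!sync.can_reorder());
 *
 * while a zero-initialized memory_sync_info() has no storage set and can
 * always be reordered. */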

enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      struct {
         uint8_t round:4;
         uint8_t denorm:4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
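
/* Illustrative sketch: a stricter mode with the same MODE bits may stand in
 * for a laxer one, but not the other way around:
 *
 *    float_mode a = {}, b = {};
 *    a.preserve_signed_zero_inf_nan32 = true;
 *    assert(a.canReplace(b) && !b.canReplace(a));
 */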

constexpr Format asVOP3(Format format) {
   return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
};

constexpr Format asSDWA(Format format) {
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
}
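
/* Formats combine by OR-ing their encoding bits (a sketch):
 *
 *    Format f = asVOP3(Format::VOP2); // VOP2 promoted to the VOP3A encoding
 *    Format g = asSDWA(Format::VOP2); // VOP2 with sub-dword addressing
 *    assert((uint16_t)f & (uint16_t)Format::VOP2);
 */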

enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
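
/* RegClass::get() picks the smallest class that fits a byte count (sketch):
 *
 *    assert(RegClass::get(RegType::vgpr, 6) == v6b); // sub-dword VGPR class
 *    assert(RegClass::get(RegType::vgpr, 8) == v2);  // two full dwords
 *    assert(RegClass::get(RegType::sgpr, 6) == s2);  // SGPRs round up to dwords
 */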

/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};

/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;
   uint32_t reg_class : 8;
};
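
/* Sketch: a Temp is just an SSA id plus a register class:
 *
 *    Temp t(42, v2);
 *    assert(t.size() == 2 && t.bytes() == 8 && t.type() == RegType::vgpr);
 */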

/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }

   uint16_t reg_b = 0;
};

/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
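
/* PhysReg stores a byte-granular address, reg_b = reg * 4 + byte; in ACO's
 * numbering SGPRs start at 0 and VGPRs at 256 (a sketch):
 *
 *    PhysReg v0{256};
 *    assert(v0.advance(6).reg() == 257 && v0.advance(6).byte() == 2);
 */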

/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final
{
public:
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
        isLateKill_(false) {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   explicit Operand(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies, and a copy from any 8-bit
       * constant can be implemented with an SDWA v_mul_u32_u24, so consider
       * them all inline constants. */
      data_.i = v;
      isConstant_ = true;
      constSize = 0;
      setFixed(PhysReg{0u});
   };
   explicit Operand(uint16_t v) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      constSize = 1;
      if (v <= 64)
         setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         setFixed(PhysReg{248});
      else /* Literal Constant */
         setFixed(PhysReg{255});
   };
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      constSize = is64bit ? 3 : 2;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      constSize = 3;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double. */
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      } else if (constSize == 1) {
         return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
      } else if (constSize == 0) {
         return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t constSize:2;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
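
/* Constant operands are fixed to their inline-constant register when one
 * exists, otherwise to 255 (a literal). A sketch:
 *
 *    Operand a(1u);          // inline constant, physReg() == 129
 *    Operand b(0x3f800000u); // 1.0f, physReg() == 242
 *    Operand c(12345u);      // no inline encoding, isLiteral()
 *    assert(!a.isLiteral() && !b.isLiteral() && c.isLiteral());
 */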

/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
                            isKill_(0), isPrecise_(0), isNUW_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

   constexpr void setPrecise(bool precise) noexcept
   {
      isPrecise_ = precise;
   }

   constexpr bool isPrecise() const noexcept
   {
      return isPrecise_;
   }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept
   {
      isNUW_ = nuw;
   }

   constexpr bool isNUW() const noexcept
   {
      return isNUW_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
         uint8_t isPrecise_:1;
         uint8_t isNUW_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
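
/* Sketch (program stands for a Program*): a definition usually carries a
 * fresh Temp, and the register allocator can be guided with a hint instead
 * of a hard assignment:
 *
 *    Definition def(program->allocateId(), s2);
 *    def.setHint(vcc);   // prefer vcc, but RA may still pick another pair
 *    assert(def.hasHint() && !def.isFixed());
 */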

struct Block;

struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
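
/* The predicates above only test encoding bits, so they keep working after
 * promotion; e.g. for some instr whose format is VOP2 | VOP3A (a sketch):
 *
 *    assert(instr->isVALU() && instr->isVOP3() && !instr->isSALU());
 */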

struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint32_t padding: 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
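
/* Sketch of building an SMEM load with create_instruction() (defined further
 * below); base and dst stand for pre-existing s2 Temps:
 *
 *    aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(
 *          aco_opcode::s_load_dwordx2, Format::SMEM, 2, 1)};
 *    load->operands[0] = Operand(base);      // SBASE address pair
 *    load->operands[1] = Operand(0u);        // immediate offset
 *    load->definitions[0] = Definition(dst); // SDATA result
 */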

struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP3A_instruction : public Instruction {
   bool abs[3];
   bool neg[3];
   uint8_t opsel : 4;
   uint8_t omod : 2;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
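
/* Decoding sketch: word selections carry the word number in
 * (sel & sdwa_wordnum), byte selections the byte number in
 * (sel & sdwa_bytenum), and sdwa_sext marks sign extension:
 *
 *    assert((sdwa_uword1 & sdwa_wordnum) == 1);  // bits [31:16]
 *    assert((sdwa_sbyte2 & sdwa_bytenum) == 2);  // bits [23:16], sign-extended
 *    assert(sdwa_sbyte2 & sdwa_sext);
 */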

/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];
   uint8_t dst_sel;
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool swizzled : 1;
   uint32_t padding1 : 18;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 10;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding : 18;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint32_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Export_instruction : public Instruction {
   uint8_t enabled_mask;
   uint8_t dest;
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};

/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
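
/* Typical usage ties the allocation to an aco_ptr so free() runs on
 * destruction; e.g. an s_waitcnt that waits on everything (a sketch,
 * relevant to ACO_DEBUG=force-waitcnt):
 *
 *    aco_ptr<SOPP_instruction> wait{create_instruction<SOPP_instruction>(
 *          aco_opcode::s_waitcnt, Format::SOPP, 0, 0)};
 *    wait->imm = 0;    // all counters at zero: wait for everything
 *    wait->block = -1; // not a branch, no target block
 */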

constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (format == Format::VOP3P) {
      const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
            return true;
      }
      return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
   } else if (isVOP3()) {
      const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3->abs[i] || vop3->neg[i])
            return true;
      }
      return vop3->opsel || vop3->clamp || vop3->omod;
   }
   return false;
}

constexpr bool is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

static inline bool is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}

memory_sync_info get_sync_info(const Instruction* instr);

bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);

bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);

enum block_kind {
   /* uniform indicates that when leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};

struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
      : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   constexpr bool exceeds(const RegisterDemand other) const noexcept {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   constexpr RegisterDemand operator+(const Temp t) const noexcept {
      if (t.type() == RegType::sgpr)
         return RegisterDemand( vgpr, sgpr + t.size() );
      else
         return RegisterDemand( vgpr + t.size(), sgpr );
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   constexpr void update(const RegisterDemand other) noexcept {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }

};
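
/* Demand arithmetic sketch: counts are kept per register file and merged
 * with update():
 *
 *    RegisterDemand d(8, 4); // 8 VGPRs, 4 SGPRs
 *    d += Temp(1, v2);       // now 10 VGPRs
 *    d.update(RegisterDemand(6, 9));
 *    assert(d == RegisterDemand(10, 9));
 */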

/* CFG */
struct Block {
   float_mode fp_mode;
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;
   int logical_idom = -1;
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out is true */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};

using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
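
/* A Stage is a set of software-stage bits OR'd with exactly one hardware
 * stage, so both halves can be tested independently (sketch):
 *
 *    Stage s = vertex_geometry_gs; // merged VS+GS
 *    assert((s & sw_mask) == (sw_vs | sw_gs));
 *    assert((s & hw_mask) == hw_gs);
 */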

enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};

class Program final {
public:
   float_mode next_fp_mode;
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one; must be a power of two */
   uint16_t vgpr_alloc_granule; /* minus one; must be a power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   struct {
      void (*func)(void *private_data,
                   enum radv_compiler_debug_level level,
                   const char *message);
      void *private_data;
   } debug;

   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};

struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
using TempSet = std::unordered_set<Temp, TempHash>;

struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

void init();

void init_program(Program *program, Stage stage, struct radv_shader_info *info,
                  enum chip_class chip_class, enum radeon_family family,
                  ac_shader_config *config);

void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
bool validate_ir(Program* program);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options);
#ifndef NDEBUG
void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

void _aco_perfwarn(Program *program, const char *file, unsigned line,
                   const char *fmt, ...);
void _aco_err(Program *program, const char *file, unsigned line,
              const char *fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of sgprs/vgprs that must be allocated to address a given number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);

typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;

}

#endif /* ACO_IR_H */